In [None]:
!pip install numpy pandas scikit-learn xgboost



In [None]:
!unzip sentiment_analysis.zip

Archive:  sentiment_analysis.zip
  inflating: IMDB Dataset.csv        


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the IMDB reviews extracted from the archive.
df = pd.read_csv('IMDB Dataset.csv')


print(df.head())


X = df['review']
# Convert string labels to numerical labels: 'negative' -> 0, 'positive' -> 1.
y = df['sentiment'].map({'negative': 0, 'positive': 1})

# Series.map leaves NaN for any value that is not a key of the dict. Fail here
# with a clear message instead of letting NaN labels reach the classifiers.
unmapped = df.loc[y.isna(), 'sentiment']
if not unmapped.empty:
    raise ValueError(
        f"Unexpected sentiment labels: {sorted(unmapped.astype(str).unique())}"
    )


# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Turn the raw review text into TF-IDF features, dropping scikit-learn's
# built-in English stop words.
# The vectorizer is fitted on the training reviews only; the test reviews are
# transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


# Candidate classifiers, keyed by the display name used in the reports below.
# Seeds are fixed where the estimator is given one so runs are repeatable.
models = {
    "Naive Bayes Classifier": MultinomialNB(),
    "Random Forest Classifier": RandomForestClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    # `use_label_encoder` is deprecated in XGBoost and only produces an
    # "unused parameter" warning on current releases; the labels are already
    # 0/1 integers, so no label encoding is needed.
    "XGBoost Classifier": XGBClassifier(eval_metric='logloss'),
    # Raise max_iter above the default to give the solver more iterations.
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree Classifier": DecisionTreeClassifier(random_state=42)
}


# Train every candidate on the TF-IDF training matrix, then report its accuracy
# and per-class metrics on the held-out reviews.
for name in models:
    model = models[name]
    # fit() returns the fitted estimator, so prediction can be chained onto it.
    y_pred = model.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.2f}")
    print(classification_report(y_test, y_pred))


# Sanity check: run one hand-written review through every fitted model.
print("\nSample output for visualization:")
sample_review = ["This movie was fantastic! The story was compelling and the characters were very well developed."]
# Reuse the fitted vectorizer so the sample lands in the same feature space.
sample_review_tfidf = vectorizer.transform(sample_review)
for name in models:
    model = models[name]
    prediction = model.predict(sample_review_tfidf)
    # Translate the numeric class back to its text label (1 -> positive).
    if prediction[0] == 1:
        predicted_label = 'positive'
    else:
        predicted_label = 'negative'
    print(f"{name} prediction for sample review: {predicted_label}")

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
Naive Bayes Classifier Accuracy: 0.87
              precision    recall  f1-score   support

           0       0.86      0.88      0.87      4961
           1       0.88      0.85      0.86      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000

Random Forest Classifier Accuracy: 0.86
              precision    recall  f1-score   support

           0       0.85      0.87      0.86      4961
           1       0.87      0.85      0.86      5039

    accuracy                           0.86     10000