In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    The training labels are integer-encoded before fitting and the
    predictions are decoded back to the original labels before scoring.
    Recent XGBoost releases refuse class labels that are not the integers
    ``0..n_classes-1``, so passing string labels straight to
    ``XGBClassifier`` fails; encoding here lets every estimator be handled
    the same way while the metrics are still reported on the original labels.

    Parameters
    ----------
    model : estimator or pipeline
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place; after this call it has been trained on the *encoded* labels,
        so calling ``model.predict`` directly returns integer codes.
    X_train, y_train
        Training inputs and their labels.
    X_test, y_test
        Held-out inputs and their labels (used unmodified for scoring).

    Returns
    -------
    tuple
        ``(accuracy, report)`` where ``accuracy`` is the value returned by
        ``accuracy_score`` and ``report`` is the text returned by
        ``classification_report``.
    """
    label_encoder = LabelEncoder()
    # Classes are sorted by the encoder, so the mapping is deterministic.
    y_train_encoded = label_encoder.fit_transform(y_train)

    model.fit(X_train, y_train_encoded)

    # Map the integer predictions back so that metrics (and the class names
    # in the report) are expressed in the caller's original label space.
    y_pred = label_encoder.inverse_transform(model.predict(X_test))

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return accuracy, report

In [3]:
# Load the review dataset. If the strict parse fails, retry while dropping
# the malformed rows instead of aborting the notebook.
try:
    data = pd.read_csv('IMDB.csv', encoding='utf-8')
except pd.errors.ParserError:
    print("Error occurred while parsing the CSV file. Skipping problematic lines.")
    # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
    # `on_bad_lines='skip'` is its replacement and keeps the same intent.
    data = pd.read_csv('IMDB.csv', encoding='utf-8', on_bad_lines='skip')


# Assuming 'sentiment' is the target variable, and 'review' is the text content.
# Modify the column names accordingly.
X = data['review']
y = data['sentiment']

# Split the data into training and testing sets (80/20, seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Candidate classifiers, each behind its own TF-IDF vectorizer so that the
# vocabulary is learned from the training split only when the pipeline is fit.
# Estimators with internal randomness are seeded (same seed as the data
# split) so that a fresh Restart & Run All reproduces the reported metrics.
models = [
    make_pipeline(TfidfVectorizer(), MultinomialNB()),
    make_pipeline(TfidfVectorizer(), RandomForestClassifier(random_state=42)),
    make_pipeline(TfidfVectorizer(), KNeighborsClassifier()),
    make_pipeline(TfidfVectorizer(), XGBClassifier(random_state=42)),
    make_pipeline(TfidfVectorizer(), LogisticRegression()),
    make_pipeline(TfidfVectorizer(), DecisionTreeClassifier(random_state=42))
]

In [None]:
# Train and score every candidate pipeline in list order, printing a
# numbered (1-based) summary followed by a 50-character rule for each.
for i, model in enumerate(models):
    accuracy, report = train_and_evaluate_model(model, X_train, y_train, X_test, y_test)

    # Assemble the summary line by line; the trailing empty entry yields the
    # blank line that separates consecutive model summaries.
    summary_lines = [
        f"Model {i + 1}:",
        f"Accuracy: {accuracy:.2f}",
        "Classification Report:",
        f"{report}",
        "=" * 50,
        "",
    ]
    print("\n".join(summary_lines))

Model 1:
Accuracy: 0.86
Classification Report:
              precision    recall  f1-score   support

    negative       0.84      0.89      0.87      4961
    positive       0.88      0.84      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000


