In [10]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
# Load the dataset
imdb_data = pd.read_csv("review_sent.csv")
print("Dataset Loaded")
print("Dataset shape:", imdb_data.shape)
print("First few rows of the dataset:\n", imdb_data.head())

Dataset Loaded
Dataset shape: (50000, 2)
First few rows of the dataset:
                                               review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [12]:
# Clean the review text
def clean_text(text):
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    return text
imdb_data['clean_review'] = imdb_data['review'].apply(clean_text)
print("\nSample review before cleaning:", imdb_data['review'][0])
print("Sample review after cleaning:", imdb_data['clean_review'][0])


Sample review before cleaning: One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appe

In [13]:
# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 2))
features = tfidf_vectorizer.fit_transform(imdb_data['clean_review'])
print("\nTF-IDF Vectorization complete")
print("Shape of the TF-IDF matrix:", features.shape)


TF-IDF Vectorization complete
Shape of the TF-IDF matrix: (50000, 5000)


In [14]:
# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(features, imdb_data['sentiment'], test_size=0.2, random_state=42)
print("\nTrain-Test Split complete")
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


Train-Test Split complete
X_train shape: (40000, 5000)
X_test shape: (10000, 5000)
y_train shape: (40000,)
y_test shape: (10000,)


In [15]:
# Logistic Regression Model
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)

In [16]:
# Naive Bayes Model
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train, y_train)

In [17]:
# Making predictions
logistic_predictions = logistic_model.predict(X_test)
naive_bayes_predictions = naive_bayes_model.predict(X_test)

In [20]:
# Extracting Coefficients
feature_names = tfidf_vectorizer.get_feature_names_out()  # Updated method
coefficients = logistic_model.coef_.flatten()

# Combining coefficients with feature names
feature_importance = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Sorting by absolute coefficients to see the most influential words
top_influential_words = feature_importance.reindex(feature_importance.Coefficient.abs().sort_values(ascending=False).index)

# Display top 20 words
print(top_influential_words.head(20))


             Feature  Coefficient
4942           worst    -9.417322
345            awful    -7.310270
515           boring    -6.866844
4824           waste    -6.865693
1509       excellent     6.759788
352              bad    -6.745389
1978           great     6.744836
3387            poor    -5.731439
4451        terrible    -5.642180
1351            dull    -5.274667
3388          poorly    -5.129762
4941           worse    -5.077302
432             best     5.075981
4915       wonderful     5.017101
207          amazing     4.962230
1230  disappointment    -4.935761
3288         perfect     4.903917
2181        horrible    -4.746227
553        brilliant     4.642281
1229   disappointing    -4.598668


In [21]:
# Evaluation
evaluation = {
    'Model': ['Logistic Regression', 'Naive Bayes'],
    'Accuracy': [accuracy_score(y_test, logistic_predictions), accuracy_score(y_test, naive_bayes_predictions)],
    'Precision': [precision_score(y_test, logistic_predictions, pos_label='positive'), precision_score(y_test, naive_bayes_predictions, pos_label='positive')],
    'Recall': [recall_score(y_test, logistic_predictions, pos_label='positive'), recall_score(y_test, naive_bayes_predictions, pos_label='positive')],
    'F1-Score': [f1_score(y_test, logistic_predictions, pos_label='positive'), f1_score(y_test, naive_bayes_predictions, pos_label='positive')]
}

evaluation_df = pd.DataFrame(evaluation)
print(evaluation_df)


                 Model  Accuracy  Precision    Recall  F1-Score
0  Logistic Regression    0.8900   0.879261  0.906132  0.892494
1          Naive Bayes    0.8541   0.845426  0.869419  0.857255


In [22]:
# BoW vectorization
count_vectorizer = CountVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 2))
bow_features = count_vectorizer.fit_transform(imdb_data['clean_review'])

# Splitting the dataset for BoW features
X_train_bow, X_test_bow, _, _ = train_test_split(bow_features, imdb_data['sentiment'], test_size=0.2, random_state=42)

# Training Multinomial Naive Bayes for BoW features
mnb_bow = MultinomialNB()
mnb_bow.fit(X_train_bow, y_train)

# Predictions for BoW features
mnb_bow_predict = mnb_bow.predict(X_test_bow)

# Evaluating the model for BoW features
print("BoW Features Evaluation:")
print("Accuracy:", accuracy_score(y_test, mnb_bow_predict))
print("Classification Report:\n", classification_report(y_test, mnb_bow_predict, target_names=['Positive','Negative']))
print("Confusion Matrix:\n", confusion_matrix(y_test, mnb_bow_predict, labels=['positive', 'negative']))

BoW Features Evaluation:
Accuracy: 0.8456
Classification Report:
               precision    recall  f1-score   support

    Positive       0.85      0.84      0.84      4961
    Negative       0.84      0.85      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

Confusion Matrix:
 [[4293  746]
 [ 798 4163]]
