In [57]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [37]:
# Load the pre-cleaned review corpus produced by the preprocessing step.
reviews_csv = "../Data/processed_data/cleaned_reviews.csv"
df = pd.read_csv(reviews_csv)

In [38]:
# Spot-check a few rows from the middle of the frame.
# .loc slices by label and includes both endpoints (34567 through 34572).
sample_rows = df.loc[34567:34572]
print(sample_rows)

                                            cleaned_text sentiment
34567  definitely worst columbo ever dreamt murder ab...  negative
34568  title like know expect great horror movie real...  negative
34569  dont spend much time watching made tv movie se...  negative
34570  far richer texture character even classic geor...  positive
34571  film called adventure cinemascope like screenw...  negative
34572  premise movie simple script elderly muslim get...  positive


In [39]:
# Features are the cleaned review text; the target is the sentiment label.
x, y = df['cleaned_text'], df['sentiment']

In [40]:
# Hold out 20% of the reviews for testing; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [41]:
# TF-IDF over unigrams and bigrams, with the vocabulary capped at 5000 features.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

In [42]:
# Learn the vocabulary and IDF weights from the training text only, then reuse
# the fitted vectorizer on the test text so nothing from the test set leaks in.
x_train_tfidf = tfidf.fit_transform(raw_documents=xtrain)
x_test_tfidf = tfidf.transform(raw_documents=xtest)

In [43]:
# Sanity-check the (rows, features) shape of both matrices, one per line.
print(x_train_tfidf.shape, x_test_tfidf.shape, sep="\n")

(40000, 5000)
(10000, 5000)


In [44]:
# Encoder that will map the string sentiment labels to integer class ids.
lb = LabelEncoder()

In [45]:
# Fit the label mapping on the training targets and apply the same mapping to
# the test targets.
# NOTE(review): LabelEncoder numbers classes in sorted order, so presumably
# negative -> 0 and positive -> 1 (only these two labels appear in the preview) — confirm.
y_train_lb = lb.fit_transform(y=ytrain)
y_test_lb = lb.transform(y=ytest)

In [52]:
# Candidate classifiers to compare on the TF-IDF features, keyed by display name.
models = {
    # max_iter=1000 gives the solver an explicit, generous iteration budget.
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Multinomial_NB': MultinomialNB(),
    # LinearSVC's solver can draw random numbers while fitting; seed it with the
    # same value used for train_test_split so repeated runs are reproducible.
    'Linear_SVC': LinearSVC(random_state=42)
}

In [56]:
# Fit every candidate on the training matrix and report train/test accuracy
# together with the test-set confusion matrix.
for model_name, model in models.items():
    print(model_name,":")
    train_model = model.fit(x_train_tfidf, y_train_lb)
    # Predict the held-out set once and reuse the result for every test metric.
    pred = train_model.predict(x_test_tfidf)
    # A classifier's .score() is the accuracy of its own predictions, so computing
    # accuracy from the cached predictions gives the same number without running
    # inference on the test set a second time.
    test_accuracy = accuracy_score(y_test_lb, pred)
    print(f"Train Score: {train_model.score(x_train_tfidf, y_train_lb)}")
    print(f"Test Score: {test_accuracy}")
    print(f'Confussion Matrix:\n{confusion_matrix(y_test_lb, pred)}')
    print(f'Accuracy score:\n{test_accuracy}')
    print("-------------------------")

Logistic Regression :
Train Score: 0.9113
Test Score: 0.8883
Confussion Matrix:
[[4339  622]
 [ 495 4544]]
Accuracy score:
0.8883
-------------------------
Multinomial_NB :
Train Score: 0.866875
Test Score: 0.8557
Confussion Matrix:
[[4176  785]
 [ 658 4381]]
Accuracy score:
0.8557
-------------------------
Linear_SVC :
Train Score: 0.93015
Test Score: 0.8819
Confussion Matrix:
[[4334  627]
 [ 554 4485]]
Accuracy score:
0.8819
-------------------------


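Logistic Regression achieved the highest test accuracy (0.8883, versus 0.8819 for Linear SVC and 0.8557 for Multinomial Naive Bayes), and its gap between train and test accuracy is smaller than Linear SVC's. Therefore, I've decided to move forward with Logistic Regression for this sentiment analysis task.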

In [None]:
# Hyper-parameter search space for LogisticRegression.
# Every value must be a list of candidates: the grid search rejects a bare string.
param_grid = {
    # Inverse regularization strength, from strong (0.01) to weak (10).
    'C': [0.01, 0.1, 1, 5, 10],
    'penalty': ['l1', 'l2'],
    # 'liblinear' supports both the l1 and l2 penalties listed above.
    'solver': ['liblinear']
}