In [25]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import plotly.express as px
from sklearn.model_selection import GridSearchCV
import warnings

In [26]:
df = pd.read_csv('preprocessed_data.csv')
df.head()

X = df['Review'].astype(str)
y = df['Rating']

In [27]:
dict_models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'SVM': SVC(),
    'Naive Bayes': MultinomialNB(),
    'KNN': KNeighborsClassifier()
}

dict_vectorizers = {
    'Count Vectorizer': CountVectorizer(),
    'TF-IDF Vectorizer': TfidfVectorizer()
}

In [28]:
# search the best model
best_model = None
best_vectorizer = None
best_score = 0

warnings.filterwarnings('ignore')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

for model_name, model in dict_models.items():
    for vectorizer_name, vectorizer in dict_vectorizers.items():
        pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('model', model)
        ])

        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        score = pipeline.score(X_test, y_test)
        print(f'{model_name} with {vectorizer_name} score: {score}')
        if score > best_score:
            best_score = score
            best_model = model_name
            best_vectorizer = vectorizer_name

print(f'Best model: {best_model} with {best_vectorizer} score: {best_score}')


Logistic Regression with Count Vectorizer score: 0.5967330270546197
Logistic Regression with TF-IDF Vectorizer score: 0.6248085758039816
Random Forest with Count Vectorizer score: 0.5860132720775906
Random Forest with TF-IDF Vectorizer score: 0.5957120980091883
Gradient Boosting with Count Vectorizer score: 0.5696784073506891
Gradient Boosting with TF-IDF Vectorizer score: 0.5706993363961205
Decision Tree with Count Vectorizer score: 0.50944359367024
Decision Tree with TF-IDF Vectorizer score: 0.4742215416028586
SVM with Count Vectorizer score: 0.597753956100051
SVM with TF-IDF Vectorizer score: 0.6232771822358346
Naive Bayes with Count Vectorizer score: 0.6064318529862175
Naive Bayes with TF-IDF Vectorizer score: 0.5456865747830526
KNN with Count Vectorizer score: 0.4869831546707504
KNN with TF-IDF Vectorizer score: 0.3394589076059214
Best model: Logistic Regression with TF-IDF Vectorizer score: 0.6248085758039816


We can see that the best model here is the logistic regression with the TF-IDF vectorizer which gives a score of 0.6248.
Now that we know which model is the best, we can search the best hyperparameters for this model.
To avoid overfitting, we will use cross-validation to find the best hyperparameters even if that means that the score will be lower than the one we got here.

In [29]:
# dictionary of models to be tested with their respective parameters
param_grid = {
    'Logistic Regression': {
        'model': LogisticRegression(),
        'params': {
            'C': [1, 5, 10],
            'penalty': ['l1', 'l2'],
            'max_iter': [100, 200, 300]
        }
    }
}

In [30]:
best_model_overall = None
best_params_overall = None
best_score_overall = 0

X = TfidfVectorizer().fit_transform(X)

model_instance = param_grid['Logistic Regression']["model"]
parameters = param_grid['Logistic Regression']["params"]

# apply grid search to find the best hyperparameters
# utilization of the cross validation technique to avoid overfitting
grid_search = GridSearchCV(model_instance, parameters, cv=10)
grid_search.fit(X, y)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

if best_score > best_score_overall:
    best_model_overall = best_model
    best_params_overall = best_params
    best_score_overall = best_score

print("Best Model:", best_model)
print("Best Parameters:", best_params)
print("Best Score:", best_score)

Best Model: LogisticRegression(C=1, max_iter=200)
Best Parameters: {'C': 1, 'max_iter': 200, 'penalty': 'l2'}
Best Score: 0.6180576806820787


Thanks to Gridsearch we have our final model with the best hyperparameters that are the following:
Best Parameters: {'C': 1, 'max_iter': 200, 'penalty': 'l2'}

In [31]:
# confusion matrix and classification report
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(C=1, max_iter=200, penalty='l2', random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

new_confusion_matrix = confusion_matrix(y_test, y_pred)

class_labels = ['1', '2', '3', '4', '5']

fig = px.imshow(
            new_confusion_matrix, 
            text_auto=True, 
            title="Confusion Matrix", width=1000, height=800,
            labels=dict(x="Predicted", y="True Label"),
            x=class_labels,
            y=class_labels,
            color_continuous_scale='Blues'
            )
fig.show()

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         1.0       0.70      0.82      0.75       351
         2.0       0.12      0.01      0.03       134
         3.0       0.47      0.26      0.33       231
         4.0       0.51      0.48      0.50       481
         5.0       0.68      0.84      0.75       762

    accuracy                           0.62      1959
   macro avg       0.49      0.48      0.47      1959
weighted avg       0.58      0.62      0.59      1959



We can see that we have good F1 scores for the extreme classes, less for the neutral classes and more especialy for the class which have just a few samples.