In [45]:
import pandas as pd

In [46]:
df = pd.read_csv('preprocessed_data.csv')
# multiply by 2 to get integer values
df['Rating'] = (df['Rating']*2).astype(int)
print(df['Rating'].value_counts())
df.head()

10    3826
8     2373
2     1735
6     1192
4      684
9       69
7       47
5       19
3        9
Name: Rating, dtype: int64


Unnamed: 0,Review,Rating,Reviewer_encode,nb_review,nb_follower,Year,Month
0,ambience good food quite good saturday lunch c...,10,4973,1,2,2019,5
1,ambience good pleasant even service prompt foo...,10,765,3,2,2019,5
2,must try great food great ambience thnx servic...,10,954,2,3,2019,5
3,soumen das arun great guy behavior sincerety g...,10,6591,1,1,2019,5
4,food goodwe order kodi drumstick basket mutton...,10,1616,3,2,2019,5


In [47]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import plotly.express as px



X = df['Review'].astype(str)
y = df['Rating']

class Model:
    def __init__(self, X, y, model_architecture, vectorizer, random_seed=42, test_size=0.2) -> None:
        self.X = X
        self.y = y
        self.model_instance = model_architecture
        self.vectorizer = vectorizer
        self.random_seed = random_seed
        self.test_size = test_size

        self.pipeline = Pipeline([
        ("Vectorizer", self.vectorizer),
        ("Model_Architecture", self.model_instance)
        ])

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=self.test_size, random_state=self.random_seed)
    
    def fit(self):
        self.pipeline.fit(self.X_train, self.y_train)

    def predict(self):
        return self.pipeline.predict(self.X_test)

    
    def predict_proba(self):
        return self.pipeline.predict_proba(self.X_test)

    def report(self, class_labels):
        print(classification_report(self.y_test, self.predict(), target_names=class_labels))
        new_confusion_matrix = confusion_matrix(self.y_test, self.predict())
        fig = px.imshow(
            new_confusion_matrix, 
            text_auto=True, 
            title="Confusion Matrix", width=1000, height=800,
            labels=dict(x="Predicted", y="True Label"),
            x=class_labels,
            y=class_labels,
            color_continuous_scale='Blues'
            )
        fig.show()


In [48]:
model_multi_tfidf = Model(X, y, RandomForestClassifier(), TfidfVectorizer())

model_multi_tfidf.fit()

# retrieve the labels from the model because it's possible that some labels are not present in the test set
class_labels = [str(c) for c in model_multi_tfidf.y_test.unique()]
model_multi_tfidf.report(class_labels)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           6       0.62      0.84      0.72       347
          10       0.11      0.01      0.01       149
           8       0.00      0.00      0.00         5
           2       0.40      0.10      0.16       233
           4       0.00      0.00      0.00         7
           9       0.44      0.36      0.40       491
           7       0.00      0.00      0.00        14
           5       0.62      0.88      0.73       745

    accuracy                           0.58      1991
   macro avg       0.28      0.27      0.25      1991
weighted avg       0.51      0.58      0.52      1991

