(NumPy must be downgraded to 1.26.4 so that scikit-learn 1.4.2, the version required for Streamlit, can be installed.)

In [None]:
# 1. Downgrade numpy to a compatible version
!pip install numpy==1.26.4 --force-reinstall

# 2. Reinstall scikit-learn with compatible version
!pip install scikit-learn==1.4.2 --force-reinstall

# 3. Restart the kernel (Colab tip: this needs manual action)

Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m60.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.2.5
    Uninstalling numpy-2.2.5:
      Successfully uninstalled numpy-2.2.5
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but yo

In [None]:
# Show which scikit-learn release this kernel has imported.
import sklearn

sklearn_version = sklearn.__version__
print(sklearn_version)


1.4.2


In [None]:
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Data Overview

In [None]:
# Read the tab-separated reviews file into a DataFrame.
reviews_path = "Restaurant_Reviews.tsv"
dataset = pd.read_csv(reviews_path, sep='\t')


In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appends a stray "None" line.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB
None


In [None]:
# Count how many reviews fall into each value of the Liked label.
liked_counts = dataset['Liked'].value_counts()
print(liked_counts)

Liked
1    500
0    500
Name: count, dtype: int64


Findings
* The dataset contains two columns: `Review` (the review text) and `Liked` (a binary label indicating whether the review was good or bad).
* There are a total of 1000 records, 500 bad and 500 good reviews. No class imbalance observed.

# Data Pre-processing

In [None]:
# Preprocess: convert every review into a bag-of-words count vector.
# NOTE(review): the vectorizer is fitted on all reviews, i.e. before the
# train/test split further down, so its vocabulary also reflects held-out rows.
vectorizer = CountVectorizer(
    strip_accents='ascii',
    lowercase=True,
    stop_words='english',
    max_df=0.999,
)
review_counts = vectorizer.fit_transform(dataset['Review'])

# Feature matrix (converted from the vectorizer output to a dense array) and labels.
X = review_counts.toarray()
y = dataset['Liked']

Splitting dataset

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=123,
    test_size=0.2,
)

# Random Forest Model

In [None]:
# Fit a seeded random forest of 10 trees on the training split.
rf_params = {"n_estimators": 10, "random_state": 123}
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

In [None]:
# Score the fitted model on the held-out split and print per-class metrics.
y_pred = model.predict(X_test)
rf_report = classification_report(y_test, y_pred)
print(rf_report)

              precision    recall  f1-score   support

           0       0.67      0.76      0.71        95
           1       0.75      0.66      0.70       105

    accuracy                           0.70       200
   macro avg       0.71      0.71      0.70       200
weighted avg       0.71      0.70      0.70       200



# Voting Classifier Model

In [None]:
# Hard-voting ensemble of three text classifiers: every base model predicts a
# label for each review and the majority label is returned.
nb_clf = MultinomialNB()
lr_clf = LogisticRegression(max_iter=1000)
# Seeded like the other stochastic steps in this notebook so that reruns match.
svc_clf = LinearSVC(random_state=123)

ensemble = VotingClassifier(
    estimators=[
        ('nb', nb_clf),
        ('lr', lr_clf),
        ('svc', svc_clf),
    ],
    voting='hard',
)
ensemble.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = ensemble.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.75      0.79        95
           1       0.79      0.88      0.83       105

    accuracy                           0.81       200
   macro avg       0.82      0.81      0.81       200
weighted avg       0.82      0.81      0.81       200





# Support Vector Machine

In [None]:
# Stand-alone linear SVM baseline.
# It is bound to its own name so the random-forest `model` fitted earlier is
# not overwritten, and it is seeded like the other stochastic steps in this
# notebook so that reruns match.
svm_model = LinearSVC(random_state=123)
svm_model.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = svm_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.67      0.74        95
           1       0.74      0.86      0.80       105

    accuracy                           0.77       200
   macro avg       0.78      0.77      0.77       200
weighted avg       0.78      0.77      0.77       200





# Multinomial Naive Bayes

In [None]:
# Stand-alone Multinomial Naive Bayes baseline on the raw token counts.
# It is bound to its own name so the `model` variable used by earlier cells
# is not overwritten.
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = nb_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.65      0.75        95
           1       0.75      0.92      0.83       105

    accuracy                           0.80       200
   macro avg       0.82      0.79      0.79       200
weighted avg       0.81      0.80      0.79       200



# Model Comparison

1. Random Forest
* Accuracy: 0.70
* F1-score:
 * Class 0: 0.71
 * Class 1: 0.70
* Weakest model — lowest accuracy and lowest F1-score on both classes, although its two per-class F1-scores are almost identical

2. Voting Classifier
* Accuracy: 0.81
* F1-score:
 * Class 0: 0.79
 * Class 1: 0.83
* Top performer overall

3. Support Vector Machine (SVM)
* Accuracy: 0.77
* F1-score:
 * Class 0: 0.74
 * Class 1: 0.80
* Strong recall for class 1, but not as balanced as Voting Classifier

4. Multinomial Naive Bayes
* Accuracy: 0.80
* F1-score:
 * Class 0: 0.75
 * Class 1: 0.83
* Close second — great for class 1, weaker on class 0



In conclusion, all four models perform reasonably, and the Voting Classifier achieves the highest accuracy (0.81) and the best macro-average F1-score (0.81), so it is selected as the final model. Its precision, recall, and F1-score are all at least 0.75 for both classes. Note that its lead over Multinomial Naive Bayes (0.80 accuracy) is small and was measured on a single 200-review test split, so the ranking of those two models should be treated as tentative.

# Saving the Model

In [19]:
# Persist the fitted voting ensemble and the fitted vectorizer, each to its
# own joblib file.
model_file = 'sentiment_model_1.pkl'
vectorizer_file = 'vectorizer_1.pkl'
joblib.dump(ensemble, model_file)
joblib.dump(vectorizer, vectorizer_file)

['vectorizer_1.pkl']