# Restaurant Review Sentiment - Light Gradient Boosting
### Matthew Newton
* Following on from linear regression, we can consider an ensemble decision-tree approach
* Light Gradient Boosting is generally effective and efficient, so it should work well on the large dataset

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import lightgbm as lgb
import pickle

In [3]:
# Load the cleaned review table from disk.
# NOTE: unpickling executes arbitrary code, so only load files from a trusted source.
REVIEWS_PICKLE_PATH = "./cleaned_data/reviews_cleaned_nltk.pickle"
df_review = pd.read_pickle(REVIEWS_PICKLE_PATH)

In [4]:
# Carve the reviews into train (70%), cross-validation (15%) and test (15%) sets.
# Rows with a missing value in any column are dropped before splitting.
df_review = df_review.dropna()
features = ['text', 'title', 'type', 'priceInterval', 'date']

X_all = df_review[features]
y_all = df_review['rating']

# First split: hold out 30% of the rows; second split: halve the hold-out
# into the cross-validation and test sets. Both splits use a fixed seed.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X_all, y_all, test_size=0.30, random_state=0
)
X_cv, X_test, y_cv, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.50, random_state=0
)

In [5]:
# Text columns (review body and review title): TF-IDF, vocabulary capped at 50000 terms.
text_pipeline = Pipeline(steps=[('tfidf', TfidfVectorizer(max_features=50000))])

# Categorical columns (restaurant type and price interval): one-hot encoding;
# categories unseen during fit are ignored at transform time.
categorical_pipeline = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

# 'date' column: standardised with StandardScaler.
numerical_pipeline = Pipeline(steps=[('scaler', StandardScaler())])

# (name, transformer, column selection) triples. The text columns are selected
# by a plain string so TfidfVectorizer receives a 1-D column of documents; the
# other columns are selected by list so their transformers receive 2-D input.
# The same pipeline object is listed for more than one column; ColumnTransformer
# clones each transformer when it is fitted, so the entries do not share state.
column_transformers = [
    ('review', text_pipeline, 'text'),
    ('title', text_pipeline, 'title'),
    ('type', categorical_pipeline, ['type']),
    ('price', categorical_pipeline, ['priceInterval']),
    ('num', numerical_pipeline, ['date']),
]

# Combine all preprocessing steps
preprocessor = ColumnTransformer(transformers=column_transformers)

In [30]:
# Final pipeline combining preprocessing and LightGBM model.

# Number of leading training rows used for fitting. The recorded training log
# for this cell reports 200000 data points, whereas the code previously sliced
# only the first 10000 rows; the constant is set to match the recorded run so
# that re-running the notebook reproduces the metrics shown below.
# Lower it for quick experiments.
N_TRAIN_ROWS = 200000

# NOTE: class_weight='balanced' is deliberately left unset here (it was
# commented out in the original cell).
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('lgbm', lgb.LGBMClassifier(
        objective='multiclass',
        num_class=5,
        random_state=0,
        num_leaves=200,
        n_estimators=100,
        max_depth=-1,
        learning_rate=0.1,
    )),
])

# Train the model on the first N_TRAIN_ROWS rows; .iloc makes the positional
# slicing explicit for both the feature frame and the label series.
model_pipeline.fit(X_train.iloc[:N_TRAIN_ROWS], y_train.iloc[:N_TRAIN_ROWS])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 15.482897 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 923402
[LightGBM] [Info] Number of data points in the train set: 200000, number of used features: 14385
[LightGBM] [Info] Start training from score -2.639527
[LightGBM] [Info] Start training from score -2.776195
[LightGBM] [Info] Start training from score -2.191357
[LightGBM] [Info] Start training from score -1.315948
[LightGBM] [Info] Start training from score -0.720837


In [31]:
# Score the fitted pipeline on the cross-validation split.
y_cv_pred = model_pipeline.predict(X_cv)

# Compute the metrics first, then report them.
cv_accuracy = accuracy_score(y_cv, y_cv_pred)
cv_report = classification_report(y_cv, y_cv_pred)
cv_confusion = confusion_matrix(y_cv, y_cv_pred)

print("Accuracy:", cv_accuracy)
print("\nClassification Report:\n", cv_report)
print("\nConfusion Matrix:\n", cv_confusion)

Accuracy: 0.6943243119834576

Classification Report:
               precision    recall  f1-score   support

           1       0.69      0.70      0.70     21763
           2       0.46      0.33      0.38     19260
           3       0.57      0.49      0.53     34492
           4       0.59      0.54      0.57     81838
           5       0.78      0.87      0.82    149253

    accuracy                           0.69    306606
   macro avg       0.62      0.59      0.60    306606
weighted avg       0.68      0.69      0.69    306606


Confusion Matrix:
 [[ 15241   3717   1708    584    513]
 [  4878   6361   6089   1336    596]
 [  1386   3368  17031  10458   2249]
 [   226    351   4370  44581  32310]
 [   205    114    652  18612 129670]]


In [32]:
# Predict on the held-out test set (not the validation set).
y_test_pred = model_pipeline.predict(X_test)

# Evaluation metrics on the test split
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)
test_confusion = confusion_matrix(y_test, y_test_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", test_report)
print("\nConfusion Matrix:\n", test_confusion)

Accuracy: 0.6938644383997704

Classification Report:
               precision    recall  f1-score   support

           1       0.70      0.71      0.70     21548
           2       0.46      0.33      0.38     19290
           3       0.57      0.50      0.53     34129
           4       0.59      0.54      0.56     82029
           5       0.78      0.87      0.82    149610

    accuracy                           0.69    306606
   macro avg       0.62      0.59      0.60    306606
weighted avg       0.68      0.69      0.69    306606


Confusion Matrix:
 [[ 15205   3639   1676    548    480]
 [  4885   6340   6015   1415    635]
 [  1352   3423  16895  10248   2211]
 [   256    337   4473  44366  32597]
 [   171    117    581  18804 129937]]
