# Resturant Review Sentiment - Light Gradient Boosting Ensemble
### Matthew Newton
* An ensemble method combining linear classification and Gradient boosting could provide improved results.
* The text features will be parsed through a linear model and the categorical and numerical features will be parsed through the LightGBM algorithm.
* The linear classifier should capture the sentiment from the review directly and the LightGBM can handle the relationship between the different features.

In [1]:
import pandas as pd
import pickle
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

import lightgbm as lgb

In [2]:
df_review = pd.read_pickle("./cleaned_data/reviews_cleaned_nltk.pickle")

In [3]:
# Split training data into train data, cross validation and test data
df_review = df_review.dropna()
features = ['text', 'title', 'type', 'priceInterval', 'date', 'review_length', 'rest_rating']
X_train, X_cv, y_train, y_cv = train_test_split(df_review[features], df_review['rating'], test_size = 0.30, random_state = 0)
X_cv, X_test, y_cv, y_test = train_test_split(X_cv[features], y_cv, test_size = 0.50, random_state = 0)

In [4]:
# Vectorize text data (review + review title) using TF-IDF
vectorizer = TfidfVectorizer(max_features=50000)
X_train_text = X_train['text'] + ' ' + X_train['title']
X_cv_text = X_cv['text'] + ' ' + X_cv['title']
X_test_text = X_test['text'] + ' ' + X_test['title']

# Fit and transform the training data, transform the validation data
X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_cv_tfidf = vectorizer.transform(X_cv_text)
X_test_tfidf = vectorizer.transform(X_test_text)

# Train Linear Model
lin_model = LogisticRegression(multi_class='multinomial', solver='saga', penalty='l2', C=2)
lin_model.fit(X_train_tfidf, y_train)

# Get predictions on both training and validation sets
train_lin_predictions = lin_model.predict(X_train_tfidf)
cv_lin_predictions = lin_model.predict(X_cv_tfidf)
test_lin_predictions = lin_model.predict(X_test_tfidf)

# Add these predictions as a new feature to the original datasets
X_train['lin_pred'] = train_lin_predictions
X_cv['lin_pred'] = cv_lin_predictions
X_test['lin_pred'] = test_lin_predictions



In [5]:
# Pipeline for categorical data (restaurant type and price)
categorical_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Pipeline for date data
numerical_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

# Combine all preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('lin_pred', categorical_pipeline, ['lin_pred']),
        ('type', categorical_pipeline, ['type']),
        ('price', categorical_pipeline, ['priceInterval']),
        ('num', numerical_pipeline, ['date']),
        ('review_length', numerical_pipeline, ['review_length']),
        ('rest_rating', numerical_pipeline, ['rest_rating'])
    ]
)

In [10]:
# Final pipeline combining preprocessing and LightGBM model
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('lgbm', lgb.LGBMClassifier(objective='multiclass', num_class=5, random_state=0, num_leaves=200, n_estimators=100, max_depth=-1, learning_rate=0.1))
]) # class_weight='balanced',

# Train the model
model_pipeline.fit(X_train[['lin_pred', 'type', 'priceInterval', 'date', 'review_length', 'rest_rating']], y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.056318 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2238
[LightGBM] [Info] Number of data points in the train set: 1430828, number of used features: 862
[LightGBM] [Info] Start training from score -2.640850
[LightGBM] [Info] Start training from score -2.769315
[LightGBM] [Info] Start training from score -2.192273
[LightGBM] [Info] Start training from score -1.317572
[LightGBM] [Info] Start training from score -0.720422


In [11]:
# Predict on validation set
y_train_pred = model_pipeline.predict(X_train)

# Evaluation metrics
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print("\nClassification Report:\n", classification_report(y_train, y_train_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_train, y_train_pred))

Accuracy: 0.7324674943459312

Classification Report:
               precision    recall  f1-score   support

           1       0.75      0.79      0.77    102019
           2       0.58      0.47      0.52     89720
           3       0.64      0.60      0.62    159770
           4       0.64      0.57      0.60    383154
           5       0.80      0.88      0.84    696165

    accuracy                           0.73   1430828
   macro avg       0.68      0.66      0.67   1430828
weighted avg       0.72      0.73      0.73   1430828


Confusion Matrix:
 [[ 80858  13969   5508    984    700]
 [ 20049  41922  23622   3006   1121]
 [  5758  14463  95135  37563   6851]
 [   838   1109  22081 218617 140509]
 [   533    252   2650  81227 611503]]


In [12]:
# Predict on validation set
y_cv_pred = model_pipeline.predict(X_cv)

# Evaluation metrics
print("Accuracy:", accuracy_score(y_cv, y_cv_pred))
print("\nClassification Report:\n", classification_report(y_cv, y_cv_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_cv, y_cv_pred))

Accuracy: 0.7051492795313855

Classification Report:
               precision    recall  f1-score   support

           1       0.70      0.74      0.72     21763
           2       0.49      0.39      0.43     19260
           3       0.59      0.55      0.57     34492
           4       0.61      0.54      0.57     81838
           5       0.79      0.87      0.83    149253

    accuracy                           0.71    306606
   macro avg       0.64      0.62      0.62    306606
weighted avg       0.69      0.71      0.70    306606


Confusion Matrix:
 [[ 16169   3799   1319    254    222]
 [  5136   7457   5713    693    261]
 [  1383   3761  18814   8897   1637]
 [   192    290   5203  44413  31740]
 [   169     68    592  19074 129350]]


In [None]:
# Predict on validation set
y_test_pred = model_pipeline.predict(X_test)

# Evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nClassification Report:\n", classification_report(y_test, y_test_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_test_pred))