# Restaurant Review Sentiment - Linear Classification
### Matthew Newton
* Trying simple machine learning models as a first attempt is useful to get an idea of performance.
* A linear model (here, logistic regression) with a large feature set is usually a good start.

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import pickle

In [2]:
# Load the review DataFrame from the local pickle file.
# Unpickling can execute arbitrary code, so only load files from a trusted source.
reviews_path = "./cleaned_data/reviews_cleaned_nltk.pickle"
df_review = pd.read_pickle(reviews_path)

In [3]:
# Drop incomplete rows, then carve the data into 70% train / 15% validation / 15% test.
# NOTE(review): dropna() removes a row if ANY column is missing, including columns
# that are not used as features below - confirm that is intended.
df_review = df_review.dropna()
features = ['text', 'title', 'type', 'priceInterval', 'date']

# First split: keep 70% for training and hold out the remaining 30%.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    df_review[features],
    df_review['rating'],
    test_size=0.30,
    random_state=0,
)

# Second split: divide the holdout evenly into validation (cv) and test sets.
X_cv, X_test, y_cv, y_test = train_test_split(
    X_holdout,
    y_holdout,
    test_size=0.50,
    random_state=0,
)

In [4]:
# Free-text columns (review body and review title): TF-IDF limited to 50,000 terms.
text_pipeline = Pipeline(steps=[('tfidf', TfidfVectorizer(max_features=50000))])

# Categorical columns (restaurant type and price interval): one-hot encoding.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising at predict time.
categorical_pipeline = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Date column: standardised to zero mean / unit variance.
# NOTE(review): presumably 'date' is already numeric at this point - TODO confirm.
numerical_pipeline = Pipeline(steps=[('scaler', StandardScaler())])

# Map each input column to its transformer.
# The text columns are selected with a bare string so the vectorizer receives a
# 1-D column of documents; the other columns use a list so the encoder/scaler
# receive a 2-D frame.
column_transformers = [
    ('review', text_pipeline, 'text'),
    ('title', text_pipeline, 'title'),
    ('type', categorical_pipeline, ['type']),
    ('price', categorical_pipeline, ['priceInterval']),
    ('num', numerical_pipeline, ['date']),
]

# Combine all preprocessing steps into a single transformer
preprocessor = ColumnTransformer(transformers=column_transformers)

In [None]:
# Final pipeline: preprocessing followed by an L2-regularised logistic regression.
# - solver='saga' shuffles the data while optimising, so random_state is fixed to
#   make repeated runs reproducible (0 matches the seed used for the data split).
# - C=0.1 is the inverse regularisation strength (smaller = stronger penalty).
# - NOTE(review): multi_class is deprecated in scikit-learn >= 1.5; it is kept here
#   to preserve behaviour on the version this notebook was written for.
# - Options tried but not enabled: max_iter=1000, class_weight='balanced'.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('logreg', LogisticRegression(
        multi_class='multinomial',
        solver='saga',
        penalty='l2',
        C=0.1,
        random_state=0,
    )),
])

# Train the model
model_pipeline.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the validation split
y_cv_pred = model_pipeline.predict(X_cv)

# Print each metric under its heading: accuracy, per-class report, confusion matrix
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("\nClassification Report:\n", classification_report),
    ("\nConfusion Matrix:\n", confusion_matrix),
):
    print(heading, metric(y_cv, y_cv_pred))

In [None]:
# Predict on the test set (the split not used for training or validation)
y_test_pred = model_pipeline.predict(X_test)

# Evaluation metrics on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Accuracy:", test_accuracy)

test_report = classification_report(y_test, y_test_pred)
print("\nClassification Report:\n", test_report)

test_confusion = confusion_matrix(y_test, y_test_pred)
print("\nConfusion Matrix:\n", test_confusion)