### Imports

In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2 and is not
# used in the visible cells; this import fails on newer versions.
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split

from utils.data import load_file
from utils.metrics import evaluate
warnings.filterwarnings('ignore')

### Load Data

In [2]:
# Load the cleaned reviews; the preview below shows a `text` and a `label` column.
df = pd.read_csv("../data/review_2022_clean.csv")
df.head()

Unnamed: 0,text,label
0,bought fender telecaster salesperson told orig...,negative
1,visit son restaurant desert home food authenti...,positive
2,danielle great job listened cut hair way reque...,positive
3,saw lot roaches bathroom woke bed large dark s...,negative
4,ordered pork fried rice beef chow mei fun teri...,negative


In [3]:
# Load the `_sw` variant of the cleaned reviews. Judging by the preview, its text
# appears to keep stop words ("i", "a", "that", ...) that are absent from `df`.
df_sw = pd.read_csv("../data/review_2022_clean_sw.csv")
df_sw.head()

Unnamed: 0,text,label
0,i bought a fender telecaster that the salesper...,negative
1,this is our go to for take out when i visit my...,positive
2,danielle did a great job she listened and cut ...,positive
3,we saw a lot of roaches in the bathroom when w...,negative
4,we ordered pork fried rice and beef chow mei f...,negative


### Train-Test Split

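Define a function that performs an 80/20 train-test split (with a fixed `random_state` of 42) on either the default dataset or the stop-word variant.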

In [4]:
def split(stop_words=False):
  """Split one of the notebook-level review frames into train and test parts.

  Args:
    stop_words: When truthy, split `df_sw`; otherwise split `df`.

  Returns:
    The four-way result of `train_test_split` on the frame's "text" and
    "label" columns: (X_train, X_test, y_train, y_test), with 20% of the rows
    held out for testing and `random_state=42`.
  """
  frame = df_sw if stop_words else df
  texts, labels = frame["text"], frame["label"]
  return train_test_split(texts, labels, test_size=0.2, random_state=42)

In [5]:
# Split `df`. In the visible cells this is the only place `y_test` is assigned,
# so it is the `y_test` that `train_evaluate` reports against.
X_train, X_test, y_train, y_test = split()

In [6]:
# Split `df_sw` with the same settings. NOTE(review): none of these four `*_sw`
# results are referenced by the visible cells below - confirm whether they are needed.
X_train_sw, X_test_sw, y_train_sw, y_test_sw = split(stop_words=True)

In [7]:
# Load precomputed artifacts through the project helper `load_file` (utils.data).
# NOTE(review): the ".pickle" names suggest these files are unpickled - only load
# files from a trusted source; confirm against utils.data.

# Rebinds `y_train` (previously assigned by the `split()` call); `y_test` is not
# reloaded here. Presumably the stored labels align row-for-row with every
# X_train_* matrix below - TODO confirm.
y_train = load_file("y_train.pickle")

# Unigram features.
X_train_uni = load_file("X_train_uni.pickle")
X_test_uni = load_file("X_test_uni.pickle")

# Unigram features, stop-word (`_sw`) variant.
X_train_uni_sw = load_file("X_train_uni_sw.pickle")
X_test_uni_sw = load_file("X_test_uni_sw.pickle")

# Bigram features.
X_train_bi = load_file("X_train_bi.pickle")
X_test_bi = load_file("X_test_bi.pickle")

# Bigram features, stop-word (`_sw`) variant.
X_train_bi_sw = load_file("X_train_bi_sw.pickle")
X_test_bi_sw = load_file("X_test_bi_sw.pickle")

# Unigram + bigram features.
X_train_uni_bi = load_file("X_train_uni_bi.pickle")
X_test_uni_bi = load_file("X_test_uni_bi.pickle")

# Unigram + bigram features, stop-word (`_sw`) variant.
X_train_uni_bi_sw = load_file("X_train_uni_bi_sw.pickle")
X_test_uni_bi_sw = load_file("X_test_uni_bi_sw.pickle")

### Modeling

Define the hyperparameter grid to search: five log-spaced values of `C` from 0.01 to 100.

In [8]:
# Hyperparameter grid for the search: C takes the values 0.01, 0.1, 1, 10 and 100.
# Candidates that are currently disabled, kept for reference:
#   'penalty': ['l1', 'l2']
#   'solver' : ['newton-cg', 'lbfgs', 'liblinear']
param_grid = dict(C=np.logspace(-2, 2, 5))

Define a function that builds a fresh grid search instance with 10-fold cross-validation.

In [9]:
def create_grid_search():
  """Build a new, unfitted grid search around a logistic regression model.

  Returns:
    A `GridSearchCV` over the notebook-level `param_grid`, using 10 folds and
    "balanced_accuracy" as the scoring metric. The wrapped estimator is a
    `LogisticRegression` with balanced class weights, `random_state=42` and
    `max_iter=10000`.
  """
  estimator = LogisticRegression(
      class_weight="balanced",
      random_state=42,
      max_iter=10000,
  )
  return GridSearchCV(
      estimator,
      param_grid=param_grid,
      cv=10,
      scoring="balanced_accuracy",
  )

In [12]:
def train_evaluate(X_train, X_test):
  """Tune, fit and evaluate a logistic regression model on one feature set.

  Fits the grid search from `create_grid_search()` on `X_train` against the
  notebook-level `y_train`, prints the best parameters, predicts on `X_test`
  and reports the predictions against the notebook-level `y_test`.

  Args:
    X_train: Training features passed to `GridSearchCV.fit`. Assumes rows align
      with the notebook-level `y_train` - verify when adding new feature sets.
    X_test: Test features passed to `GridSearchCV.predict`. Assumes rows align
      with the notebook-level `y_test`.

  Returns:
    The fitted `GridSearchCV` instance.
  """
  grid_search = create_grid_search()
  grid_search.fit(X_train, y_train)

  print("Best Parameters:")
  print(grid_search.best_params_)

  y_pred = grid_search.predict(X_test)
  # Project helper from utils.metrics; what it reports is not visible in this notebook.
  evaluate(y_pred)

  print("Classification Report:")
  print(classification_report(y_test, y_pred))

  print("Confusion Matrix:")
  ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
  plt.show()

  # Return the fitted search; the previous `return search` referenced an
  # undefined name and raised NameError after all the work was done.
  return grid_search

#### Unigrams

In [11]:
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so no
# results are recorded for the unigram run.
train_evaluate(X_train_uni, X_test_uni)

KeyboardInterrupt: 

#### Unigrams + Stop Words

In [15]:
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so no
# results are recorded for this run either.
train_evaluate(X_train_uni_sw, X_test_uni_sw)

KeyboardInterrupt: 

#### Bigrams

In [96]:
# Tune, fit and evaluate on the bigram feature matrices.
train_evaluate(X_train_bi, X_test_bi)

#### Bigrams + Stop Words

In [None]:
# Tune, fit and evaluate on the `_sw` bigram feature matrices.
train_evaluate(X_train_bi_sw, X_test_bi_sw)

#### Unigrams + Bigrams

In [None]:
# Tune, fit and evaluate on the combined unigram + bigram feature matrices.
train_evaluate(X_train_uni_bi, X_test_uni_bi)

#### Unigrams + Bigrams + Stop Words

In [None]:
# Tune, fit and evaluate on the `_sw` unigram + bigram feature matrices.
# NOTE(review): the return value is not bound to a name, while the sanity-check
# cells below use `model_uni_bi_sw`, which no visible cell defines.
train_evaluate(X_train_uni_bi_sw, X_test_uni_bi_sw)

#### Sanity Check

Positive review?

In [None]:
# NOTE(review): `model_uni_bi_sw` is not defined in any visible cell, so this
# fails on a fresh kernel. It is given raw text, so it presumably bundles a
# vectorizer with the classifier (e.g. a pipeline) - confirm where it is built.
model_uni_bi_sw.predict(["The food is great! Especially the unagi!"])

Negative review?

In [None]:
# Probe with plainly negative wording.
model_uni_bi_sw.predict(["The service sucks! I will never come back again!"])

Neutral review?

In [None]:
# Probe with mixed wording (one mild positive and one mild negative remark).
model_uni_bi_sw.predict(["The food is decent but the price is a bit high."])

Sarcastic review?

In [None]:
# Probe with sarcasm: positive words on the surface, negative intent.
model_uni_bi_sw.predict(["The food is so good that I think you need next level of taste buds to appreciate it."])

Spam review?

In [None]:
# Probe with spam-like text that is not a review at all.
model_uni_bi_sw.predict(["Personal loan with low interest - call 0123456789."])

In [None]:
# Second spam-like probe: same template, "quick approval" instead of "low interest".
model_uni_bi_sw.predict(["Personal loan with quick approval - call 0123456789."])

Random text?

In [None]:
# Probe with a short sentence unrelated to reviews.
model_uni_bi_sw.predict(["Market is bullish."])

In [None]:
# Same unrelated opening sentence, followed by a second unrelated sentence.
model_uni_bi_sw.predict(["Market is bullish. Forgot what's the next part of the sentence already."])

Emoji?

In [None]:
# Probe with emoji-only input (smiling faces, no words).
model_uni_bi_sw.predict(["🙂🙂🙂"])

In [None]:
# Probe with emoji-only input (angry faces, no words).
model_uni_bi_sw.predict(["🤬🤬🤬"])