In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.utils import resample
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords

# Load the Amazon reviews dataset
data_en = pd.read_pickle(r"C:\Users\A9236\Data\data_en2.pickle")

# Define the input and output data
X = data_en["lem_pos_ner_rem"]
y = data_en["star_rating"]

# Upsample the minority classes to balance the dataset
rs = [resample(X[y == sr], y[y == sr], replace=False, n_samples=40000, random_state=123) for sr in [1,2,3,4,5]]
X_list = [rs[r][0] for r in range(5)]
y_list = [rs[r][1] for r in range(5)]
X_us = np.hstack(X_list)
y_us = np.hstack(y_list)

# Split the data into training, validation, and test sets
X_train, X_test_val, y_train, y_test_val = train_test_split(X_us, y_us, test_size= 0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test_val, y_test_val, random_state=42)

print(X_us.shape)
print(y_us.shape)

(200000,)
(200000,)


In [2]:
data_en.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,language,processed_reviews,lem_pos_ner_rem
0,US,12039526,RTIS3L2M1F5SM,B001CXYMFS,737716809,Thrustmaster T-Flight Hotas X Flight Stick,Video Games,5,0,0,N,Y,an amazing joystick. I especially love that yo...,"Used this for Elite Dangerous on my mac, an am...",2015-08-31,EN,use elit danger mac amaz joystick especi love ...,use_VERB this_PRON for_ADP on_ADP my_PRON mac_...
1,US,9636577,R1ZV7R40OLHKD,B00M920ND6,569686175,Tonsee 6 buttons Wireless Optical Silent Gamin...,Video Games,5,0,0,N,Y,Definitely a silent mouse... Not a single clic...,"Loved it, I didn't even realise it was a gami...",2015-08-31,EN,love even realis game mous type silent mous se...,love_VERB it_PRON I_PRON do_AUX not_PART even_...
2,US,2331478,R3BH071QLH8QMC,B0029CSOD2,98937668,Hidden Mysteries: Titanic Secrets of the Fatef...,Video Games,1,0,1,N,Y,One Star,poor quality work and not as it is advertised.,2015-08-31,EN,poor qualiti advertis,poor_ADJ quality_NOUN work_NOUN and_CCONJ not_...
3,US,52495923,R127K9NTSXA2YH,B00GOOSV98,23143350,GelTabz Performance Thumb Grips - PlayStation ...,Video Games,3,0,0,N,Y,"good, but could be bettee","nice, but tend to slip away from stick in inte...",2015-08-31,EN,nice tend slip away stick intens hard press ga...,nice_ADJ but_CCONJ tend_VERB to_PART slip_VERB...
4,US,14533949,R32ZWUXDJPW27Q,B00Y074JOM,821342511,Zero Suit Samus amiibo - Japan Import (Super S...,Video Games,4,0,0,N,Y,Great but flawed.,"Great amiibo, great for collecting. Quality ma...",2015-08-31,EN,great amiibo great collect qualiti materi desi...,great_ADJ amiibo_NOUN great_ADJ for_ADP collec...


def preprocess_text(text):
    if not isinstance(text, str):
        text = str(text)
    # Tokenize the text into words
    tokens = word_tokenize(text)
    # Remove stop words
    tokens = [token for token in tokens if token.lower() not in stop_words]
    # Convert to lowercase
    tokens = [token.lower() for token in tokens]
    return tokens

X_train_tok = [preprocess_text(text) for text in X_train]
X_val_tok = [preprocess_text(text) for text in X_val]
X_test_tok = [preprocess_text(text) for text in X_test]

### Word2Vec and Linear Regression Model

In [2]:
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Train word2vec model
w2v_model = Word2Vec(X_train, vector_size=100, window=5, min_count=5, workers=4)
#This will create a word2vec model with 100-dimensional vectors, a window size of 5

# Create feature vectors
def create_features(review_tokens, model):
    num_features = model.vector_size
    features = np.zeros((len(review_tokens), num_features), dtype="float32")
    for i, tokens in enumerate(review_tokens):
        for token in tokens:
            if token in model.wv.key_to_index:
                features[i] += model.wv.get_vector(token)
    return features

X_train_features = create_features(X_train, w2v_model)
X_test_features = create_features(X_test, w2v_model)

In [3]:
from sklearn.ensemble import RandomForestClassifier

# Train random forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_features, y_train)

# Make predictions on test set
y_pred = rf_model.predict(X_test_features)

# Evaluate model performance
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.36      0.49      0.41      3945
           2       0.30      0.29      0.29      4006
           3       0.27      0.22      0.24      3925
           4       0.28      0.23      0.25      4000
           5       0.43      0.44      0.44      4124

    accuracy                           0.33     20000
   macro avg       0.33      0.33      0.33     20000
weighted avg       0.33      0.33      0.33     20000



### Decision Tree Tuning

In [4]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameters to tune
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10, 20, 30],
    'min_samples_leaf': [1, 2, 4, 8, 16],
}

# Create a decision tree classifier object
dt_model = DecisionTreeClassifier()

# Create a GridSearchCV object
grid_search_dt = GridSearchCV(dt_model, param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search_dt.fit(X_train_features, y_train)

# Print the best hyperparameters
print('Best hyperparameters:', grid_search_dt.best_params_)

# Use the best hyperparameters to create the final model
dt_model_final = DecisionTreeClassifier(**grid_search_dt.best_params_)
dt_model_final.fit(X_train_features, y_train)

900 fits failed out of a total of 2000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\A9236\AppData\Local\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\A9236\AppData\Local\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\A9236\AppData\Local\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver newton-cg supports only 'l2' or 'none' penalti

Best hyperparameters: {'C': 100, 'max_iter': 5000, 'penalty': 'none', 'solver': 'sag'}




In [5]:
# Make predictions on test set
y_pred = dt_model_final.predict(X_test_features)

# Evaluate model performance
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.60      0.74      0.66      3945
           2       0.45      0.35      0.40      4006
           3       0.42      0.30      0.35      3925
           4       0.42      0.40      0.41      4000
           5       0.56      0.73      0.63      4124

    accuracy                           0.51     20000
   macro avg       0.49      0.50      0.49     20000
weighted avg       0.49      0.51      0.49     20000



In [8]:
# Save logistic regression model
import pickle
with open('logistic_regression_model.pkl', 'wb') as file:
    pickle.dump(lr_model_final, file)

In [6]:
#data_en.to_csv("path", compression='zip')
#use stopwords from Sebastian
#interpret results