In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.utils import resample
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook relies on: the tokenizer models,
# the POS tagger, and the stop-word lists.
for resource in ("punkt", "averaged_perceptron_tagger", "stopwords"):
    nltk.download(resource)

# English stop words, kept in a set so membership tests are cheap
stop_words = set(stopwords.words("english"))

# Load the Amazon reviews dataset.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
data_en = pd.read_pickle(r"C:\Users\phreb\Data\data_en.pickle")

# Combine the review headline and body into a single text column.
# Missing values are replaced by "" first: adding a string to a missing value
# yields a missing value, so a review with only a headline or only a body
# would otherwise lose its text entirely.
data_en["Review"] = (
    data_en["review_headline"].fillna("")
    + " "
    + data_en["review_body"].fillna("")
)

# Model input (review text) and target (star rating)
X = data_en["Review"]
y = data_en["star_rating"]

# Balance the classes: draw 40,000 reviews per star rating *without*
# replacement, i.e. every class is sampled down to the same size.
# Sampling without replacement needs at least 40,000 reviews in each class.
rs = [
    resample(X[y == sr], y[y == sr], replace=False, n_samples=40000, random_state=123)
    for sr in range(1, 6)
]
X_list = [X_part for X_part, _ in rs]
y_list = [y_part for _, y_part in rs]

# Stack the five per-rating samples into flat arrays
X_us = np.hstack(X_list)
y_us = np.hstack(y_list)

# Hold out 40% of the balanced data, then divide that hold-out into a
# validation part (75%) and a test part (25%). 0.25 is the value
# train_test_split uses when no size is given; it is spelled out here.
X_train, X_test_val, y_train, y_test_val = train_test_split(
    X_us, y_us, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test_val, y_test_val, test_size=0.25, random_state=42
)

# Sanity check: sizes of the balanced inputs and targets
print(X_us.shape)
print(y_us.shape)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\phreb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\phreb\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\phreb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(200000,)
(200000,)


In [3]:
def preprocess_text(text):
    """Tokenize a review and return its lowercased, non-stop-word tokens.

    Non-string input is converted with ``str()`` before tokenizing.
    Tokens whose lowercase form is in ``stop_words`` are dropped; the
    remaining tokens are returned lowercased, in their original order.
    """
    raw = text if isinstance(text, str) else str(text)
    # Lowercase lazily, then filter against the stop-word set in one pass
    lowered = (tok.lower() for tok in word_tokenize(raw))
    return [tok for tok in lowered if tok not in stop_words]

# Tokenize every review in each split (one token list per review)
X_train_tok = list(map(preprocess_text, X_train))
X_val_tok = list(map(preprocess_text, X_val))
X_test_tok = list(map(preprocess_text, X_test))

### Perform POS tagging on the tokenized reviews
> X_train_pos = [pos_tag(tokens) for tokens in X_train_tok]

> X_test_pos = [pos_tag(tokens) for tokens in X_test_tok]

### Word2Vec and Logistic Regression Model

In [5]:
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Fit Word2Vec embeddings on the tokenized training reviews only:
# 100-dimensional vectors, a context window of 5, words occurring fewer
# than 5 times ignored, trained with 4 worker threads.
w2v_model = Word2Vec(
    sentences=X_train_tok,
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
)

# Create feature vectors
def create_features(review_tokens, model, average=False):
    """Build one fixed-length feature vector per review from word vectors.

    Each review is represented by the element-wise sum of the vectors of its
    in-vocabulary tokens. Tokens missing from the model's vocabulary are
    skipped, and a review without any known token stays all zeros.

    Parameters
    ----------
    review_tokens : sequence of sequences of str
        One token list per review.
    model : object
        Trained embedding model exposing ``vector_size`` and a ``wv``
        attribute with ``key_to_index`` and ``get_vector``.
    average : bool, default False
        If True, divide each review's summed vector by the number of
        in-vocabulary tokens, so the feature magnitude no longer grows with
        review length. The default keeps the plain sum.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(len(review_tokens), model.vector_size)``.
    """
    # Hoist the attribute lookups out of the per-token loop
    word_vectors = model.wv
    vocab = word_vectors.key_to_index
    features = np.zeros((len(review_tokens), model.vector_size), dtype="float32")
    for i, tokens in enumerate(review_tokens):
        known = 0
        for token in tokens:
            if token in vocab:
                features[i] += word_vectors.get_vector(token)
                known += 1
        # Guard against division by zero for reviews with no known tokens
        if average and known:
            features[i] /= known
    return features

# Turn the tokenized train and test reviews into embedding-based features
X_train_features, X_test_features = (
    create_features(token_lists, w2v_model)
    for token_lists in (X_train_tok, X_test_tok)
)

# Fit the baseline logistic regression (up to 1000 solver iterations).
# NOTE: the recorded run of this cell still emitted a convergence warning
# suggesting a higher max_iter or scaled input data.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_features, y_train)

# Predict star ratings for the held-out test reviews
y_pred = lr_model.predict(X_test_features)

# Per-class precision / recall / F1 on the test set
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.59      0.76      0.67      3945
           2       0.45      0.33      0.38      4006
           3       0.41      0.35      0.38      3925
           4       0.45      0.34      0.39      4000
           5       0.54      0.74      0.62      4124

    accuracy                           0.51     20000
   macro avg       0.49      0.50      0.49     20000
weighted avg       0.49      0.51      0.49     20000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Decision Tree Model

In [6]:
from sklearn.tree import DecisionTreeClassifier

# Train the baseline decision tree. random_state is fixed so the fitted tree
# (and therefore the report below) is the same on every run; 42 matches the
# seed used for the random forest baseline.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_features, y_train)

# Make predictions on test set
y_pred_dt = dt_model.predict(X_test_features)

# Evaluate model performance (per-class precision / recall / F1)
print(classification_report(y_test, y_pred_dt))

              precision    recall  f1-score   support

           1       0.44      0.45      0.45      3945
           2       0.29      0.29      0.29      4006
           3       0.27      0.27      0.27      3925
           4       0.31      0.31      0.31      4000
           5       0.46      0.45      0.45      4124

    accuracy                           0.35     20000
   macro avg       0.35      0.35      0.35     20000
weighted avg       0.36      0.35      0.35     20000



### Random Forest Model

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Fit a seeded 100-tree random forest on the review features
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train_features, y_train)

# Predict star ratings for the held-out test reviews
y_pred = rf_model.predict(X_test_features)

# Per-class precision / recall / F1 on the test set
rf_report = classification_report(y_test, y_pred)
print(rf_report)

              precision    recall  f1-score   support

           1       0.55      0.68      0.61      3945
           2       0.38      0.40      0.39      4006
           3       0.36      0.32      0.34      3925
           4       0.42      0.39      0.41      4000
           5       0.63      0.58      0.60      4124

    accuracy                           0.47     20000
   macro avg       0.47      0.47      0.47     20000
weighted avg       0.47      0.47      0.47     20000



### Decision Tree Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the decision tree (2 * 6 * 5 * 5 = 300 combinations)
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10, 20, 30],
    'min_samples_leaf': [1, 2, 4, 8, 16],
}

# Base estimator for the search; seeded so cross-validation scores are
# repeatable between runs.
dt_model = DecisionTreeClassifier(random_state=42)

# 5-fold cross-validated exhaustive search using all available cores
grid_search_dt = GridSearchCV(dt_model, param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search_dt.fit(X_train_features, y_train)

# Print the best hyperparameters
print('Best hyperparameters:', grid_search_dt.best_params_)

# GridSearchCV (refit=True by default) has already retrained the best
# configuration on the full training data, so reuse that model instead of
# fitting a second, identical-by-parameters tree.
dt_model_final = grid_search_dt.best_estimator_

### Logistic Regression Tuning

In [None]:
# Define the hyperparameters to tune.
# Not every penalty works with every solver, so the grid is a list of
# sub-grids that only contain valid combinations; a single cross-product
# would spend most of the search on fits that fail immediately.
c_options = [0.01, 0.1, 1, 10, 100]
max_iter_options = [100, 500, 1000, 5000]
param_grid = [
    {
        # l2 is supported by every solver
        'penalty': ['l2'],
        'C': c_options,
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'max_iter': max_iter_options,
    },
    {
        # l1 is only supported by liblinear and saga
        'penalty': ['l1'],
        'C': c_options,
        'solver': ['liblinear', 'saga'],
        'max_iter': max_iter_options,
    },
    {
        # elasticnet is only supported by saga and requires l1_ratio
        'penalty': ['elasticnet'],
        'C': c_options,
        'solver': ['saga'],
        'l1_ratio': [0.25, 0.5, 0.75],
        'max_iter': max_iter_options,
    },
    {
        # No regularization: C has no effect, and liblinear does not support it.
        # NOTE(review): newer scikit-learn releases spell this penalty=None
        # instead of the string 'none' - confirm against the installed version.
        'penalty': ['none'],
        'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
        'max_iter': max_iter_options,
    },
]

# Create a logistic regression object
lr_model = LogisticRegression()

# 5-fold cross-validated exhaustive search using all available cores
grid_search_lr = GridSearchCV(lr_model, param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search_lr.fit(X_train_features, y_train)

# Print the best hyperparameters
print('Best hyperparameters:', grid_search_lr.best_params_)

# GridSearchCV (refit=True by default) has already retrained the best
# configuration on the full training data, so reuse that fitted model.
lr_model_final = grid_search_lr.best_estimator_

### Random Forest Tuning

In [None]:
# Define the hyperparameters to tune.
# 5 * 2 * 6 * 5 * 5 = 1500 combinations, each fitted once per fold (cv=5),
# so expect this search to run for a long time.
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10, 20, 30],
    'min_samples_leaf': [1, 2, 4, 8, 16],
}

# Base estimator for the search; seeded like the baseline random forest so
# score differences come from the hyperparameters, not from random variation.
rf_model = RandomForestClassifier(random_state=42)

# 5-fold cross-validated exhaustive search using all available cores
grid_search_rf = GridSearchCV(rf_model, param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search_rf.fit(X_train_features, y_train)

# Print the best hyperparameters
print('Best hyperparameters:', grid_search_rf.best_params_)

In [None]:
#data_en.to_csv("path", compression='zip')
#use stopwords from Sebastian
#interpret results