In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.utils import resample
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords

# Fetch the NLTK data packages needed below: the Punkt tokenizer models,
# the perceptron POS tagger and the stop-word lists.
for nltk_resource in ('punkt', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(nltk_resource)

# English stop words, kept as a set for fast membership tests while filtering
stop_words = set(stopwords.words('english'))

# Load the Amazon reviews dataset.
# NOTE(review): this is a machine-specific absolute path; adjust DATA_PATH when
# running on another machine. pd.read_pickle can execute arbitrary code while
# unpickling, so only load pickle files from a trusted source.
DATA_PATH = r"C:\Users\phreb\Data\data_en.pickle"
data_en = pd.read_pickle(DATA_PATH)

# Combine the review headline and body into a single text column.
# A missing headline or body is treated as an empty string; otherwise the
# concatenation yields NaN for that row and tokenization later fails on a
# non-string value.
data_en["Review"] = (
    data_en["review_headline"].fillna("")
    + " "
    + data_en["review_body"].fillna("")
)

# Define the input (review text) and output (star rating) data
X = data_en["Review"]
y = data_en["star_rating"]

# Balance the dataset by drawing the same number of reviews from every star
# rating WITHOUT replacement (replace=False): larger classes are downsampled
# and no review is duplicated.
STAR_RATINGS = [1, 2, 3, 4, 5]
MAX_SAMPLES_PER_CLASS = 200000

# resample(..., replace=False) raises ValueError when asked for more rows than
# a class contains, so cap the per-class size at the smallest class.
class_sizes = [int((y == sr).sum()) for sr in STAR_RATINGS]
n_per_class = min(MAX_SAMPLES_PER_CLASS, min(class_sizes))
if n_per_class == 0:
    raise ValueError(
        f"Cannot balance classes: at least one star rating in {STAR_RATINGS} "
        f"has no reviews (class sizes: {class_sizes})"
    )

# One (texts, labels) pair per star rating, each with n_per_class rows
rs = [
    resample(X[y == sr], y[y == sr], replace=False, n_samples=n_per_class, random_state=123)
    for sr in STAR_RATINGS
]
X_list = [X_class for X_class, _ in rs]
y_list = [y_class for _, y_class in rs]

# Concatenate the per-class pieces into one array of texts and one of labels
X_us = np.hstack(X_list)
y_us = np.hstack(y_list)

# Hold out 40% of the balanced data, leaving 60% for training.
X_train, X_test_val, y_train, y_test_val = train_test_split(
    X_us,
    y_us,
    test_size=0.4,
    random_state=42,
)
# Split the hold-out again into validation and test sets. No test_size is
# given, so scikit-learn's default of 0.25 applies: validation receives 75% of
# the hold-out (30% overall) and test receives 25% (10% overall).
X_val, X_test, y_val, y_test = train_test_split(
    X_test_val,
    y_test_val,
    random_state=42,
)

def _tokenize_without_stop_words(reviews):
    """Tokenize each review and drop tokens whose lowercase form is a stop word."""
    tokenized = []
    for review in reviews:
        kept = [word for word in word_tokenize(review) if word.lower() not in stop_words]
        tokenized.append(kept)
    return tokenized


# Tokenize the training and test reviews (the validation split is not
# tokenized here) and remove stop words
X_train_tokens = _tokenize_without_stop_words(X_train)
X_test_tokens = _tokenize_without_stop_words(X_test)

# Attach a part-of-speech tag to every remaining token
X_train_pos = list(map(pos_tag, X_train_tokens))
X_test_pos = list(map(pos_tag, X_test_tokens))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\phreb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\phreb\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\phreb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Fit word embeddings on the tokenized training reviews only.
# Settings: 100-dimensional vectors, a context window of 5 tokens, words that
# occur fewer than 5 times are ignored, and training uses 4 worker threads.
w2v_model = Word2Vec(
    sentences=X_train_tokens,
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
)

# Create feature vectors
def create_features(review_tokens, model, average=False):
    """Turn tokenized reviews into fixed-length vectors by pooling word embeddings.

    Args:
        review_tokens: Sequence of reviews, each an iterable of string tokens.
        model: Trained embedding model; must expose ``vector_size`` and a ``wv``
            attribute providing ``key_to_index`` and ``get_vector``.
        average: If False (default), each review vector is the sum of its
            in-vocabulary token vectors, so its magnitude grows with review
            length. If True, the sum is divided by the number of in-vocabulary
            tokens (mean pooling), which makes reviews of different lengths
            comparable.

    Returns:
        A float32 array of shape ``(len(review_tokens), model.vector_size)``.
        A review without any in-vocabulary token yields an all-zero row.
    """
    num_features = model.vector_size
    features = np.zeros((len(review_tokens), num_features), dtype="float32")
    # Loop-invariant lookups, hoisted out of the per-token loop
    keyed_vectors = model.wv
    vocabulary = keyed_vectors.key_to_index
    for i, tokens in enumerate(review_tokens):
        n_known = 0
        for token in tokens:
            # Tokens unknown to the model (e.g. dropped by min_count) are skipped
            if token in vocabulary:
                features[i] += keyed_vectors.get_vector(token)
                n_known += 1
        if average and n_known:
            features[i] /= n_known
    return features

# Build feature matrices for both splits with the same trained embedding model
# (training split first, then test split).
X_train_features, X_test_features = (
    create_features(split_tokens, w2v_model)
    for split_tokens in (X_train_tokens, X_test_tokens)
)

# Fit a logistic-regression classifier on the training features
# (solver iteration limit set to 1000).
lr_model = LogisticRegression(max_iter=1000).fit(X_train_features, y_train)

# Predict star ratings for the held-out test reviews
y_pred = lr_model.predict(X_test_features)

# Print the per-class classification report for the test set
report = classification_report(y_test, y_pred)
print(report)

Running this with 200k lines takes very long on my computer. Maybe we could try it on someone else's machine? Also, I have not integrated our own stop words yet, and we could also try a version with -1 (1-2 stars), 0 (3 stars) and 1 (4-5 stars) as the target variable. Accuracy with 10k lines was already at around 60%, so I think this is a step in the right direction, considering that logistic regression (LR) is a rather simple model for this task.