# Processing Amazon reviews for Sentiment Analysis
Data source: "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Kitchen_v1_00.tsv.gz"

The following has been conducted in the notebook:

1- Data Preprocessing

2- Transforming text into vectors using Google's pre-trained word2vec, a word2vec model trained on the reviews, and TF-IDF

3- Applying different machine learning models to predict positive, negative sentiment. 

**Best result: an F1 score of 0.89 on the test set, achieved with the SVM on TF-IDF features**

In [None]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
import re
from bs4 import BeautifulSoup
pd.set_option('display.max_colwidth', None)
import requests
from io import BytesIO


# Download the gzipped TSV of Amazon US Kitchen reviews into memory.
url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Kitchen_v1_00.tsv.gz"
response = requests.get(url)
# Fail fast on an HTTP error instead of handing an error body to the gzip/TSV parser.
response.raise_for_status()
data_compressed = response.content
# Malformed rows are skipped silently (error_bad_lines=False, warn_bad_lines=False).
# NOTE(review): both flags were deprecated in pandas 1.3 in favour of on_bad_lines='skip'.
data = pd.read_csv(BytesIO(data_compressed), compression='gzip', sep='\t', error_bad_lines=False, warn_bad_lines=False)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Identical to the read_csv call that ends the previous cell: rebuilds `data`
# from the in-memory download (data_compressed) without another network request.
data = pd.read_csv(BytesIO(data_compressed), compression='gzip', sep='\t', error_bad_lines=False, warn_bad_lines=False)

In [None]:
def review_num_words(txt):
    """Return the number of whitespace-separated tokens in ``txt``."""
    tokens = txt.split()
    return len(tokens)

In [None]:
# Drop every row that has a missing value in any column and renumber the index from 0.
data = data.dropna().reset_index(drop=True)
# Number of whitespace-separated words in each review body.
data['review_length'] = data['review_body'].apply(review_num_words)

In [None]:
# Keep only the reviews whose body is longer than 54 words.
is_long_review = data['review_length'] > 54
data_df = data.loc[is_long_review]

In [None]:
# Sample 150K reviews: 50K positive (>3 stars), 50K negative (<3 stars), 50K neutral (3 stars).
# The masks are computed on data_df itself; the original built them from the unfiltered
# `data`, which only worked through index alignment. A fixed random_state makes the
# sample (and every result derived from it) reproducible after a kernel restart.
positive = data_df.loc[data_df['star_rating'] > 3].sample(n=50000, replace=False, random_state=200)
negative = data_df.loc[data_df['star_rating'] < 3].sample(n=50000, replace=False, random_state=200)
neutral = data_df.loc[data_df['star_rating'] == 3].sample(n=50000, replace=False, random_state=200)

In [None]:
# Stack the positive, negative and neutral samples and keep only the text and the rating.
# pd.concat replaces DataFrame.append, which is deprecated and was removed in pandas 2.0;
# like append, it keeps the original row labels. The explicit copy ensures the columns
# added later are written to an independent frame rather than to a slice.
reviews = pd.concat([positive, negative, neutral])[['review_body', 'star_rating']].copy()

In [None]:
# Non-null count per column; both should equal the 150,000 sampled rows.
reviews.count()

review_body    150000
star_rating    150000
dtype: int64

## Preprocessing Functions

In [None]:
# Remove HTML tags function
def remove_tags(txt):
    """Strip HTML markup from ``txt`` and return the remaining text.

    <style> and <script> elements are removed together with their contents.
    The text fragments that remain are whitespace-stripped and joined with
    single spaces.
    """
    soup = BeautifulSoup(txt, "html.parser")

    # The original loop called get_text() and discarded the result, which left
    # CSS/JS source in the output; decompose() actually removes the elements.
    for element in soup(['style', 'script']):
        element.decompose()

    return ' '.join(soup.stripped_strings)
    
# Remove URLs function
import re
def remove_urls(txt):
  """Delete every 'http' followed by one or more non-whitespace characters from ``txt``."""
  url_pattern = re.compile(r"http\S+")
  return url_pattern.sub("", txt)

!pip install contractions

# Apply contraction to words
import contractions
def contractionfunction(s):
  """Expand contractions in ``s`` one whitespace-separated token at a time.

  Each token is passed through contractions.fix and the results are joined
  back together with single spaces.
  """
  return ' '.join(contractions.fix(token) for token in s.split())


def remove_non_alphabetical(txt):
    """Replace each run of non-letter characters inside every token of ``txt`` with one space.

    ``txt`` is split on whitespace; within each token, any run of non-word
    characters, underscores or digits becomes a single space; the tokens are
    then joined with single spaces. The result is not whitespace-normalised,
    so it may contain consecutive or trailing spaces.
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence
    # (DeprecationWarning today, SyntaxWarning from Python 3.12).
    non_letters = re.compile(r'[\W_0-9]+')
    return ' '.join(non_letters.sub(' ', word) for word in txt.split())

from nltk.corpus import stopwords
nltk.download('stopwords')

# remove stop words function
def remove_stop_words(txt):
    """Return ``txt`` without the tokens found in NLTK's English stop-word list.

    Tokens are whitespace-separated and compared exactly as written (no case
    folding happens here); the kept tokens are joined with single spaces.
    """
    # Set membership is O(1); the original scanned the stop-word list once per token.
    stop = set(stopwords.words('english'))
    return ' '.join(word for word in txt.split() if word not in stop)

from nltk.stem import WordNetLemmatizer

def leammatize_review(txt):
  """Lemmatize every whitespace-separated token of ``txt`` and re-join with single spaces.

  WordNetLemmatizer.lemmatize is called with the word only, so the
  lemmatizer's default part of speech applies to every token.
  """
  lemmatizer = WordNetLemmatizer()
  return ' '.join(map(lemmatizer.lemmatize, txt.split()))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Start the cleaned column from the lower-cased raw text; the preprocessing
# steps in the next cell overwrite this column one after another.
reviews['clean_body'] = reviews['review_body'].str.lower()

In [None]:
# Apply the preprocessing steps in order; each one rewrites clean_body.
preprocessing_steps = (
    remove_tags,
    remove_urls,
    contractionfunction,
    remove_non_alphabetical,
    remove_stop_words,
    leammatize_review,
)
for step in preprocessing_steps:
    reviews['clean_body'] = reviews['clean_body'].apply(step)

In [None]:
# Derive the target labels from the star rating.
# binary_rate: 1 = positive (>3 stars), 0 = negative (<3 stars). 3-star rows are never
# assigned, so they stay NaN and are removed by the later dropna() calls.
reviews.loc[reviews['star_rating'] > 3, 'binary_rate'] = 1
reviews.loc[reviews['star_rating'] < 3, 'binary_rate'] = 0

# trinary_rate: 2 = positive, 1 = neutral (exactly 3 stars), 0 = negative.
# NOTE(review): report_print reads the class entry under the key '1.0', so these
# label columns are presumably float-typed; confirm before changing how they are built.
reviews.loc[reviews['star_rating'] > 3, 'trinary_rate'] = 2
reviews.loc[reviews['star_rating'] == 3, 'trinary_rate'] = 1
reviews.loc[reviews['star_rating'] < 3, 'trinary_rate'] = 0

## Word Embedding

## Section (a): Google Word2Vec

In [None]:
import gensim.downloader as api
# Load the pre-trained 'word2vec-google-news-300' vectors by name through gensim's downloader API.
wv = api.load('word2vec-google-news-300')



In [None]:
# Analogy-style query on the pre-trained vectors: 'america' and 'london' contribute
# positively, 'washington' negatively; only the single most similar word is returned.
wv.most_similar(positive=['america', 'london'], negative=['washington'], topn=1)

[('england', 0.569724440574646)]

In [None]:
# Same kind of query with domain words: 'kitchen' and 'farm' positive, 'chef' negative.
wv.most_similar(positive=['kitchen', 'farm'], negative=['chef'], topn=1)

[('dairy_farm', 0.5582115650177002)]

## Section(b): Training my Own Word2Vec

In [None]:
# Tokenise every cleaned review on whitespace: one list of words per review.
sent = list(map(str.split, reviews['clean_body']))

In [None]:
# Build the (still untrained) model: 300-dimensional vectors, a context window of 11,
# and a minimum frequency of 10 for a word to enter the vocabulary.
from gensim.models import Word2Vec

# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to `vector_size`.
w2v_model = Word2Vec(min_count=10,
                     window=11,
                     size=300)

In [None]:
# Find phrases in our dataset, such as "New York", so they can be treated as one token.
from gensim.models.phrases import Phrases, Phraser
# Learn phrase statistics from the tokenised reviews (min_count=10).
phrases = Phrases(sent, min_count=10)
# Wrap the learned statistics in a Phraser and apply it to the corpus.
bigram = Phraser(phrases)
sentences = bigram[sent]

In [None]:
# Add our words and phrases to the model's vocabulary.
w2v_model.build_vocab(sentences)

In [None]:
# Train our model on our vocab for 10 epochs and report the wall-clock time in minutes.
from time import time
t = time()

# total_examples is the corpus size recorded on the model (w2v_model.corpus_count).
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=10)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

Time to train the model: 5.8 mins


In [None]:
# Find the nearest vector for the same analogy query used on the pre-trained model.
# The query goes through w2v_model.wv: calling most_similar on the Word2Vec model
# itself is deprecated in gensim 3.x and no longer available in gensim 4.
w2v_model.wv.most_similar(positive=['america', 'london'], negative=['washington'], topn=1)

  """Entry point for launching an IPython kernel.


[('germany', 0.41902074217796326)]

In [None]:
# Query the trained vectors via .wv; most_similar on the model object itself is deprecated.
w2v_model.wv.most_similar(positive=['kitchen', 'farm'], negative=['chef'], topn=1)

  """Entry point for launching an IPython kernel.


[('downstairs', 0.38579726219177246)]

# Simple models

In [None]:
# Trained word2vec
import numpy as np

# Represent each review by the mean of the vectors of its in-vocabulary words.
# Vectors are read through w2v_model.wv: indexing the Word2Vec model directly is
# deprecated in gensim 3.x and no longer available in gensim 4.
indexs = reviews.index
reviews['word2vec'] = pd.Series(dtype=object)
for idx, review in zip(indexs, reviews['clean_body']):
  known_words = 0
  total = 0
  for word in review.split():
    try:
      total = total + w2v_model.wv[word]
    except KeyError:
      # Word is not in the trained vocabulary: leave it out of the mean.
      continue
    known_words += 1
  if known_words == 0:
    # No usable word: mark the row so the later dropna() removes it.
    # (np.nan rather than the np.NaN alias, which NumPy 2.0 removed.)
    reviews.at[idx, 'word2vec'] = np.nan
    continue
  # The mean is already a 1-D vector; the original reshape/transpose/[0] round trip was a no-op.
  reviews.at[idx, 'word2vec'] = total / known_words

  if sys.path[0] == '':


In [None]:
# Google pre-trained word2vec
import numpy as np

# Represent each review by the mean of the pre-trained vectors of its in-vocabulary words.
indexs = reviews.index
reviews['google_word2vec'] = pd.Series(dtype=object)
for idx, review in zip(indexs, reviews['clean_body']):
  known_words = 0
  total = 0
  for word in review.split():
    try:
      total = total + wv[word]
    except KeyError:
      # Word is missing from the pre-trained vocabulary: leave it out of the mean.
      continue
    known_words += 1
  if known_words == 0:
    # Bug fix: the original wrote this NaN into the 'word2vec' column, wiping the
    # trained-model embedding of the review instead of marking the Google one.
    reviews.at[idx, 'google_word2vec'] = np.nan
    continue
  # The mean is already a 1-D vector; the original reshape/transpose/[0] round trip was a no-op.
  reviews.at[idx, 'google_word2vec'] = total / known_words

### Dataset preparation

In [None]:
# Trained Word2Vec training and testing set
from sklearn.model_selection import train_test_split
# dropna() removes every row with a NaN in any of the three columns: the neutral
# reviews (binary_rate is NaN for 3 stars) and reviews whose embedding was set to NaN.
binary_reviews = reviews[['clean_body','word2vec', 'binary_rate']].dropna()
# Stack the per-review vectors into one array (one row per review).
X_word2vec = np.array(binary_reviews['word2vec'].values.tolist())
y_word2vec = binary_reviews['binary_rate']
# 80/20 split with a fixed seed.
X_train_word2vec, X_test_word2vec, y_train_word2vec, y_test_word2vec = train_test_split(X_word2vec, y_word2vec, test_size=0.2, random_state=200)

In [None]:
# TFIDF training and testing set
from sklearn.feature_extraction.text import TfidfVectorizer

# Same rows as the trained-word2vec binary dataset (binary_reviews).
X_tfidf = binary_reviews['clean_body']
y_tfidf = binary_reviews['binary_rate']
X_train, X_test, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, y_tfidf, test_size=0.2, random_state=200)
# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# Pretrained Google word2vec training and testing set
# dropna() removes the neutral reviews (binary_rate is NaN) and any review
# without a google_word2vec embedding.
google_binary_reviews = reviews[['clean_body','google_word2vec', 'binary_rate']].dropna()
# Stack the per-review vectors into one array (one row per review).
X_google_word2vec = np.array(google_binary_reviews['google_word2vec'].values.tolist())
y_google_word2vec = google_binary_reviews['binary_rate']
# 80/20 split with a fixed seed.
X_train_google_word2vec, X_test_google_word2vec, y_train_google_word2vec, y_test_google_word2vec = train_test_split(X_google_word2vec, y_google_word2vec, test_size=0.2, random_state=200)

In [None]:
def report_print(training_report, testing_report, positive_label='1.0'):
  """Print accuracy, precision, recall and F1 for the training and testing reports on one line.

  The eight values are printed comma-separated: the four training metrics
  first, then the four testing metrics.

  Args:
    training_report: dict with an 'accuracy' entry and one entry per class label
      holding 'precision', 'recall' and 'f1-score' (the shape produced by
      classification_report(..., output_dict=True) elsewhere in this notebook).
    testing_report: same structure, for the testing split.
    positive_label: key of the class whose precision/recall/F1 are reported.
      Defaults to '1.0', the key the original implementation hard-coded.

  Raises:
    KeyError: if a report lacks 'accuracy' or the ``positive_label`` entry.
  """
  def _summary(report):
    # Accuracy is report-wide; the other three metrics come from the chosen class entry.
    class_metrics = report[positive_label]
    return [report['accuracy'], class_metrics['precision'], class_metrics['recall'], class_metrics['f1-score']]

  values = _summary(training_report) + _summary(testing_report)
  print(', '.join(f'{value}' for value in values))

### Perceptron trained word2vec

In [None]:
from sklearn.linear_model import Perceptron
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

# 5-fold stratified grid search over the Perceptron's penalty type and strength, scored by F1.
clfPercep = Perceptron()
cv = StratifiedKFold(n_splits=5)
parameters = {
    'penalty': ('l2', 'l1'),
    'alpha': [0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3],
}
gridsearch = GridSearchCV(clfPercep, parameters, cv=cv, scoring='f1')

# Search on the trained-word2vec training split and show the winning configuration.
gridsearch.fit(X_train_word2vec, y_train_word2vec)
print("Best params: {}".format(gridsearch.best_params_))
print("Best f1 score: %.5f" % gridsearch.best_score_)

Best params: {'alpha': 0.0001, 'penalty': 'l1'}
Best f1 score: 0.80147


In [None]:
# Train the perceptron on the trained-word2vec features.
# NOTE(review): alpha and penalty are hard-coded copies of the best params printed
# by the grid search above; they must be updated by hand if the search is re-run.
clfPercep = Perceptron(alpha= 0.0001, penalty='l1')
clfPercep.fit(X_train_word2vec, y_train_word2vec)

Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
           fit_intercept=True, max_iter=1000, n_iter_no_change=5, n_jobs=None,
           penalty='l1', random_state=0, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=0, warm_start=False)

In [None]:
print("Perceptron model report on training and testing data: \n")
# Predict on both splits of the trained-word2vec features, then summarise each report.
train_pred = clfPercep.predict(X_train_word2vec)
test_pred = clfPercep.predict(X_test_word2vec)
training_report = classification_report(y_train_word2vec, train_pred, output_dict=True)
testing_report = classification_report(y_test_word2vec, test_pred, output_dict=True)
report_print(training_report, testing_report)

Perceptron model report on training and testing data: 

0.84135, 0.831537565521258, 0.8563074770091963, 0.8437407662759775, 0.84275, 0.831379009593953, 0.8592748397435898, 0.8450967837265428


### Perceptron TFIDF

In [None]:
# Perceptron with default hyper-parameters, trained on the TF-IDF features.
# This rebinds clfPercep, replacing the word2vec-trained perceptron.
clfPercep = Perceptron()
clfPercep.fit(X_train_tfidf, y_train_tfidf)

Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
           fit_intercept=True, max_iter=1000, n_iter_no_change=5, n_jobs=None,
           penalty=None, random_state=0, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=0, warm_start=False)

In [None]:
print("Perceptron model report on training and testing data: \n")
# Predict on both TF-IDF splits, then summarise each report.
train_pred = clfPercep.predict(X_train_tfidf)
test_pred = clfPercep.predict(X_test_tfidf)
training_report = classification_report(y_train_tfidf, train_pred, output_dict=True)
testing_report = classification_report(y_test_tfidf, test_pred, output_dict=True)
report_print(training_report, testing_report)

Perceptron model report on training and testing data: 

0.9461875, 0.9419787618505409, 0.9509946021591363, 0.9464652117142324, 0.85395, 0.8507996423959472, 0.8578725961538461, 0.8543214802254252


### Perceptron Google word2vec

In [None]:
# 5-fold stratified grid search over the Perceptron's penalty type and strength, scored by F1.
clfPercep = Perceptron()
cv = StratifiedKFold(n_splits=5)
parameters = {
    'penalty': ('l2', 'l1'),
    'alpha': [0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3],
}
gridsearch = GridSearchCV(clfPercep, parameters, cv=cv, scoring='f1')

# Search on the Google-word2vec training split and show the winning configuration.
gridsearch.fit(X_train_google_word2vec, y_train_google_word2vec)
print("Best params: {}".format(gridsearch.best_params_))
print("Best f1 score: %.5f" % gridsearch.best_score_)

Best params: {'alpha': 0.0003, 'penalty': 'l2'}
Best f1 score: 0.79345


In [None]:
print("Perceptron model report on training and testing data: \n")
# Predictions come from the fitted grid search object itself.
train_pred = gridsearch.predict(X_train_google_word2vec)
test_pred = gridsearch.predict(X_test_google_word2vec)
training_report = classification_report(y_train_google_word2vec, train_pred, output_dict=True)
testing_report = classification_report(y_test_google_word2vec, test_pred, output_dict=True)
report_print(training_report, testing_report)

Perceptron model report on training and testing data: 

0.7850375, 0.7313221012509377, 0.9014144342263095, 0.8075084789397687, 0.7835, 0.7309263192288842, 0.8962339743589743, 0.8051831188697922


### SVM trained word2vec

In [None]:
from sklearn.svm import LinearSVC
# Linear SVM with default hyper-parameters, trained on the trained-word2vec features.
clfSVM = LinearSVC()
clfSVM.fit(X_train_word2vec, y_train_word2vec)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [None]:
print("SVM model report on training and testing data: \n")
# Predict on both splits of the trained-word2vec features, then summarise each report.
train_pred = clfSVM.predict(X_train_word2vec)
test_pred = clfSVM.predict(X_test_word2vec)
training_report = classification_report(y_train_word2vec, train_pred, output_dict=True)
testing_report = classification_report(y_test_word2vec, test_pred, output_dict=True)
report_print(training_report, testing_report)

SVM model report on training and testing data: 

0.87375, 0.8720155192996418, 0.8761995201919233, 0.8741025129637016, 0.87155, 0.8685021369645165, 0.8752003205128205, 0.8718383636817161


### SVM TFIDF

In [None]:
# Linear SVM with default hyper-parameters, trained on the TF-IDF features.
# This rebinds clfSVM, replacing the word2vec-trained SVM.
clfSVM = LinearSVC()
clfSVM.fit(X_train_tfidf, y_train_tfidf)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [None]:
print("SVM model report on training and testing data: \n")
# Predict on both TF-IDF splits, then summarise each report.
train_pred = clfSVM.predict(X_train_tfidf)
test_pred = clfSVM.predict(X_test_tfidf)
training_report = classification_report(y_train_tfidf, train_pred, output_dict=True)
testing_report = classification_report(y_test_tfidf, test_pred, output_dict=True)
report_print(training_report, testing_report)

SVM model report on training and testing data: 

0.947, 0.9467978819062843, 0.9472710915633746, 0.9470344276220456, 0.89295, 0.8941602171072469, 0.8910256410256411, 0.8925901770932624


### SVM Google word2vec

In [None]:
# Linear SVM with default hyper-parameters, trained on the Google-word2vec features.
# This rebinds clfSVM, replacing the TF-IDF-trained SVM.
clfSVM = LinearSVC()
clfSVM.fit(X_train_google_word2vec, y_train_google_word2vec)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [None]:
print("SVM model report on training and testing data: \n")
# Predict on both Google-word2vec splits, then summarise each report.
train_pred = clfSVM.predict(X_train_google_word2vec)
test_pred = clfSVM.predict(X_test_google_word2vec)
training_report = classification_report(y_train_google_word2vec, train_pred, output_dict=True)
testing_report = classification_report(y_test_google_word2vec, test_pred, output_dict=True)
report_print(training_report, testing_report)

SVM model report on training and testing data: 

0.83935, 0.8398558702832549, 0.838764494202319, 0.8393098274568641, 0.83495, 0.8366072327994358, 0.8318309294871795, 0.8342122444879715


# Feedforward Neural Network

### Neural Network trained Word2vec on binary data

In [None]:
from sklearn.neural_network import MLPClassifier
# Feed-forward network with two hidden layers (50 and 10 units, ReLU), fixed seed,
# trained on the trained-word2vec binary data. max_iter is 10000 here, whereas the
# other MLP cells in this notebook use 300 or 1000.
mlp = MLPClassifier(hidden_layer_sizes=(50,10), activation='relu' ,random_state=1, max_iter=10000).fit(X_train_word2vec, y_train_word2vec)

In [None]:
print("MLP model report on training and testing on binary data: \n")
# Predict on both splits of the trained-word2vec features, then summarise each report.
train_pred = mlp.predict(X_train_word2vec)
test_pred = mlp.predict(X_test_word2vec)
training_report = classification_report(y_train_word2vec, train_pred, output_dict=True)
testing_report = classification_report(y_test_word2vec, test_pred, output_dict=True)
report_print(training_report, testing_report)

MLP model report on training and testing binary data: 

0.9869375, 0.9857409946404089, 0.9881797281087565, 0.9869588548751419, 0.83575, 0.8342480790340285, 0.8373397435897436, 0.8357910522369408


### Neural Network trained Word2vec on trinary data

In [None]:
# Create our X dataset for the three-class task.
# Rows with a NaN embedding or label are dropped first, as in the binary set-up:
# a NaN among the vectors would make np.array build a ragged object array that the
# classifier cannot consume.
trinary_reviews = reviews[['word2vec', 'trinary_rate']].dropna()
X_word2vec_trinary = np.array(trinary_reviews['word2vec'].values.tolist())
y_word2vec_trinary = trinary_reviews['trinary_rate']
X_train_word2vec_trinary, X_test_word2vec_trinary, y_train_word2vec_trinary, y_test_word2vec_trinary = train_test_split(X_word2vec_trinary, y_word2vec_trinary, test_size=0.2, random_state=200)

In [None]:
# Hyper-parameter grid for the three-class MLP: 1 x 2 x 2 x 3 x 3 x 3 x 1 = 108 candidates.
# NOTE(review): random_state is searched like a hyper-parameter here, so the
# "best" configuration also reflects a lucky seed.
parameters = {
    'hidden_layer_sizes': [(50,10)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05, 0.001],
    'learning_rate': ['constant','adaptive', 'invscaling'],
    'random_state': [1,2,3],
    'max_iter': [300]
}

# Exhaustive search using all available cores (n_jobs=-1); no cv or scoring is passed,
# so GridSearchCV's defaults apply.
clf = GridSearchCV(MLPClassifier(), parameters, verbose=2, n_jobs=-1)

clf.fit(X_train_word2vec_trinary, y_train_word2vec_trinary)
# Score on the training split itself, then show the selected parameters.
print(clf.score(X_train_word2vec_trinary, y_train_word2vec_trinary))
print(clf.best_params_)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 16.9min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 63.0min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 136.1min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 197.1min finished


0.7074333333333334
{'activation': 'tanh', 'alpha': 0.05, 'hidden_layer_sizes': (50, 10), 'learning_rate': 'constant', 'max_iter': 300, 'random_state': 2, 'solver': 'sgd'}




In [None]:
# Refit with the parameters reported by the grid search above, except that max_iter
# is raised from the searched 300 to 1000; verbose=True prints training progress.
mlp_trinary = MLPClassifier(activation='tanh', alpha=0.05, hidden_layer_sizes= (50, 10), learning_rate= 'constant', max_iter=1000, random_state= 2, solver= 'sgd', verbose=True).fit(X_train_word2vec_trinary, y_train_word2vec_trinary)

In [None]:
print("MLP model report on training and testing on trinary data: \n")
# Predict on both three-class splits, then summarise each report.
train_pred = mlp_trinary.predict(X_train_word2vec_trinary)
test_pred = mlp_trinary.predict(X_test_word2vec_trinary)
training_report = classification_report(y_train_word2vec_trinary, train_pred, output_dict=True)
testing_report = classification_report(y_test_word2vec_trinary, test_pred, output_dict=True)
report_print(training_report, testing_report)

MLP model report on training and testing on trinary data: 

0.8382333333333334, 0.7975259039966166, 0.7508461079036433, 0.7734823625922886, 0.6365, 0.5424452749599573, 0.5139099645928173, 0.5277922077922077


### Neural Network pre-trained google Word2vec on binary data

In [None]:
# MLP (hidden layers of 50 and 10 units, ReLU, fixed seed, max_iter=300) trained on
# the Google-word2vec binary data. This rebinds `mlp`, replacing the network trained
# on the trained-word2vec features.
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(50,10), activation='relu' ,random_state=1, max_iter=300).fit(X_train_google_word2vec, y_train_google_word2vec)



In [None]:
print("MLP model report on training and testing on binary data: \n")
# Predict on both Google-word2vec splits, then summarise each report.
train_pred = mlp.predict(X_train_google_word2vec)
test_pred = mlp.predict(X_test_google_word2vec)
training_report = classification_report(y_train_google_word2vec, train_pred, output_dict=True)
testing_report = classification_report(y_test_google_word2vec, test_pred, output_dict=True)
report_print(training_report, testing_report)

MLP model report on training and testing on binary data: 

0.914075, 0.9116609529487752, 0.9170831667333067, 0.9143640214276816, 0.8368, 0.8368758772809304, 0.8360376602564102, 0.8364565587734241


### Neural Network pre-trained google Word2vec on trinary data

In [None]:
# Three-class dataset on the Google-word2vec features.
# Rows with a NaN embedding or label are dropped first: a NaN among the vectors would
# make np.array build a ragged object array that the classifier cannot consume.
google_trinary_reviews = reviews[['google_word2vec', 'trinary_rate']].dropna()
X_word2vec_google_trinary = np.array(google_trinary_reviews['google_word2vec'].values.tolist())
y_word2vec_google_trinary = google_trinary_reviews['trinary_rate']
X_train_word2vec_google_trinary, X_test_word2vec_google_trinary, y_train_word2vec_google_trinary, y_test_word2vec_google_trinary = train_test_split(X_word2vec_google_trinary, y_word2vec_google_trinary, test_size=0.2, random_state=200)
# Same architecture as the binary MLP; this rebinds mlp_trinary.
mlp_trinary = MLPClassifier(hidden_layer_sizes=(50,10), activation='relu' ,random_state=1, max_iter=300).fit(X_train_word2vec_google_trinary, y_train_word2vec_google_trinary)



In [None]:
# The header previously said "binary data", but this cell evaluates the three-class model.
print("MLP model report on training and testing on trinary data: \n")
train_pred = mlp_trinary.predict(X_train_word2vec_google_trinary)
test_pred = mlp_trinary.predict(X_test_word2vec_google_trinary)
training_report = classification_report(y_train_word2vec_google_trinary, train_pred, output_dict=True)
testing_report = classification_report(y_test_word2vec_google_trinary, test_pred, output_dict=True)
# report_print shows precision/recall/F1 for the class stored under '1.0' only.
report_print(training_report, testing_report)

MLP model report on training and testing on binary data: 

0.8157666666666666, 0.750701907251428, 0.7718494923352578, 0.7611288343558281, 0.5903333333333334, 0.48189280540801543, 0.5048052604957005, 0.49308300395256915


## Binary Classification

In [None]:
# Trained word2vec, first 10 words only
import numpy as np

# Represent each review by the mean vector of its first 10 in-vocabulary words.
# The original loop never stopped at 10: it summed every known word and divided by a
# fixed 10, and it crashed (int has no reshape) when a review had no known word.
MAX_WORDS = 10
indexs = reviews.index
reviews['word2vec_10'] = pd.Series(dtype=object)
for idx, review in zip(indexs, reviews['clean_body']):
  count_words = 0
  x = 0
  for word in review.split():
    if count_words == MAX_WORDS:
      break
    try:
      # Read through .wv: indexing the Word2Vec model directly is deprecated.
      x = x + w2v_model.wv[word]
    except KeyError:
      # Word is not in the trained vocabulary: it does not count towards the 10.
      continue
    count_words = count_words + 1

  if count_words == 0:
    # No usable word: mark the row so the later dropna() removes it.
    reviews.at[idx, 'word2vec_10'] = np.nan
    continue
  # Divide by the number of vectors actually summed (10 unless the review is shorter).
  reviews.at[idx, 'word2vec_10'] = x / count_words

  if sys.path[0] == '':


In [None]:
# Binary dataset on the first-10-words features; dropna() removes the neutral reviews
# (binary_rate is NaN) and any review without a word2vec_10 embedding.
binary_reviews_10 = reviews[['clean_body','word2vec_10', 'binary_rate']].dropna()
# Create our X dataset
X_word2vec_10 = np.array(binary_reviews_10['word2vec_10'].values.tolist())
# Bug fix: the labels must come from the same filtered frame as the features. The
# original read them from binary_reviews, which can hold a different set of rows.
y_word2vec_10 = binary_reviews_10['binary_rate']
X_train_word2vec_10, X_test_word2vec_10, y_train_word2vec_10, y_test_word2vec_10 = train_test_split(X_word2vec_10, y_word2vec_10, test_size=0.2, random_state=200)

In [None]:
# Same MLP architecture as before (50 and 10 hidden units, ReLU, fixed seed, max_iter=300),
# trained on the first-10-words features.
mlp_10 = MLPClassifier(hidden_layer_sizes=(50,10), activation='relu' ,random_state=1, max_iter=300).fit(X_train_word2vec_10, y_train_word2vec_10)

In [None]:
print("MLP model report on training and testing on binary data: \n")
# Predict on both splits of the first-10-words features, then summarise each report.
train_pred = mlp_10.predict(X_train_word2vec_10)
test_pred = mlp_10.predict(X_test_word2vec_10)
training_report = classification_report(y_train_word2vec_10, train_pred, output_dict=True)
testing_report = classification_report(y_test_word2vec_10, test_pred, output_dict=True)
report_print(training_report, testing_report)

MLP model report on training and testing on binary data: 

0.9695625, 0.9736344617245986, 0.9652888844462215, 0.9694437124320797, 0.8408, 0.8441992306134846, 0.8352363782051282, 0.8396938878259995


### Trinary Classification

Classifying Amazon reviews into Positive, Neutral, and Negative. Achieved an F1 score of 0.58.

In [None]:
# Three-class dataset on the first-10-words features.
# The original built this "_10" dataset from the full-review 'word2vec' column, which
# only duplicated the earlier trinary experiment. Rows with a NaN embedding or label
# are dropped so np.array receives equally sized vectors.
trinary_reviews_10 = reviews[['word2vec_10', 'trinary_rate']].dropna()
X_word2vec_trinary_10 = np.array(trinary_reviews_10['word2vec_10'].values.tolist())
y_word2vec_trinary_10 = trinary_reviews_10['trinary_rate']
X_train_word2vec_trinary_10, X_test_word2vec_trinary_10, y_train_word2vec_trinary_10, y_test_word2vec_trinary_10 = train_test_split(X_word2vec_trinary_10, y_word2vec_trinary_10, test_size=0.2, random_state=200)

In [None]:
# The original call passed `solver` twice (solver='lbfgs' and solver='sgd'), which is a
# SyntaxError ("keyword argument repeated"). 'sgd' is kept because it is the solver the
# grid search selected and the one used for the other trinary MLP.
mlp_trinary_10 = MLPClassifier(activation='tanh', alpha=0.05, hidden_layer_sizes= (50, 10), learning_rate= 'constant', max_iter=300, random_state= 2, solver= 'sgd', verbose=True).fit(X_train_word2vec_trinary_10, y_train_word2vec_trinary_10)

In [None]:
# The header previously said "binary data", but this cell evaluates the three-class model.
print("MLP model report on training and testing on trinary data: \n")
train_pred = mlp_trinary_10.predict(X_train_word2vec_trinary_10)
test_pred = mlp_trinary_10.predict(X_test_word2vec_trinary_10)
training_report = classification_report(y_train_word2vec_trinary_10, train_pred, output_dict=True)
testing_report = classification_report(y_test_word2vec_trinary_10, test_pred, output_dict=True)
# report_print shows precision/recall/F1 for the class stored under '1.0' only.
report_print(training_report, testing_report)

MLP model report on training and testing on binary data: 

0.709525, 0.6116304509969668, 0.6146562062762343, 0.61313959574309, 0.6840666666666667, 0.5846339744235223, 0.5819967923015237, 0.5833124026724268
