# Machine Learning Project 02

# Load the data

In [None]:
import numpy as np
import pandas as pd

# Training inputs and the targets later passed to the classifiers' fit().
x_train_df = pd.read_csv('data/data_reviews/x_train.csv')
y_train_df = pd.read_csv('data/data_reviews/y_train.csv')

# Raw review strings from the 'text' column, as a plain Python list.
tr_text_list = x_train_df['text'].values.tolist()
# Print every review for a visual inspection of the raw data.
for text in tr_text_list:
    print(text)

# preprocessing the data

Natural Language ToolKit (nltk) is used to preprocess the data.

1. Turn all sentences to lowercase
2. Expand contractions (e.g. "can't" becomes "cannot")
3. Delete punctuation
4. Remove stop words
5. Stemming (Porter stemmer)

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

# word count
# Fit a bag-of-words vocabulary on the raw (not yet preprocessed) reviews and
# print the shape of the resulting document-term count matrix.
cv = CountVectorizer()
word_count_vector = cv.fit_transform(tr_text_list)
print(word_count_vector.shape)

# Contraction -> expansion lookup table.
#
# Keys are lowercase contractions; preprocess() looks up each
# whitespace-separated word of the lowercased review in this table.
# Replacements are lowercase too, so they stay consistent with the lowercased
# text and can be matched against NLTK's lowercase stop-word list.
#
# Each contraction appears exactly once: a repeated key in a dict literal
# silently overrides the earlier entry.
appos = {
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    # "i'd" is ambiguous ("i would" / "i had"); use "would" like every other
    # "'d" entry in this table.
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "isn't": "is not",
    "it's": "it is",
    "it'll": "it will",
    "i've": "i have",
    "let's": "let us",
    "mightn't": "might not",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "we'd": "we would",
    "we're": "we are",
    "weren't": "were not",
    "we've": "we have",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    # Only matches when the suffix stands alone as its own word.
    "'re": " are",
    "wasn't": "was not",
    # Previously " will", which dropped the subject.
    "we'll": "we will",
}

# Preprocess Data
def preprocess(review_arr):
    """Turn raw review strings into lists of stemmed, alphabetic tokens.

    Each review goes through these steps, in order:
      1. lowercase the text;
      2. expand contractions found in the module-level ``appos`` table
         (looked up per whitespace-separated word);
      3. tokenize with NLTK's ``word_tokenize``;
      4. drop NLTK's English stop words;
      5. keep only purely alphabetic tokens (drops punctuation and numbers);
      6. stem each remaining token with the Porter stemmer.

    Args:
        review_arr: iterable of review strings.

    Returns:
        A list with one entry per review, each a list of stemmed tokens.
    """
    # Built once per call rather than once per review.
    # NOTE(review): NLTK's English stop list includes negations such as
    # "not" -- confirm that dropping them is intended for sentiment features.
    stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()

    processed = []
    for review in review_arr:
        # Expand contractions word by word, then lowercase again because a
        # replacement taken from the table may itself contain capitals.
        expanded = " ".join(
            appos.get(word, word) for word in review.lower().split()
        ).lower()

        tokens = word_tokenize(expanded)

        # The previous stop-word loop only rebound its loop variable, so no
        # stop word was ever removed; filter into a new list instead.
        kept = [
            token for token in tokens
            if token not in stop_words and token.isalpha()
        ]

        # Porter stemming (suffix stripping), not dictionary lemmatization.
        processed.append([porter.stem(token) for token in kept])

    return processed


# Preview the preprocessed tokens of the first 20 reviews. Slicing (instead
# of indexing 0..19) also works when fewer than 20 reviews are loaded.
p = preprocess(tr_text_list)
for tokens in p[:20]:
    print(tokens)

## Vectorize preprocessed data to feature vectors using Bag of Words Model and TF-IDF (TfidfVectorizer and TfidfTransformer)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = TfidfVectorizer()

# feature transform training set
# The CSVs are re-read here so this cell can run on its own.
x_train_df = pd.read_csv('data/data_reviews/x_train.csv')
y_train_df = pd.read_csv('data/data_reviews/y_train.csv')
tr_text_list = x_train_df['text'].values.tolist()
x_tr_pre = preprocess(tr_text_list)
# TfidfVectorizer expects one string per document, so re-join the tokens.
x_tr = [" ".join(tokens) for tokens in x_tr_pre]
# The vocabulary and IDF weights are learned from the training set only.
x_train = vectorizer.fit_transform(x_tr)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); prefer the new accessor and fall back to the old
# one so the cell runs on either side of that change.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Maps each vocabulary term to its learned IDF weight.
tfidf = dict(zip(feature_names, vectorizer.idf_))

# feature transform testing set
x_test_df = pd.read_csv('data/data_reviews/x_test.csv')
te_text_list = x_test_df['text'].values.tolist()
x_te_pre = preprocess(te_text_list)
x_te = [" ".join(tokens) for tokens in x_te_pre]
# transform() only: the test set reuses the vocabulary fitted above.
x_test = vectorizer.transform(x_te)

print(x_train.shape, x_test.shape)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import GridSearchCV
import seaborn as sns

logreg = LogisticRegression()

# Grid Search for Hyperparameters
# NOTE(review): newer scikit-learn releases expect None rather than the string
# 'none' for an unpenalized model -- confirm against the installed version.
penalty = ['none','l2']
# 12 candidate values for C, log-spaced between 1 and 1e6.
C = np.logspace(0, 6, 12)
hyperparams = dict(C=C, penalty=penalty)
clf = GridSearchCV(logreg, hyperparams, cv=5, verbose=0)
# Pass the labels as a flat 1-D array, as the MLP and KNN cells do, instead
# of handing the estimator the label DataFrame itself.
clf.fit(x_train, y_train_df.values.ravel())
print("BEST SCORE: ")
print(clf.best_score_)
print("STANDARD DEVIATIONS")
print(clf.cv_results_['std_test_score'])
print("STANDARD DEVIATION FOR BEST SCORE:")
print(clf.cv_results_['std_test_score'][clf.best_index_])

# Mean cross-validation score laid out as C (rows) x penalty (columns).
pivot = pd.pivot_table(pd.DataFrame(clf.cv_results_), values='mean_test_score', index='param_C', columns='param_penalty')
ax = sns.heatmap(pivot)

# Second column of predict_proba (presumably the positive class -- confirm
# against clf.classes_), written one value per line.
yproba1_test = clf.predict_proba(x_test)[:, 1]
np.savetxt('logreg_yproba1_test.txt', yproba1_test)

## Multilayer Perceptron (MLP) Model with Bag-Of-Words

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
import seaborn as sns

# Base network; the grid search varies its hidden-layer layout and activation.
mlp = MLPClassifier(max_iter=120)

# Search space: four layer layouts crossed with four activation functions.
hyperparams = dict(
    hidden_layer_sizes=[(50,50,50), (50,100,50), (100,), (256,)],
    activation=['identity','logistic', 'relu', 'tanh'],
)

# 5-fold cross-validated search; labels are flattened to a 1-D array.
clf_mlp = GridSearchCV(mlp, hyperparams, cv=5, verbose=0)
clf_mlp.fit(x_train, y_train_df.values.ravel())

# Report the best mean CV score, the spread for every candidate, and the
# spread of the winning candidate.
print("BEST SCORE: ", clf_mlp.best_score_, sep="\n")
print("STANDARD DEVIATIONS", clf_mlp.cv_results_['std_test_score'], sep="\n")
print(
    "STANDARD DEVIATION FOR BEST SCORE:",
    clf_mlp.cv_results_['std_test_score'][clf_mlp.best_index_],
    sep="\n",
)

# Mean CV score laid out as layer layout (rows) x activation (columns).
pivot_mlp = pd.DataFrame(clf_mlp.cv_results_).pivot_table(
    values='mean_test_score',
    index='param_hidden_layer_sizes',
    columns='param_activation',
)

# Second column of predict_proba, saved one value per line.
yproba1_test = clf_mlp.predict_proba(x_test)[:, 1]
np.savetxt('mlp_yproba1_test.txt', yproba1_test)

In [None]:
# Heatmap of the mean CV score for each (hidden layer layout, activation) pair.
ax_mlp = sns.heatmap(pivot_mlp)

## K Nearest Neighbors with Bag-Of-Words

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Search space: neighbourhood sizes crossed with the two vote weightings.
# NOTE(review): 15 is absent from the otherwise consecutive odd values --
# confirm whether that is intentional.
hyperparams = dict(
    n_neighbors=[1,3,5,7,9,11,13,17,19,21,23,25,50,100],
    weights=['uniform', 'distance'],
)

# 5-fold cross-validated search; labels are flattened to a 1-D array.
clf_knn = GridSearchCV(KNeighborsClassifier(), hyperparams, cv=5, verbose=0)
clf_knn.fit(x_train, y_train_df.values.ravel())

# Report the best mean CV score, the spread for every candidate, and the
# spread of the winning candidate.
print("BEST SCORE: ", clf_knn.best_score_, sep="\n")
print("STANDARD DEVIATIONS", clf_knn.cv_results_['std_test_score'], sep="\n")
print(
    "STANDARD DEVIATION FOR BEST SCORE:",
    clf_knn.cv_results_['std_test_score'][clf_knn.best_index_],
    sep="\n",
)

# Mean CV score laid out as n_neighbors (rows) x weights (columns).
pivot_knn = pd.DataFrame(clf_knn.cv_results_).pivot_table(
    values='mean_test_score',
    index='param_n_neighbors',
    columns='param_weights',
)
ax_knn = sns.heatmap(pivot_knn)

# Second column of predict_proba, saved one value per line.
yproba1_test = clf_knn.predict_proba(x_test)[:, 1]
np.savetxt('knn_yproba1_test.txt', yproba1_test)

## Results

Logistic Regression classifier performs best among all. Other classifiers like MLP and KNN perform worse. This is partly because of the feature tuning by trying to normalize the input data into feature vectors using TF-IDF. 


I believe a large part of this success has to do with the feature tuning performed -- by so rigorously trying to normalize the input data into feature vectors using TF-IDF, it seems to me that the Logistic Regression classifier worked the best as it usually performs the best when attributes unrelated to the output variable, as well as closely related attributes are removed from the input set. This was most perceptible within the steps of removing stop words (unrelated attributes to the feature set), and the TF-IDF vectorization (penalizing closely related attributes). In combination with the regularization performed on the model, it seems that normalizing the data did make the output classes rather separable.

A KNN model may not have been as well suited to this task, as the number of output classes was limited to 2. Possible reasons why the KNN and MLP models did not perform as well, in conjunction with my original hypothesis, include underfitting and a lack of more extensive parameter tuning (which may have resulted in better performance).

The logistic regression model did best at predicting positive values on data sourced from Amazon, with a true positive rate of 0.9675, compared to 0.9475 and 0.935 for Yelp and IMDb, respectively. The model predicted true negatives best for Yelp reviews, with a true negative rate of 0.98, compared to 0.9775 and 0.9725 for Amazon and IMDb, respectively. The false negative rate (1 − true positive rate) for Yelp reviews was 0.0525, compared to 0.0325 and 0.065 for Amazon and IMDb reviews. Interestingly, the model also had its lowest false positive rate (1 − true negative rate) for Yelp reviews, at 0.02, compared to 0.0225 and 0.0275 for Amazon and IMDb, respectively. Overall, in terms of total accuracy, the model performed best on Amazon reviews, at an accuracy of 0.9725, compared to 0.96375 and 0.95375 for Yelp and IMDb, respectively. Possible reasons for this may be the number of sentimental (positive/negative) words within the feature sets for each type of review -- Amazon customers may use language more in line with how the model calculates an output class for sentiment, but conversely, my model may also overfit to Amazon reviews.

## Applying Best Classifier to Leaderboard

Using the test set on the GradeScope leaderboard, the logistic regression model gave an error rate of 0.16167 and an AUROC of 0.9061, putting me in 11th place on the leaderboard at the time of writing (out of 64). This matches up with what the training set performance alluded to, given that both the training set and the test set had the best performance compared to the other classifiers. This may suggest that the testing data may be similar in terms of tf-idf values to the training data, and the Logistic Regression model may have overfit on the training data, leading to an increased performance on the testing data (which is similar, in this case). It could also mean that the other models overfit on training data, which in this scenario is not as similar to the testing data.