## 1 - Data Collection

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data/test_reviews.csv')
df

Unnamed: 0,review,voted_up
0,While it does feel like they needed a bit more...,True
1,Game is asbolutely good. The Night City is som...,True
2,This game has a JoJo reference.,True
3,"Cheers everyone, after 8 years we finally made...",True
4,made my penis to perfection in a call with fri...,True
...,...,...
1995,The game doesn't bring anything new to the tab...,False
1996,pp go smol ( ͡° ͜ʖ ͡°)\n\npp go big (˵ ͡☉ ͜ʖ ͡...,True
1997,"Great characters, nice city, thrilling storyli...",True
1998,So here is my review after all of this time.\n...,True


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   review    2000 non-null   object
 1   voted_up  2000 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 17.7+ KB


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
df_train, df_test = train_test_split(df, random_state=212)
X_train, y_train = df_train['review'], df_train['voted_up']
X_test, y_test = df_test['review'], df_test['voted_up']
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1500,), (1500,), (500,), (500,))

## 2 - EDA

TODO:
- frequency distribution
- total vocab
- word clouds

comparative EDA - compare values in both classes

watch topic 39 video at around 52:00

## 3 - Pre-Processing

### Tokenization

In [6]:
import nltk
from nltk import word_tokenize
import numpy as np
nltk.download('punkt')

In [7]:
def tokenize(review):
    return word_tokenize(review.lower())

In [68]:
X_train_tokenized = list(map(tokenize, X_train))
X_test_tokenized = list(map(tokenize, X_test))
len(X_train_tokenized), len(X_test_tokenized)

(1500, 500)

### Stop-Words Removal

In [9]:
from nltk.corpus import stopwords
from string import punctuation
nltk.download('stopwords')

In [69]:
stopwords_list = stopwords.words('english') + list(punctuation) + ["''", '""', '...', '``', '’', '…']
len(stopwords_list)

217

In [70]:
X_train_stopworded = [[word for word in review if word not in stopwords_list] for review in X_train_tokenized]
X_test_stopworded = [[word for word in review if word not in stopwords_list] for review in X_test_tokenized]
len(X_train_stopworded), len(X_test_stopworded)

(1500, 500)

### Lemmatization

In [16]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [71]:
lemmatizer = WordNetLemmatizer() 
X_train_lemmatized = [list(map(lemmatizer.lemmatize, review)) for review in X_train_stopworded]
X_test_lemmatized = [list(map(lemmatizer.lemmatize, review)) for review in X_test_stopworded]
len(X_train_lemmatized), len(X_test_lemmatized)

(1500, 500)

### Finalizing

In [72]:
X_train_preprocessed = [' '.join(review) for review in X_train_lemmatized]
X_test_preprocessed = [' '.join(review) for review in X_test_lemmatized]
len(X_train_preprocessed), len(X_test_preprocessed)

(1500, 500)

## 4 - Feature Engineering

### Bag of Words

In [43]:
from sklearn.feature_extraction.text import CountVectorizer

In [74]:
cv = CountVectorizer()
X_train_bow = pd.DataFrame(cv.fit_transform(X_train_preprocessed).todense(), columns=cv.get_feature_names())
X_test_bow = pd.DataFrame(cv.transform(X_test_preprocessed).todense(), columns=cv.get_feature_names())
X_train_bow.shape, X_test_bow.shape

((1500, 12616), (500, 12616))

### TF-IDF

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [76]:
tf = TfidfVectorizer()
X_train_tf = pd.DataFrame(tf.fit_transform(X_train_preprocessed).todense(), columns=tf.get_feature_names())
X_test_tf = pd.DataFrame(tf.transform(X_test_preprocessed).todense(), columns=tf.get_feature_names())
X_train_tf.shape, X_test_tf.shape

((1500, 12616), (500, 12616))

### Word Embedding

### n-grams

### PCA

## 5 - Model Building and Evaluation

### Basic Model

In [38]:
from sklearn.metrics import accuracy_score, f1_score

In [26]:
y_train.value_counts(normalize=True)

True     0.596667
False    0.403333
Name: voted_up, dtype: float64

In [34]:
y_hat_train_basic = [True]*len(y_train)
y_hat_test_basic = [True]*len(y_test)

In [42]:
basic_train_f1 = f1_score(y_train, y_hat_train_basic)
basic_test_f1 = f1_score(y_test, y_hat_test_basic)
basic_train_accuracy = accuracy_score(y_train, y_hat_train_basic)
basic_test_accuracy = accuracy_score(y_test, y_hat_test_basic)

print('Basic Model')
print(f'Training F1 Score:\t{round(basic_train_f1, 2)}\tAccuracy:\t{round(basic_train_accuracy, 2)}')
print(f'Testing F1 Score:\t{round(basic_test_f1, 2)}\tAccuracy:\t{round(basic_train_accuracy, 2)}')

Basic Model
Training F1 Score:	0.75	Accuracy:	0.6
Testing F1 Score:	0.77	Accuracy:	0.6


### Baseline Models

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [None]:
nb_classifier = MultinomialNB()
rf_classifier = RandomForestClassifier(n_estimators=100)

In [None]:
nb_classifier.fit(tf_idf_data_train, y_train)
nb_train_preds = nb_classifier.predict(tf_idf_data_train)
nb_test_preds = nb_classifier.predict(tf_idf_data_test)

In [None]:
rf_classifier.fit(tf_idf_data_train, y_train)
rf_train_preds = rf_classifier.predict(tf_idf_data_train)
rf_test_preds = rf_classifier.predict(tf_idf_data_test)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
nb_train_score = accuracy_score(y_train, nb_train_preds)
nb_test_score = accuracy_score(y_test, nb_test_preds)
rf_train_score = accuracy_score(y_train, rf_train_preds)
rf_test_score = accuracy_score(y_test, rf_test_preds)

print("Multinomial Naive Bayes")
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(nb_train_score, nb_test_score))
print('-'*70)
print('Random Forest')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(rf_train_score, rf_test_score))

naive bayes, random forests, word2vec?, svc, logistic regression

get important features from random forests (classifier.feature_importances_)

### Gridsearch

Don't need to do this for preliminaries, just add this into final notebooks.

## Word Vectorizer Model

In [None]:
from nltk import word_tokenize

In [None]:
X_train_tokenized = X_train.map(word_tokenize).values

In [None]:
X_train_tokenized[0]

In [None]:
total_vocabulary = set(word for review in X_train_tokenized for word in review)
len(total_vocabulary)
print('There are {} unique tokens in the dataset.'.format(len(total_vocabulary)))

In [None]:
import numpy as np

In [None]:
glove = {}
with open('data/glove.6B.50d.txt', 'rb') as f:
    for line in f:
        parts = line.split()
        word = parts[0].decode('utf-8')
        if word in total_vocabulary:
            vector = np.array(parts[1:], dtype=np.float32)
            glove[word] = vector

In [None]:
class W2vVectorizer(object):
    
    def __init__(self, w2v):
        # Takes in a dictionary of words and vectors as input
        self.w2v = w2v
        if len(w2v) == 0:
            self.dimensions = 0
        else:
            self.dimensions = len(w2v[next(iter(glove))])
    
    # Note: Even though it doesn't do anything, it's required that this object implement a fit method or else
    # it can't be used in a scikit-learn pipeline  
    def fit(self, X, y):
        return self
            
    def transform(self, X):
        return np.array([
            np.mean([self.w2v[w] for w in words if w in self.w2v]
                   or [np.zeros(self.dimensions)], axis=0) for words in X])

In [None]:
from gensim.models import word2vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

In [None]:
rf =  Pipeline([('Word2Vec Vectorizer', W2vVectorizer(glove)),
              ('Random Forest', RandomForestClassifier(n_estimators=100))])
svc = Pipeline([('Word2Vec Vectorizer', W2vVectorizer(glove)),
                ('Support Vector Machine', SVC())])
lr = Pipeline([('Word2Vec Vectorizer', W2vVectorizer(glove)),
              ('Logistic Regression', LogisticRegression())])

In [None]:
glove['game']

In [None]:
models = [('Random Forest', rf),
          ('Support Vector Machine', svc),
          ('Logistic Regression', lr)]

In [None]:
scores = [(name, cross_val_score(model, X_train_tokenized, y_train.values, cv=2).mean()) for name, model, in models]
scores

In [None]:
import tensorflow

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing import text, sequence