# Data Processing

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Read the raw review table, then promote its 'index' column to the row index.
raw_reviews = pd.read_feather('data/reviews_raw.feather')
df = raw_reviews.set_index('index')
df

Unnamed: 0_level_0,review,voted_up
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,I wanted to wait until I had 100 hours into th...,True
1,"I don't know how these devs did it, but I have...",True
2,"Has more game play, less bugs, and is polished...",True
3,I am very impressed with this game. Its worth...,True
4,Imagine if Rust and Runescape had a baby (with...,True
...,...,...
73091,"70 hours in. No crashes, no slow-mo glitches, ...",True
73092,We still need a good GM mode.,False
73093,WWE 2K19 is a wrestling simulation game. It's ...,True
73094,"Alrightie, where do I begin?\nI fell in love w...",False


In [3]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=404)

# Plain Python lists of review text (X) and the voted_up label (y) for each split.
X_train = df_train['review'].tolist()
y_train = df_train['voted_up'].tolist()
X_test = df_test['review'].tolist()
y_test = df_test['voted_up'].tolist()

# Sanity check: features and labels must have matching lengths within each split.
tuple(map(len, (X_train, y_train, X_test, y_test)))

(58476, 58476, 14620, 14620)

In [4]:
# Persist the labels of each split as a single-column ('voted_up') feather file.
for labels, label_path in (
    (y_train, 'data/processed/y_train.feather'),
    (y_test, 'data/processed/y_test.feather'),
):
    pd.DataFrame({'voted_up': labels}).to_feather(label_path)

## Preprocessing

In [4]:
import preprocessing
from nltk.corpus import stopwords
from string import punctuation

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# First cleaning pass: run every raw review through preprocessing.remove_markdown
# (helper from the local `preprocessing` module, not shown in this notebook).
X_train_pre = [preprocessing.remove_markdown(review) for review in X_train]
X_test_pre = [preprocessing.remove_markdown(review) for review in X_test]

In [6]:
# Second cleaning pass: apply preprocessing.remove_punctuation to each review.
# NOTE: this cell rebinds X_train_pre / X_test_pre from their own previous values,
# so re-running it applies the step again to already-processed text.
X_train_pre = [preprocessing.remove_punctuation(review) for review in X_train_pre]
X_test_pre = [preprocessing.remove_punctuation(review) for review in X_test_pre]

In [7]:
# Third pass: apply preprocessing.tokenize to each cleaned review.
# NOTE: this cell rebinds X_train_pre / X_test_pre from their own previous values,
# so re-running it would feed the tokenizer its own output; run it exactly once
# after the punctuation pass.
X_train_pre = [preprocessing.tokenize(review) for review in X_train_pre]
X_test_pre = [preprocessing.tokenize(review) for review in X_test_pre]

In [8]:
# Fourth pass: apply preprocessing.lemmatize to each review's tokenizer output.
# NOTE: like the earlier passes, this rebinds X_train_pre / X_test_pre in place,
# so the cell is order-dependent and re-running it repeats the step.
X_train_pre = [preprocessing.lemmatize(tokens) for tokens in X_train_pre]
X_test_pre = [preprocessing.lemmatize(tokens) for tokens in X_test_pre]

In [9]:
# Glue each review's processed tokens back into one space-separated string,
# which is the input format the TF-IDF vectorizers below are fed.
X_train_join = list(map(' '.join, X_train_pre))
X_test_join = list(map(' '.join, X_test_pre))

In [10]:
# Stop-word list: NLTK's English stop words, every ASCII punctuation character,
# and a few extra symbols (backtick, curly apostrophe, ellipsis, newline).
extra_symbols = ['`', '’', '…', '\n']
stopwords_list = [*stopwords.words('english'), *punctuation, *extra_symbols]

## Feature Engineering

### TF-IDF

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# Unigram TF-IDF: learn a vocabulary capped at 8000 terms from the training text only,
# then project both splits onto that same vocabulary.
tf = TfidfVectorizer(max_features=8000, stop_words=stopwords_list)
train_tf_matrix = tf.fit_transform(X_train_join)
test_tf_matrix = tf.transform(X_test_join)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back on older installs.
tf_feature_names = (
    tf.get_feature_names_out()
    if hasattr(tf, 'get_feature_names_out')
    else tf.get_feature_names()
)

# .toarray() returns a plain ndarray (todense() returns the discouraged np.matrix).
# NOTE: densifying a (n_reviews x 8000) matrix is memory-heavy.
X_train_tf = pd.DataFrame(train_tf_matrix.toarray(), columns=tf_feature_names)
X_test_tf = pd.DataFrame(test_tf_matrix.toarray(), columns=tf_feature_names)

In [14]:
# Write the unigram TF-IDF feature tables for both splits to disk.
for tf_frame, tf_path in (
    (X_train_tf, 'data/processed/X_train_tf.feather'),
    (X_test_tf, 'data/processed/X_test_tf.feather'),
):
    tf_frame.to_feather(tf_path)

### TF-IDF with Bigrams

In [13]:
import pickle

In [14]:
# Unigram + bigram TF-IDF: vocabulary capped at 8000 n-grams, learned from the
# training text only. No stop_words are passed to this vectorizer.
tf_bigram = TfidfVectorizer(max_features=8000, ngram_range=(1,2))
train_bigram_matrix = tf_bigram.fit_transform(X_train_join)
test_bigram_matrix = tf_bigram.transform(X_test_join)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back on older installs.
bigram_feature_names = (
    tf_bigram.get_feature_names_out()
    if hasattr(tf_bigram, 'get_feature_names_out')
    else tf_bigram.get_feature_names()
)

# .toarray() returns a plain ndarray (todense() returns the discouraged np.matrix).
# NOTE: densifying a (n_reviews x 8000) matrix is memory-heavy.
X_train_bigram = pd.DataFrame(train_bigram_matrix.toarray(), columns=bigram_feature_names)
X_test_bigram = pd.DataFrame(test_bigram_matrix.toarray(), columns=bigram_feature_names)

In [None]:
# Write the unigram+bigram TF-IDF feature tables for both splits to disk.
for bigram_frame, bigram_path in (
    (X_train_bigram, 'data/processed/X_train_bigram.feather'),
    (X_test_bigram, 'data/processed/X_test_bigram.feather'),
):
    bigram_frame.to_feather(bigram_path)

In [16]:
# Serialize the fitted bigram vectorizer so the same vocabulary and IDF weights
# can be reloaded later. The `with` block guarantees the file is flushed and
# closed even if pickling raises (the bare open() call leaked the handle).
with open('final_model/vectorizer.pk', 'wb') as vectorizer_file:
    pickle.dump(tf_bigram, vectorizer_file)

### Document Embeddings

In [11]:
from gensim.sklearn_api import D2VTransformer
from sklearn.preprocessing import MinMaxScaler

In [12]:
# Document embeddings: fit the Doc2Vec transformer on the processed training
# reviews, then embed the test reviews with the fitted model.
d2v = D2VTransformer()
train_vectors = d2v.fit_transform(X_train_pre)
test_vectors = d2v.transform(X_test_pre)

# Rescale each embedding dimension so the training range maps onto [1, 2];
# the scaler is fit on the training vectors only and reused for the test vectors.
scaler = MinMaxScaler((1, 2))
X_train_embed = pd.DataFrame(scaler.fit_transform(train_vectors))
X_test_embed = pd.DataFrame(scaler.transform(test_vectors))

# Convert the default integer column labels to strings on both frames.
for embed_frame in (X_train_embed, X_test_embed):
    embed_frame.columns = embed_frame.columns.astype(str)

In [16]:
# Write the scaled document-embedding tables for both splits to disk.
for embed_table, embed_path in (
    (X_train_embed, 'data/processed/X_train_embed.feather'),
    (X_test_embed, 'data/processed/X_test_embed.feather'),
):
    embed_table.to_feather(embed_path)