# Data Processing

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw reviews and promote the stored 'index' column to the row index.
df = (
    pd.read_feather('data/reviews.feather')
    .set_index('index')
)
df

Unnamed: 0_level_0,review,voted_up
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,I wanted to wait until I had 100 hours into th...,True
1,"I don't know how these devs did it, but I have...",True
2,"Has more game play, less bugs, and is polished...",True
3,I am very impressed with this game. Its worth...,True
4,Imagine if Rust and Runescape had a baby (with...,True
...,...,...
90250,got earraped at the start 5/5 would recommend,True
90251,Best remake of the time <3,True
90252,its like 60 seconds.,True
90253,uhhh for some reason i got this game for free ...,True


In [3]:
# Print the column dtypes, non-null counts and memory footprint of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90229 entries, 0 to 90254
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   review    90229 non-null  object
 1   voted_up  90229 non-null  bool  
dtypes: bool(1), object(1)
memory usage: 1.5+ MB


In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
df_train, df_test = train_test_split(df, random_state=404, test_size=0.2)

# Plain Python lists of review texts (X) and their voted_up labels (y).
X_train = df_train['review'].tolist()
y_train = df_train['voted_up'].tolist()
X_test = df_test['review'].tolist()
y_test = df_test['voted_up'].tolist()

# Sanity check: features and labels must have matching lengths per split.
tuple(map(len, (X_train, y_train, X_test, y_test)))

(72183, 72183, 18046, 18046)

In [5]:
# Persist the label lists as single-column ('voted_up') feather files.
pd.DataFrame({'voted_up': y_train}).to_feather('data/processed/y_train.feather')
pd.DataFrame({'voted_up': y_test}).to_feather('data/processed/y_test.feather')

## Preprocessing

In [6]:
import preprocessing
from nltk.corpus import stopwords
from string import punctuation

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Stage 1: apply preprocessing.remove_markdown to every raw review.
X_train_pre = [preprocessing.remove_markdown(review) for review in X_train]
X_test_pre = [preprocessing.remove_markdown(review) for review in X_test]

In [8]:
# Stage 2: apply preprocessing.remove_punctuation to each review.
# NOTE: rebinds X_train_pre / X_test_pre in place, so re-running this cell
# applies the step to its own previous output.
X_train_pre = [preprocessing.remove_punctuation(review) for review in X_train_pre]
X_test_pre = [preprocessing.remove_punctuation(review) for review in X_test_pre]

In [9]:
# Stage 3: apply preprocessing.tokenize to each review.
# NOTE: rebinds X_train_pre / X_test_pre in place, so this cell must run
# exactly once after the punctuation step; re-running would tokenize its own output.
X_train_pre = [preprocessing.tokenize(review) for review in X_train_pre]
X_test_pre = [preprocessing.tokenize(review) for review in X_test_pre]

In [10]:
# Stage 4: apply preprocessing.lemmatize to each review's tokens.
# NOTE: rebinds X_train_pre / X_test_pre in place, so re-running this cell
# feeds the step its own previous output.
X_train_pre = [preprocessing.lemmatize(tokens) for tokens in X_train_pre]
X_test_pre = [preprocessing.lemmatize(tokens) for tokens in X_test_pre]

In [11]:
# Re-assemble each preprocessed review into a single space-separated string.
X_train_join = list(map(' '.join, X_train_pre))
X_test_join = list(map(' '.join, X_test_pre))

In [12]:
# Stop-word list: NLTK English stop words, every ASCII punctuation character,
# and a few extra symbols (backtick, curly apostrophe, ellipsis, newline).
stopwords_list = [*stopwords.words('english'), *punctuation, '`', '’', '…', '\n']

## Feature Engineering

### TF-IDF

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
# Fit a TF-IDF vectorizer (8000 most frequent terms, custom stop words) on the
# training reviews only, then apply the same vocabulary to the test reviews.
tf = TfidfVectorizer(max_features=8000, stop_words=stopwords_list)
X_train_tf = pd.DataFrame(tf.fit_transform(X_train_join).toarray())
X_test_tf = pd.DataFrame(tf.transform(X_test_join).toarray())

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tf, 'get_feature_names_out'):
    tf_feature_names = list(tf.get_feature_names_out())
else:
    tf_feature_names = tf.get_feature_names()

# Label the columns with the vocabulary terms (feather needs string column names).
X_train_tf.columns = tf_feature_names
X_test_tf.columns = tf_feature_names

In [None]:
# Write the dense TF-IDF feature frames to disk for later modelling.
X_train_tf.to_feather('data/processed/x_train_tf.feather')
X_test_tf.to_feather('data/processed/x_test_tf.feather')

### TF-IDF with Bigrams

In [None]:
# TF-IDF over unigrams and bigrams (top 10000 terms); no stop-word list is used here.
tf_bigram = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
)
# Fit on the training reviews only, then reuse the fitted vocabulary for test.
# NOTE(review): .todense() presumably yields numpy.matrix objects rather than
# ndarrays — confirm downstream consumers accept that type.
X_train_bigram = tf_bigram.fit_transform(X_train_join).todense()
X_test_bigram = tf_bigram.transform(X_test_join).todense()

### Document Embeddings

In [None]:
from gensim.sklearn_api import D2VTransformer
from sklearn.preprocessing import MinMaxScaler

In [None]:
from processing import Preprocessor, DenseTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

In [None]:
# Unigram TF-IDF pipeline: preprocess -> TF-IDF (10000 terms) -> dense output.
pipe_tf = Pipeline(steps=[
    ('preprocessor', Preprocessor()),
    ('tf-idf_vectorizer', TfidfVectorizer(max_features=10000)),
    ('to_dense', DenseTransformer()),
])

In [None]:
# Unigram+bigram TF-IDF pipeline; the preprocessor is told to keep stop words.
pipe_bigrams = Pipeline(steps=[
    ('preprocessor', Preprocessor(remove_stopwords=False)),
    ('tf-idf_vectorizer', TfidfVectorizer(max_features=10000, ngram_range=(1, 2))),
    ('to_dense', DenseTransformer()),
])

In [None]:
# Document-embedding pipeline: preprocess with split=True -> D2VTransformer ->
# MinMaxScaler with feature range (1, 2).
# NOTE(review): this rebinds `pipe_tf`, which an earlier cell assigned to the
# TF-IDF pipeline; after this cell runs the TF-IDF pipeline is no longer
# reachable under that name. Consider a distinct name once all users are known.
pipe_tf = Pipeline(steps=[
    ('preprocessor', Preprocessor(split=True)),
    ('document_vectorizer', D2VTransformer()),
    ('scaler', MinMaxScaler((1, 2))),
])

In [None]:
# Fit whichever pipeline is currently bound to `pipe_tf` on the training split only.
pipe_tf.fit(X_train, y_train)
# Transform the training split first, then the test split, with the fitted pipeline.
x_train_tf, x_test_tf = map(pipe_tf.transform, (X_train, X_test))