# Modeling

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Read the stored reviews and use the saved 'index' column as the row index.
df = (
    pd.read_feather('data/reviews.feather')
    .set_index('index')
)
df

Unnamed: 0_level_0,review,voted_up
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,I wanted to wait until I had 100 hours into th...,True
1,"I don't know how these devs did it, but I have...",True
2,"Has more game play, less bugs, and is polished...",True
3,I am very impressed with this game. Its worth...,True
4,Imagine if Rust and Runescape had a baby (with...,True
...,...,...
90250,got earraped at the start 5/5 would recommend,True
90251,Best remake of the time <3,True
90252,its like 60 seconds.,True
90253,uhhh for some reason i got this game for free ...,True


In [3]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=404)

# Features come from the 'review' column, targets from the 'voted_up' column.
X_train, X_test = df_train['review'], df_test['review']
y_train, y_test = df_train['voted_up'], df_test['voted_up']

# Display the four shapes so feature/target lengths can be compared per split.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((72183,), (72183,), (18046,), (18046,))

## Processing

In [4]:
from processing import Preprocessor, DenseTransformer
from gensim.sklearn_api import D2VTransformer
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Andrew\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Unigram TF-IDF feature steps (no final estimator; one is appended later).
# The vocabulary is capped at 10,000 terms.
# DenseTransformer is a project helper; presumably it converts the sparse
# TF-IDF matrix to a dense array -- confirm in processing.py.
pipe_tf = [
    ('preprocessor', Preprocessor()),
    ('tf-idf_vectorizer', TfidfVectorizer(max_features=10000)),
    ('to_dense', DenseTransformer()),
]

In [6]:
# TF-IDF feature steps over unigrams and bigrams (ngram_range=(1, 2)), with
# the vocabulary capped at 10,000 terms.
# remove_stopwords=False is passed to the project Preprocessor; presumably
# this keeps stop words so they can take part in bigrams -- confirm.
pipe_bigrams = [
    ('preprocessor', Preprocessor(remove_stopwords=False)),
    (
        'tf-idf_vectorizer',
        TfidfVectorizer(max_features=10000, ngram_range=(1, 2)),
    ),
    ('to_dense', DenseTransformer()),
]

In [7]:
# Document-embedding feature steps: preprocess with split=True, embed each
# document with gensim's D2VTransformer, then rescale every feature to [1, 2].
# NOTE(review): the strictly positive range is presumably chosen so that
# MultinomialNB, which rejects negative features, can consume the embeddings.
# NOTE(review): the recorded run of this notebook fails on this pipeline with
# "inconsistent numbers of samples: [1, 72183]" -- check what
# Preprocessor(split=True) hands to D2VTransformer.transform.
pipe_embed = [
    ('preprocessor', Preprocessor(split=True)),
    ('document_vectorizer', D2VTransformer()),
    ('scaler', MinMaxScaler(feature_range=(1, 2))),
]

In [21]:
# Scratch diagnostic: build the first two stages of the embedding pipeline by
# hand (outside of Pipeline) so their intermediate outputs can be inspected.
p = Preprocessor(split=True)
d = D2VTransformer()
# transform() is called without a prior fit() -- presumably Preprocessor is
# stateless; confirm in processing.py.
a = p.transform(X_train)

In [24]:
# Fit the document-embedding model on the preprocessed training documents.
# y_train is passed along, though an unsupervised Doc2Vec fit presumably
# ignores it -- confirm against the gensim API.
d.fit(a, y_train)
# Progress marker: shows that fitting finished before inference starts.
print('1')
# Infer embeddings for the same documents the model was fitted on.
b = d.transform(a)

1


In [31]:
# Shape of the embedding output produced by d.transform(a).
b.shape

(1, 100)

In [30]:
# Compare the number of preprocessed documents, labels, and embedding rows;
# a downstream estimator needs the embedding row count to match the labels.
len(a), len(y_train), len(b)

(72183, 72183, 1)

In [8]:
# Feature pipelines to compare, each paired with the label that identifies it
# in the printed progress and in the metrics table.
datasets = [
    ('TF-IDF', pipe_tf),
    ('Bigrams', pipe_bigrams),
    ('Embeddings', pipe_embed),
]

## Models

In [9]:
from itertools import product
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [12]:
# Candidate classifiers, each paired with the label used when reporting
# results. Everything except the logistic-regression solver is left at the
# library defaults.
models = [
    ('Logistic Regression', LogisticRegression(solver='saga')),
    ('Multinomial Naive Bayes', MultinomialNB()),
    ('Random Forest', RandomForestClassifier()),
]

In [13]:
# Baseline entry: a DummyClassifier with no feature steps, shaped like the
# ((model_name, model), (data_name, steps)) pairs yielded by product().
dummy_pipeline = [(('Dummy', DummyClassifier()), ('none', []))]
metrics = []

# Evaluate the baseline first, then every (model, feature pipeline) pairing.
experiments = dummy_pipeline + list(product(models, datasets))

for (model_name, model), (data_name, data) in experiments:
    print(model_name, data_name)

    # `data + [...]` builds a new list, so the shared step lists are not
    # extended in place; the estimator objects themselves are still shared.
    steps = data + [('model', model)]
    pipe = Pipeline(steps).fit(X_train, y_train)

    # Pipeline.score delegates to the final estimator's score method.
    train_acc = pipe.score(X_train, y_train)
    test_acc = pipe.score(X_test, y_test)

    score = dict(
        model=model_name,
        processing=data_name,
        train_accuracy=train_acc,
        test_accuracy=test_acc,
    )
    metrics.append(score)

# One row per evaluated combination, best test accuracy first.
metrics_df = pd.DataFrame(metrics)
metrics_df.sort_values(by='test_accuracy', ascending=False)

Dummy none
Logistic Regression TF-IDF
Logistic Regression Bigrams
Logistic Regression Embeddings


ValueError: Found input variables with inconsistent numbers of samples: [1, 72183]

In [None]:
# Check that the training features and labels contain the same number of samples.
len(X_train), len(y_train)

In [None]:
# Inspect the score dictionaries appended to `metrics` by the evaluation loop.
metrics