# Generate Features from documents

In [156]:
import pandas as pd
import numpy as np
import string
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier

### Read Doc Corpus & chop into TFiDF-BOWs

In [96]:
df = pd.read_csv("../data/processed/data_train_test.csv")
df_test, df_train = df[df["set"]=="test"], df[df["set"]=="train"]
train_review = df_train["review"]
train_lbl = df_train["sentiment"]
test_review = df_test["review"]
test_lbl = df_test["sentiment"]

### Naive Bayes Pipeline

In [50]:
# NAIVE-BAYES classification pipeline
pl_clf_jobs = Pipeline([('tfidf', TfidfVectorizer()),
                        ('clf', MultinomialNB()),
                       ])

# Parameter grid for hyper parameter tuning (of preprocessing)
param_grid = [{'tfidf__stop_words' : [None]}]


# create grid search
gs_clf_jobs = GridSearchCV(pl_clf_jobs,
                           param_grid=param_grid,
                           cv=10,
                           n_jobs=-1,
                           verbose=True
                          )

# run grid search
_ = gs_clf_jobs.fit(train_review, train_lbl)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    3.3s remaining:    2.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.8s finished


In [53]:
nb = _.best_estimator_

In [54]:
nb.score(test_review, test_lbl)

0.8135

# Advanced Shit following

### To be tested
- min/max tfidf
- stemming
- tokenization
- stopwordremoval


# Custom stemmer/tokenizer

In [64]:
# Create a custom Porter Stemmer that suits sklearn
class PortStem(object):
    def __init__(self):
        self.ps = PorterStemmer()
    def __call__(self, doc):
        return [self.ps.stem(word) for word in word_tokenize(doc)]

In [154]:
class PortStemNoPunctNum(object):
    def __init__(self):
        self.ps = PorterStemmer()
    def __call__(self, doc):
        return [self.ps.stem(word)
                for word 
                in word_tokenize(
                doc.translate(
                    str.maketrans(string.punctuation + "0123456789",' '*len(string.punctuation + "0123456789"))))]

In [157]:
# NAIVE-BAYES classification pipeline
pl_clf_jobs = Pipeline([('tfidf', TfidfVectorizer()),
                        ('clf', MultinomialNB()),
                       ])

# Parameter grid for hyper parameter tuning (of preprocessing)
param_grid = [{'tfidf__stop_words' : [None, "english"]},
              {'tfidf__tokenizer': [None, PortStem(), PortStemNoPunctNum()]}
             ]



# create grid search
gs_clf_jobs = GridSearchCV(pl_clf_jobs,
                           param_grid=param_grid,
                           cv=2,
                           n_jobs=-1,
                           verbose=True
                          )

# run grid search
_ = gs_clf_jobs.fit(train_review, train_lbl)

Fitting 2 folds for each of 5 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    2.6s remaining:    1.7s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   23.6s finished


In [158]:
nb = _.best_estimator_

In [159]:
nb.score(test_review, test_lbl)

0.8135

# KNN

In [162]:
# KNN classification pipeline
pl_clf_jobs = Pipeline([('tfidf', TfidfVectorizer()),
                        ('clf', KNeighborsClassifier()),
                       ])

# Parameter grid for hyper parameter tuning (of preprocessing)
param_grid = [{'tfidf__stop_words' : [None]},
              {'tfidf__tokenizer': [None, PortStem(), PortStemNoPunctNum()]},
              {'clf__n_neighbors': [190, 191, 192]},
              {'clf__metric': ['cosine']}
              
             ]



# create grid search
gs_clf_jobs = GridSearchCV(pl_clf_jobs,
                           param_grid=param_grid,
                           cv=2,
                           n_jobs=-1,
                           verbose=True
                          )

# run grid search
_ = gs_clf_jobs.fit(train_review, train_lbl)

Fitting 2 folds for each of 8 candidates, totalling 16 fits


[Parallel(n_jobs=-1)]: Done  16 out of  16 | elapsed:   28.6s finished


In [163]:
knn = _.best_estimator_

In [164]:
knn.score(test_review, test_lbl)

0.76149999999999995

# Decision Tree
