# Generate Features from documents

In [179]:
import pandas as pd
import numpy as np
import string
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

### Read Doc Corpus & chop into TFiDF-BOWs

In [189]:
df = pd.read_csv("../data/processed/data_train_test.csv")
df_test, df_train = df[df["set"]=="test"], df[df["set"]=="train"]
train_review = df_train["review"].values
train_lbl = df_train["sentiment"].values
test_review = df_test["review"].values
test_lbl = df_test["sentiment"].values

### Naive Bayes Pipeline

In [50]:
# NAIVE-BAYES classification pipeline
pl_clf_jobs = Pipeline([('tfidf', TfidfVectorizer()),
                        ('clf', MultinomialNB()),
                       ])

# Parameter grid for hyper parameter tuning (of preprocessing)
param_grid = [{'tfidf__stop_words' : [None]}]


# create grid search
gs_clf_jobs = GridSearchCV(pl_clf_jobs,
                           param_grid=param_grid,
                           cv=10,
                           n_jobs=-1,
                           verbose=True
                          )

# run grid search
_ = gs_clf_jobs.fit(train_review, train_lbl)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    3.3s remaining:    2.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.8s finished


In [53]:
nb = _.best_estimator_

In [54]:
nb.score(test_review, test_lbl)

0.8135

# Advanced Shit following

### To be tested
- min/max tfidf
- stemming
- stopwordremoval


# Custom stemmer/tokenizer

In [64]:
# Create a custom Porter Stemmer that suits sklearn
class PortStem(object):
    def __init__(self):
        self.ps = PorterStemmer()
    def __call__(self, doc):
        return [self.ps.stem(word) for word in word_tokenize(doc)]

In [191]:
# NAIVE-BAYES classification pipeline
pl_clf_jobs = Pipeline([('tfidf', TfidfVectorizer()),
                        ('clf', MultinomialNB()),
                       ])

# Parameter grid for hyper parameter tuning (of preprocessing)
param_grid = [{'tfidf__stop_words' : [None, "english"]},
              {'tfidf__tokenizer': [None, PortStem()]}
             ]



# create grid search
gs_clf_jobs = GridSearchCV(pl_clf_jobs,
                           param_grid=param_grid,
                           cv=2,
                           n_jobs=-1,
                           verbose=True
                          )

# run grid search
_ = gs_clf_jobs.fit(train_review, train_lbl)

Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    2.3s remaining:    6.9s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   22.1s finished


In [192]:
nb = _.best_estimator_

In [193]:
nb.score(test_review, test_lbl)

0.8135

# KNN

In [175]:
# KNN classification pipeline
pl_clf_jobs = Pipeline([('tfidf', TfidfVectorizer()),
                        ('clf', KNeighborsClassifier()),
                       ])

# Parameter grid for hyper parameter tuning (of preprocessing)
param_grid = [{'tfidf__stop_words' : [None]},
              {'tfidf__tokenizer': [None, PortStem()]},
              {'clf__n_neighbors': [190, 191, 192]},
              {'clf__metric': ['cosine']}
              
             ]



# create grid search
gs_clf_jobs = GridSearchCV(pl_clf_jobs,
                           param_grid=param_grid,
                           cv=2,
                           n_jobs=-1,
                           verbose=True
                          )

# run grid search
_ = gs_clf_jobs.fit(train_review, train_lbl)

Fitting 2 folds for each of 7 candidates, totalling 14 fits


[Parallel(n_jobs=-1)]: Done  14 out of  14 | elapsed:   25.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  14 out of  14 | elapsed:   25.4s finished


In [173]:
knn = _.best_estimator_

In [164]:
knn.score(test_review, test_lbl)

0.76149999999999995

# Logistic Regression

In [180]:
# Logistic Regression classification pipeline
pl_clf_jobs_lr = Pipeline([('tfidf', TfidfVectorizer()),
                           ('clf', LogisticRegression()),
                          ])

# Parameter grid for hyper parameter tuning 
param_grid_lr = [{'tfidf__stop_words' : [None, 'english'],
                  'tfidf__tokenizer' : [None, PortStem()],
                  'clf__penalty' : ["l1", "l2"],
                  'clf__C' : [1.0, 10.0, 100.0,]
                 }]

# create grid search
gs_clf_jobs_lr = GridSearchCV(pl_clf_jobs_lr,
                              param_grid=param_grid_lr,
                              cv=2,
                              n_jobs=-1,
                              verbose=True
                              )

# run grid search
_ = gs_clf_jobs_lr.fit(train_review, train_lbl)

Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  2.5min finished


In [181]:
lr = _.best_estimator_

In [195]:
lr.score(test_review, test_lbl)

0.85199999999999998

# Where to go from here?
- combine word and genre ("greatHORROR")
- ngrams (e.g. 2-grams for "not bad", etc...)

## Loading tabular IMDB to merge w/ enhanced test-train-set

In [503]:
df_imdb = pd.read_csv("../data/processed/IMDB.csv", delimiter=";")
df_test_train_en = pd.read_csv("../data/processed/data_train_test_imdb_ids.csv")

## Interesting meta information to be considered (from IMDB)
- genre
- #votings

In [508]:
df_imdb.columns

Index(['﻿response', 'title', 'year', 'rated', 'released', 'runtime', 'genre',
       'director', 'writer', 'actors', 'plot', 'language', 'country', 'awards',
       'poster', 'metascore', 'imdbRating', 'imdbVotes', 'imdbID', 'type',
       'season', 'episode', 'seriesID'],
      dtype='object')

In [511]:
df_imdb_genres = df_imdb.loc[:, ["imdbID", "genre"]]

In [552]:
test_imdb = df_imdb_genres.head()

In [515]:
test_normal_set = df_test_train_en.head()

In [556]:
col_concatter(test_imdb, column="genre", delim=", ")

Unnamed: 0,imdbID,genre
0,tt0001032,shortdramahorror
1,tt0004736,comedyshort
2,tt0005077,shortcomedy
3,tt0008458,dramaromance
4,tt0012224,comedyshort


In [528]:
test_normal_set

Unnamed: 0,id,imdb_id,rating,review,sentiment,set
0,0,tt0406816,10,I went and saw this movie last night after bei...,pos,test
1,100,tt0406816,10,The finest short I've ever seen. Some commenta...,pos,test
2,101,tt0406816,9,"This is a very, very odd film...one that is so...",pos,test
3,102,tt0406816,8,"Although Bullet In The Brain is, without quest...",pos,test
4,103,tt0406816,10,"...means ""take up and read"", which is precisel...",pos,test


In [529]:
pd.merge(test_normal_set, test_imdb)

Unnamed: 0,id,imdb_id,rating,review,sentiment,set,genre
0,0,tt0406816,10,I went and saw this movie last night after bei...,pos,test,"Action, Adventure, Drama"
1,100,tt0406816,10,The finest short I've ever seen. Some commenta...,pos,test,"Action, Adventure, Drama"
2,101,tt0406816,9,"This is a very, very odd film...one that is so...",pos,test,"Action, Adventure, Drama"
3,102,tt0406816,8,"Although Bullet In The Brain is, without quest...",pos,test,"Action, Adventure, Drama"
4,103,tt0406816,10,"...means ""take up and read"", which is precisel...",pos,test,"Action, Adventure, Drama"


In [568]:
def col_concatter(df, column, delimiter=",", uppercase=True):
    """Concat multiple values of one column
    """
    df_res = df.copy()
    if uppercase:
        col_new = df.apply(
        lambda row:
        "".join(
            sorted(map(lambda x: x.upper(), row[column].split(delimiter)))
        ),axis=1)
    else:
        col_new = df.apply(
        lambda row:
        "".join(
            row[column].split(delimiter)
        ),axis=1)
    df_res[column] = col_new
    return df_res

In [410]:
def review_concatter(df, concat="genre"):
    """Append the 'concat' to each word of the review
    """
    if not "review" in df.columns:
        raise ValueError("Need to have a 'review' column")
    df_res = df.copy()
    rev_new = df.apply(
        lambda row:
        " ".join([word+row[concat] for word in row["review"].split()]),
        axis=1)
    df_res["review"] = rev_new
    return df_res