# Generate Features from documents

In [179]:
import pandas as pd
import numpy as np
import string
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

### Read Doc Corpus & chop into TFiDF-BOWs

In [189]:
df = pd.read_csv("../data/processed/data_train_test.csv")
df_test, df_train = df[df["set"]=="test"], df[df["set"]=="train"]
train_review = df_train["review"].values
train_lbl = df_train["sentiment"].values
test_review = df_test["review"].values
test_lbl = df_test["sentiment"].values

### Naive Bayes Pipeline

In [50]:
# NAIVE-BAYES classification pipeline
pl_clf_jobs = Pipeline([('tfidf', TfidfVectorizer()),
                        ('clf', MultinomialNB()),
                       ])

# Parameter grid for hyper parameter tuning (of preprocessing)
param_grid = [{'tfidf__stop_words' : [None]}]


# create grid search
gs_clf_jobs = GridSearchCV(pl_clf_jobs,
                           param_grid=param_grid,
                           cv=10,
                           n_jobs=-1,
                           verbose=True
                          )

# run grid search
_ = gs_clf_jobs.fit(train_review, train_lbl)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    3.3s remaining:    2.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.8s finished


In [53]:
nb = _.best_estimator_

In [54]:
nb.score(test_review, test_lbl)

0.8135

# Advanced Shit following

### To be tested
- min/max tfidf
- stemming
- stopwordremoval


# Custom stemmer/tokenizer

In [64]:
# Create a custom Porter Stemmer that suits sklearn
class PortStem(object):
    def __init__(self):
        self.ps = PorterStemmer()
    def __call__(self, doc):
        return [self.ps.stem(word) for word in word_tokenize(doc)]

In [191]:
# NAIVE-BAYES classification pipeline
pl_clf_jobs = Pipeline([('tfidf', TfidfVectorizer()),
                        ('clf', MultinomialNB()),
                       ])

# Parameter grid for hyper parameter tuning (of preprocessing)
param_grid = [{'tfidf__stop_words' : [None, "english"]},
              {'tfidf__tokenizer': [None, PortStem()]}
             ]



# create grid search
gs_clf_jobs = GridSearchCV(pl_clf_jobs,
                           param_grid=param_grid,
                           cv=2,
                           n_jobs=-1,
                           verbose=True
                          )

# run grid search
_ = gs_clf_jobs.fit(train_review, train_lbl)

Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    2.3s remaining:    6.9s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   22.1s finished


In [192]:
nb = _.best_estimator_

In [193]:
nb.score(test_review, test_lbl)

0.8135

# KNN

In [175]:
# KNN classification pipeline
pl_clf_jobs = Pipeline([('tfidf', TfidfVectorizer()),
                        ('clf', KNeighborsClassifier()),
                       ])

# Parameter grid for hyper parameter tuning (of preprocessing)
param_grid = [{'tfidf__stop_words' : [None]},
              {'tfidf__tokenizer': [None, PortStem()]},
              {'clf__n_neighbors': [190, 191, 192]},
              {'clf__metric': ['cosine']}
              
             ]



# create grid search
gs_clf_jobs = GridSearchCV(pl_clf_jobs,
                           param_grid=param_grid,
                           cv=2,
                           n_jobs=-1,
                           verbose=True
                          )

# run grid search
_ = gs_clf_jobs.fit(train_review, train_lbl)

Fitting 2 folds for each of 7 candidates, totalling 14 fits


[Parallel(n_jobs=-1)]: Done  14 out of  14 | elapsed:   25.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  14 out of  14 | elapsed:   25.4s finished


In [173]:
knn = _.best_estimator_

In [164]:
knn.score(test_review, test_lbl)

0.76149999999999995

# Logistic Regression

In [180]:
# Logistic Regression classification pipeline
pl_clf_jobs_lr = Pipeline([('tfidf', TfidfVectorizer()),
                           ('clf', LogisticRegression()),
                          ])

# Parameter grid for hyper parameter tuning 
param_grid_lr = [{'tfidf__stop_words' : [None, 'english'],
                  'tfidf__tokenizer' : [None, PortStem()],
                  'clf__penalty' : ["l1", "l2"],
                  'clf__C' : [1.0, 10.0, 100.0,]
                 }]

# create grid search
gs_clf_jobs_lr = GridSearchCV(pl_clf_jobs_lr,
                              param_grid=param_grid_lr,
                              cv=2,
                              n_jobs=-1,
                              verbose=True
                              )

# run grid search
_ = gs_clf_jobs_lr.fit(train_review, train_lbl)

Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  2.5min finished


In [181]:
lr = _.best_estimator_

In [195]:
lr.score(test_review, test_lbl)

0.85199999999999998

# Where to go from here?
- combine word and genre ("greatHORROR")
- ngrams (e.g. 2-grams for "not bad", etc...)

## Loading tabular IMDB data

In [425]:
df_imdb = pd.read_csv("../data/processed/IMDB.csv", delimiter=";")

In [428]:
df_imdb.head()

Unnamed: 0,﻿response,title,year,rated,released,runtime,genre,director,writer,actors,...,awards,poster,metascore,imdbRating,imdbVotes,imdbID,type,season,episode,seriesID
0,True,The Sealed Room,1909,,02 Sep 1909,11 min,"Short, Drama, Horror",D.W. Griffith,,"Arthur V. Johnson, Marion Leonard, Henry B. Wa...",...,,http://ia.media-imdb.com/images/M/MV5BMDlmYzgw...,,61,442,tt0001032,movie,,,
1,True,Twenty Minutes of Love,1914,,20 Apr 1914,20 min,"Comedy, Short","Joseph Maddern, Charles Chaplin",,"Charles Chaplin, Minta Durfee, Edgar Kennedy, ...",...,,,,59,650,tt0004736,movie,,,
2,True,A Burlesque on Carmen,1915,,18 Dec 1915,31 min,"Short, Comedy",Charles Chaplin,Prosper Mérimée (novel),"Charles Chaplin, Edna Purviance, John Rand, Ja...",...,,,,61,492,tt0005077,movie,,,
3,True,The Pride of the Clan,1917,,07 Jan 1917,84 min,"Drama, Romance",Maurice Tourneur,"Elaine S. Carrington (scenario), Charles E. Wh...","Mary Pickford, Matt Moore, Warren Cook, Kathry...",...,,http://ia.media-imdb.com/images/M/MV5BMTMzMzY3...,,7,41,tt0008458,movie,,,
4,True,The Goat,1921,,15 May 1921,23 min,"Comedy, Short","Buster Keaton, Malcolm St. Clair","Buster Keaton, Malcolm St. Clair","Buster Keaton, Virginia Fox, Joe Roberts, Malc...",...,,http://ia.media-imdb.com/images/M/MV5BMTQzMzQ5...,,78,2213,tt0012224,movie,,,


In [388]:
docs = ["Das ist ein guter Film",
        "Der Film ist dumm",
        "Ich mag Züge"]
genres = ["Horror", "Splatter", "Hentai"]
producer = ["Tarantino", "Spielberg", "DalaiLaima"]

In [389]:
df_test = pd.DataFrame([docs, genres, producer]).T

In [390]:
df_test.columns = ["review", "genre", "producer"]

In [391]:
df_test

Unnamed: 0,review,genre,producer
0,Das ist ein guter Film,Horror,Tarantino
1,Der Film ist dumm,Splatter,Spielberg
2,Ich mag Züge,Hentai,DalaiLaima


In [410]:
def review_concatter(df, concat="genre"):
    """Append the 'concat' to each word of the review
    """
    if not "review" in df.columns:
        raise ValueError("Need to have a 'review' column")
    df_res = df.copy()
    rev_new = df.apply(
        lambda row:
        " ".join([word+row[concat] for word in row["review"].split()]),
        axis=1)
    df_res["review"] = rev_new
    return df_res

In [482]:
def col_concatter(df, column, delim=","):
    """Concat multiple values of one column
    """
    df_res = df.copy()
    col_new = df.apply(
        lambda row:
        "".join(row[column].split(delim)), axis=1)
    df_res[column] = col_new
    return df_res

In [483]:
df_test

Unnamed: 0,review,genre,producer
0,Das ist ein guter Film,Horror,Tarantino
1,Der Film ist dumm,Splatter,Spielberg
2,Ich mag Züge,Hentai,DalaiLaima


In [484]:
col_concatter(df_test, "review", delim=" ")

Unnamed: 0,review,genre,producer
0,DasisteinguterFilm,Horror,Tarantino
1,DerFilmistdumm,Splatter,Spielberg
2,IchmagZüge,Hentai,DalaiLaima


In [417]:
review_concatter(df_test, concat="genre")

Unnamed: 0,review,genre,producer
0,DasHorror istHorror einHorror guterHorror Film...,Horror,Tarantino
1,DerSplatter FilmSplatter istSplatter dummSplatter,Splatter,Spielberg
2,IchHentai magHentai ZügeHentai,Hentai,DalaiLaima


In [424]:
df_imdb.head()

Unnamed: 0,﻿response,title,year,rated,released,runtime,genre,director,writer,actors,...,awards,poster,metascore,imdbRating,imdbVotes,imdbID,type,season,episode,seriesID
0,True,The Sealed Room,1909,,02 Sep 1909,11 min,"Short, Drama, Horror",D.W. Griffith,,"Arthur V. Johnson, Marion Leonard, Henry B. Wa...",...,,http://ia.media-imdb.com/images/M/MV5BMDlmYzgw...,,61,442,tt0001032,movie,,,
1,True,Twenty Minutes of Love,1914,,20 Apr 1914,20 min,"Comedy, Short","Joseph Maddern, Charles Chaplin",,"Charles Chaplin, Minta Durfee, Edgar Kennedy, ...",...,,,,59,650,tt0004736,movie,,,
2,True,A Burlesque on Carmen,1915,,18 Dec 1915,31 min,"Short, Comedy",Charles Chaplin,Prosper Mérimée (novel),"Charles Chaplin, Edna Purviance, John Rand, Ja...",...,,,,61,492,tt0005077,movie,,,
3,True,The Pride of the Clan,1917,,07 Jan 1917,84 min,"Drama, Romance",Maurice Tourneur,"Elaine S. Carrington (scenario), Charles E. Wh...","Mary Pickford, Matt Moore, Warren Cook, Kathry...",...,,http://ia.media-imdb.com/images/M/MV5BMTMzMzY3...,,7,41,tt0008458,movie,,,
4,True,The Goat,1921,,15 May 1921,23 min,"Comedy, Short","Buster Keaton, Malcolm St. Clair","Buster Keaton, Malcolm St. Clair","Buster Keaton, Virginia Fox, Joe Roberts, Malc...",...,,http://ia.media-imdb.com/images/M/MV5BMTQzMzQ5...,,78,2213,tt0012224,movie,,,


In [442]:
df[df["id"] == 1007].iloc[1,2]

"I found it very very difficulty to watch this after the initial 5 minutes of the film. I managed to stomach 45-50 minutes before switching it off in disgust and watching Monster House instead (which, by the way, is great fun).<br /><br />The story has massive holes in it. The plot line is hugely over stated and dull, the acting is awful, especially from Justin TImberlake who should really stick to what he is good at (looking daft and singing like a castrato). Morgan Freeman looked incredibly uncomfortable, especially when made to dance around to rock music for no apparent reason half way through the film after him and Timberlake meet. Freeman and Timberlake's characters seem to be supposed to have some sort of father/son relationship of sorts or something, which simply isn't evident at all apart from the fact that; though Freeman's character seems to have nothing but contempt for the ignorant and rather stupid character of Timberlake, he never the less pulls out all the stops to help 

In [473]:
df_test = pd.read_csv("/Users/joshuagorner/Downloads/TestTrainingSentiments.csv")

CParserError: Error tokenizing data. C error: Expected 8 fields in line 4, saw 17


In [491]:
df_test = pd.read_csv(
    "/Users/joshuagorner/Downloads/TestTrainingSentiments.csv",
    delimiter=";"
)

CParserError: Error tokenizing data. C error: Expected 6 fields in line 24, saw 7


In [497]:
df_test = pd.read_csv("../data/processed/data_train_test_imdb_ids.csv")

In [500]:
df_test.head()

Unnamed: 0,id,imdb_id,rating,review,sentiment,set
0,0,tt0406816,10,I went and saw this movie last night after bei...,pos,test
1,100,tt0406816,10,The finest short I've ever seen. Some commenta...,pos,test
2,101,tt0406816,9,"This is a very, very odd film...one that is so...",pos,test
3,102,tt0406816,8,"Although Bullet In The Brain is, without quest...",pos,test
4,103,tt0406816,10,"...means ""take up and read"", which is precisel...",pos,test
