# Generate Features from documents

In [770]:
import pandas as pd
import numpy as np
import string
import re
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

### Read Doc Corpus & chop into TFiDF-BOWs

In [189]:
df = pd.read_csv("../data/processed/data_train_test.csv")
df_test, df_train = df[df["set"]=="test"], df[df["set"]=="train"]
train_review = df_train["review"].values
train_lbl = df_train["sentiment"].values
test_review = df_test["review"].values
test_lbl = df_test["sentiment"].values

### Naive Bayes Pipeline

In [50]:
# NAIVE-BAYES classification pipeline
pl_clf_jobs = Pipeline([('tfidf', TfidfVectorizer()),
                        ('clf', MultinomialNB()),
                       ])

# Parameter grid for hyper parameter tuning (of preprocessing)
param_grid = [{'tfidf__stop_words' : [None]}]


# create grid search
gs_clf_jobs = GridSearchCV(pl_clf_jobs,
                           param_grid=param_grid,
                           cv=10,
                           n_jobs=-1,
                           verbose=True
                          )

# run grid search
_ = gs_clf_jobs.fit(train_review, train_lbl)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    3.3s remaining:    2.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.8s finished


In [53]:
nb = _.best_estimator_

In [54]:
nb.score(test_review, test_lbl)

0.8135

# Advanced Shit following

### To be tested
- min/max tfidf
- stemming
- stopwordremoval


# Custom stemmer/tokenizer

In [64]:
# Create a custom Porter Stemmer that suits sklearn
class PortStem(object):
    def __init__(self):
        self.ps = PorterStemmer()
    def __call__(self, doc):
        return [self.ps.stem(word) for word in word_tokenize(doc)]

In [191]:
# NAIVE-BAYES classification pipeline
pl_clf_jobs = Pipeline([('tfidf', TfidfVectorizer()),
                        ('clf', MultinomialNB()),
                       ])

# Parameter grid for hyper parameter tuning (of preprocessing)
param_grid = [{'tfidf__stop_words' : [None, "english"]},
              {'tfidf__tokenizer': [None, PortStem()]}
             ]



# create grid search
gs_clf_jobs = GridSearchCV(pl_clf_jobs,
                           param_grid=param_grid,
                           cv=2,
                           n_jobs=-1,
                           verbose=True
                          )

# run grid search
_ = gs_clf_jobs.fit(train_review, train_lbl)

Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    2.3s remaining:    6.9s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   22.1s finished


In [192]:
nb = _.best_estimator_

In [193]:
nb.score(test_review, test_lbl)

0.8135

# KNN

In [175]:
# KNN classification pipeline
pl_clf_jobs = Pipeline([('tfidf', TfidfVectorizer()),
                        ('clf', KNeighborsClassifier()),
                       ])

# Parameter grid for hyper parameter tuning (of preprocessing)
param_grid = [{'tfidf__stop_words' : [None]},
              {'tfidf__tokenizer': [None, PortStem()]},
              {'clf__n_neighbors': [190, 191, 192]},
              {'clf__metric': ['cosine']}
              
             ]



# create grid search
gs_clf_jobs = GridSearchCV(pl_clf_jobs,
                           param_grid=param_grid,
                           cv=2,
                           n_jobs=-1,
                           verbose=True
                          )

# run grid search
_ = gs_clf_jobs.fit(train_review, train_lbl)

Fitting 2 folds for each of 7 candidates, totalling 14 fits


[Parallel(n_jobs=-1)]: Done  14 out of  14 | elapsed:   25.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  14 out of  14 | elapsed:   25.4s finished


In [173]:
knn = _.best_estimator_

In [164]:
knn.score(test_review, test_lbl)

0.76149999999999995

# Logistic Regression

In [180]:
# Logistic Regression classification pipeline
pl_clf_jobs_lr = Pipeline([('tfidf', TfidfVectorizer()),
                           ('clf', LogisticRegression()),
                          ])

# Parameter grid for hyper parameter tuning 
param_grid_lr = [{'tfidf__stop_words' : [None, 'english'],
                  'tfidf__tokenizer' : [None, PortStem()],
                  'clf__penalty' : ["l1", "l2"],
                  'clf__C' : [1.0, 10.0, 100.0,]
                 }]

# create grid search
gs_clf_jobs_lr = GridSearchCV(pl_clf_jobs_lr,
                              param_grid=param_grid_lr,
                              cv=2,
                              n_jobs=-1,
                              verbose=True
                              )

# run grid search
_ = gs_clf_jobs_lr.fit(train_review, train_lbl)

Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  2.5min finished


In [181]:
lr = _.best_estimator_

In [195]:
lr.score(test_review, test_lbl)

0.85199999999999998

In [781]:
lr.named_steps["clf"].get_params()

{'C': 10.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'max_iter': 100,
 'multi_class': 'ovr',
 'n_jobs': 1,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

# Where to go from here?
- combine word and genre ("greatHORROR") >> get rid of sklearn pipeline to insert the append at the right position
- ngrams (e.g. 2-grams for "not bad", etc...)

## Loading tabular IMDB to merge w/ enhanced test-train-set

In [581]:
df_imdb = pd.read_csv("../data/processed/IMDB.csv", delimiter=";")
df_test_train_en = pd.read_csv("../data/processed/data_train_test_imdb_ids.csv")

## Interesting meta information to be considered (from IMDB)
- genre
- #votings

# Vorgehen
1. preprocessing
2. stop word removal
3. stemming + tokenization
4. concatting with (e.g) genre

In [774]:
# retrieve relevant cols
df_imdb_genres = df_imdb.loc[:, ["imdbID", "genre"]]
# concat genres
df_imdb_genres = col_concatter(df_imdb_genres, column="genre", delimiter=", ")
# rename id col for merging
df_imdb_genres.columns = ["imdb_id", "genre"]
# merge that shit
df_imdb_merged = pd.merge(df_test_train_en, df_imdb_genres, on="imdb_id")
# cleaned reviews
df_imdb_merged_pp = port_stemming(
    stop_word_removal(stopwords.words("english"), 
                      df_imdb_merged
    )
)
# combined review and genre
df_imdb_merged_pp_concat = review_concatter(df_imdb_merged_pp)

In [804]:
# train test split
df_train = \
df_imdb_merged_pp_concat[df_imdb_merged_pp_concat["set"] == "train"]
df_test = \
df_imdb_merged_pp_concat[df_imdb_merged_pp_concat["set"] == "test"]
train_review = df_train["review"].values
train_lbl = df_train["sentiment"].values
test_review = df_test["review"].values
test_lbl = df_test["sentiment"].values

In [812]:
nb = MultinomialNB()

tfidf = TfidfVectorizer()
bow_train = tfidf.fit_transform(train_review)

nb.fit(bow_train, train_lbl)

bow_test = tfidf.transform(test_review)
nb.score(bow_test, test_lbl)

0.5

In [693]:
def preprocessor(text, lowercase=True):
    """Remove HTML-tags and digits
    """
    text = re.sub('<[^>]*>', '', text) # get rid of html tags
    text = re.sub(r'[^a-zA-Z\s]', '', text) # remove any non char
    text = re.sub(' +', ' ', text)
    if lowercase:
        text = text.lower()
    return text

In [760]:
def tokenizing(text):
    """Tokenizing word by word
    """
    return text.split()

In [759]:
def stop_word_removal(sw_list, df, column="review"):
    """Stopword removal including preprocessing
    """
    if not "review" in df.columns:
        raise ValueError("Need to have a 'review' column") 
    df_res = df.copy()  
    rev_new = df.apply(
        lambda row: 
        " ".join([word for word
                  in tokenizing(preprocessor(df[column][0]))
                  if word not in sw_list]), 
        axis=1)
    df_res[column] = rev_new
    return df_res

In [648]:
def port_stemming(df, column="review"):
    """Porter stemming for review column
    """
    if not "review" in df.columns:
        raise ValueError("Need to have a 'review' column")
    ps = PorterStemmer()
    df_res = df.copy()
    rev_new = df.apply(
        lambda row:
        " ".join([ps.stem(word) for word in row["review"].split()]),
        axis=1)
    df_res[column] = rev_new
    return df_res

In [410]:
def review_concatter(df, concat="genre"):
    """Append the 'concat' to each word of the review
    """
    if not "review" in df.columns:
        raise ValueError("Need to have a 'review' column")
    df_res = df.copy()
    rev_new = df.apply(
        lambda row:
        " ".join([word+row[concat] for word in row["review"].split()]),
        axis=1)
    df_res["review"] = rev_new
    return df_res

In [616]:
def col_concatter(df, column, delimiter=",", uppercase=True):
    """Concat multiple values of one column
    """
    df_res = df.copy()
    if uppercase:
        col_new = df.apply(
        lambda row: "" if type(row[column]) != str else \
        "".join(
            sorted(map(lambda x: x.upper(), row[column].split(delimiter)))
        ),axis=1)
    else:
        col_new = df.apply(
        lambda row:
        "".join(
            row[column].split(delimiter)
        ),axis=1)
    df_res[column] = col_new
    return df_res