In [131]:
import os
import re
import gc
from functools import reduce

import numpy as np
import nltk
import pandas as pd
import matplotlib.pyplot as plt

from nltk.tokenize.regexp import RegexpTokenizer
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer, WordNetLemmatizer
from tqdm import tqdm
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB, ComplementNB
from sklearn.metrics import roc_auc_score, classification_report, make_scorer

In [76]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/vagrant/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [2]:
def read_files(dir_path, label):
    raw = []
    paths = os.listdir(dir_path)
    num_paths = len(paths)
    for i in tqdm(range(num_paths)):
        with open(os.path.join(dir_path, paths[i]), 'r') as file:
            raw.append(file.read())
    
    return pd.DataFrame(list(zip(raw, [label] * len(raw))), columns=['text', 'label'])

In [61]:
pos_train_df = read_files('./aclImdb/train/pos', 1)
neg_train_df = read_files('./aclImdb/train/neg', 0)

pos_test_df = read_files('./aclImdb/test/pos', 1)
neg_test_df = read_files('./aclImdb/test/neg', 0)

train_df = pd.concat([pos_train_df, neg_train_df])
test_df = pd.concat([pos_test_df, neg_test_df])

train_df.index = pd.RangeIndex(25000)
test_df.index = pd.RangeIndex(25000)

100%|██████████| 12500/12500 [00:17<00:00, 728.42it/s]
100%|██████████| 12500/12500 [00:16<00:00, 758.97it/s]
100%|██████████| 12500/12500 [00:16<00:00, 771.78it/s]
100%|██████████| 12500/12500 [00:16<00:00, 750.58it/s]


In [32]:
del pos_train_df, neg_train_df, pos_test_df, neg_test_df
gc.collect()

46

In [113]:
def evaluate_model(X_train, y_train, X_test, y_test, model):
    model.fit(X_train, y_train)
    predictions = model.predict_proba(X_test)[:, 1]
    print(classification_report(y_test, predictions > 0.5))
    print(roc_auc_score(y_test, predictions))


def apply_bag_of_words(train, test, max_features=500):
    bag_of_words = CountVectorizer(input='content', lowercase=True, token_pattern=regexp, analyzer='word', stop_words='english', max_features=max_features)
    return bag_of_words.fit_transform(train), bag_of_words.transform(test)

# Naive Bayes + Bag of words

In [114]:
bag_train, bag_test = apply_bag_of_words(train_df['text'], test_df['text'])
model = ComplementNB(alpha=2)
evaluate_model(bag_train, train_df['label'], bag_test, test_df['label'], model)

              precision    recall  f1-score   support

           0       0.82      0.80      0.81     12500
           1       0.81      0.82      0.81     12500

   micro avg       0.81      0.81      0.81     25000
   macro avg       0.81      0.81      0.81     25000
weighted avg       0.81      0.81      0.81     25000

0.8849618048


### Let's add stemming

In [115]:
stemmer = WordNetLemmatizer()

stemed_train = train_df['text'].apply(lambda s: stemmer.lemmatize(s))
stemed_test = test_df['text'].apply(lambda s: stemmer.lemmatize(s))

In [116]:
bag_train, bag_test = apply_bag_of_words(stemed_train, stemed_test)
model = ComplementNB(alpha=2)
evaluate_model(bag_train, train_df['label'], bag_test, test_df['label'], model)

              precision    recall  f1-score   support

           0       0.82      0.80      0.81     12500
           1       0.81      0.82      0.81     12500

   micro avg       0.81      0.81      0.81     25000
   macro avg       0.81      0.81      0.81     25000
weighted avg       0.81      0.81      0.81     25000

0.8849618048


## Logistic Regression + Bag of words

In [118]:
bag_train, bag_test = apply_bag_of_words(train_df['text'], test_df['text'])
model = LogisticRegression()
evaluate_model(bag_train, train_df['label'], bag_test, test_df['label'], model)



              precision    recall  f1-score   support

           0       0.85      0.83      0.84     12500
           1       0.83      0.85      0.84     12500

   micro avg       0.84      0.84      0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000

0.9125315519999999


Let's use GridSearch to improve the result

In [125]:
params = {
    'max_iter': [50, 100, 150, 200],
    'solver': ['saga', 'sag', 'newton-cg', 'liblinear', 'lbfgs'],
    'C': [0.5, 0.75, 1],
    'penalty': ['l2']
}

In [132]:
grid = GridSearchCV(model, params, scoring=make_scorer(roc_auc_score))
grid.fit(bag_train, train_df['label'])







GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_iter': [50, 100, 150, 200], 'solver': ['saga', 'sag', 'newton-cg', 'liblinear', 'lbfgs'], 'C': [0.5, 0.75, 1], 'penalty': ['l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score), verbose=0)

In [134]:
grid.best_estimator_

LogisticRegression(C=0.75, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=50, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)

In [133]:
evaluate_model(bag_train, train_df['label'], bag_test, test_df['label'], grid.best_estimator_)

              precision    recall  f1-score   support

           0       0.85      0.82      0.84     12500
           1       0.83      0.85      0.84     12500

   micro avg       0.84      0.84      0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000

0.9130456767999999
