In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot
import glob
import requests
import tarfile
import tqdm
import os

## From: Python Machine Learning by Raschka and Mirjalili

---

## Get the data

In [2]:
# Locations of the remote archive and of the local artefacts derived from it.
imdb_url = 'http://ai.stanford.edu/~amaas/data/sentiment/'
imdb_tar = 'aclImdb_v1.tar.gz'
data_base = '../data/'
imdb_base = 'aclImdb'
imdb_csv = 'acl_imdb_data.csv'

tar_path = os.path.join(data_base, imdb_tar)
corpus_dir = os.path.join(data_base, imdb_base)
csv_path = os.path.join(data_base, imdb_csv)

# Step 1: download the archive unless it is already present.
if not os.path.exists(tar_path):
    os.makedirs(data_base, exist_ok=True)
    req = requests.get(imdb_url + imdb_tar)
    # Fail loudly instead of saving an HTTP error page as the archive.
    req.raise_for_status()
    with open(tar_path, 'wb') as archive:
        archive.write(req.content)

# Step 2: unpack the archive unless the extracted directory already exists.
if not os.path.exists(corpus_dir):
    # NOTE(review): extractall trusts the member paths stored in the archive;
    # consider an extraction filter if the download source is not trusted.
    with tarfile.open(tar_path, 'r:gz') as tar:
        tar.extractall(path=data_base)

# Step 3: collect every review into one shuffled CSV unless it already exists.
if not os.path.exists(csv_path):
    labels = {'pos': 1, 'neg': 0}
    # Accumulate plain tuples and build the frame once; growing a DataFrame
    # row by row is quadratic and DataFrame.append no longer exists in
    # current pandas.
    rows = []
    with tqdm.tqdm(total=50000) as pbar:
        for s in ('test', 'train'):
            for l in ('pos', 'neg'):
                path = os.path.join(data_base, imdb_base, s, l)
                for file in sorted(os.listdir(path)):
                    with open(os.path.join(path, file),
                              'r', encoding='utf-8') as infile:
                        txt = infile.read()
                    rows.append((txt, labels[l]))
                    pbar.update()
    df = pd.DataFrame(rows, columns=['review', 'sentiment'])

    # Shuffle with a fixed seed so the saved row order is reproducible.
    np.random.seed(0)
    df = df.reindex(np.random.permutation(df.index))
    df.to_csv(csv_path, index=False, encoding='utf-8')

In [3]:
# Load the combined review/sentiment table from the CSV on disk and
# print a summary of its columns.
csv_file = '../data/acl_imdb_data.csv'
df = pd.read_csv(csv_file, encoding='utf-8')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
review       50000 non-null object
sentiment    50000 non-null int64
dtypes: int64(1), object(1)
memory usage: 781.4+ KB


In [4]:
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


---

## Transform Data

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words demo: learn a vocabulary from only the last three reviews.
last_reviews = df['review'].iloc[-3:]
countvect = CountVectorizer()
X = countvect.fit_transform(last_reviews)

In [6]:
df.loc[0,'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [7]:
countvect.vocabulary_

{'don': 40,
 'even': 44,
 'know': 79,
 'where': 157,
 'to': 143,
 'begin': 18,
 'on': 102,
 'this': 142,
 'one': 103,
 'it': 74,
 'all': 8,
 'about': 0,
 'the': 139,
 'family': 50,
 'that': 138,
 'has': 59,
 'be': 15,
 'worst': 162,
 'line': 86,
 'of': 101,
 'dialogue': 34,
 'ever': 45,
 'heard': 62,
 'in': 71,
 'horror': 67,
 'movie': 96,
 'although': 10,
 'couldn': 30,
 'if': 70,
 'tried': 146,
 'ugh': 148,
 'and': 12,
 'owen': 106,
 'wilson': 161,
 'is': 73,
 'better': 19,
 'actor': 4,
 'he': 61,
 'needs': 98,
 'stop': 131,
 'playing': 108,
 'token': 144,
 'guy': 56,
 'who': 158,
 'dies': 36,
 'every': 46,
 'action': 3,
 'anaconda': 11,
 'armageddon': 14,
 'after': 6,
 'man': 91,
 'did': 35,
 'co': 28,
 'write': 163,
 'bottle': 21,
 'rocket': 116,
 'rushmore': 118,
 'does': 38,
 'have': 60,
 'some': 125,
 'talent': 135,
 'also': 9,
 'lily': 85,
 'taylor': 136,
 'should': 123,
 'stick': 129,
 'indie': 72,
 'films': 51,
 'she': 122,
 'no': 99,
 'place': 107,
 'here': 64,
 'finally': 5

In [8]:
import re
def preprocessor(text):
    """Clean a raw review string while preserving its emoticons.

    Markup tags are removed, the text is lower-cased, and every run of
    non-word characters is collapsed to a single space. Emoticons found in
    the text are appended at the end (with any '-' nose removed), separated
    from the preceding words by a space so they remain distinct tokens.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text, followed by the space-separated emoticons if any
        were found.
    """
    # Remove anything that looks like a markup tag, e.g. '<br />'.
    text = re.sub(r'<[^>]*>', '', text)
    # Capture emoticons before punctuation is stripped: eyes (: ; =), an
    # optional nose (-), then a mouth ( ) ( D P ). See https://regex101.com/
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    if not emoticons:
        return cleaned
    faces = ' '.join(emoticons).replace('-', '')
    # Without a separator the first emoticon would fuse with the last word
    # whenever the cleaned text ends in a word character.
    separator = '' if cleaned.endswith(' ') else ' '
    return cleaned + separator + faces

In [9]:
preprocessor(df.loc[0,'review'][-50:])

'is seven title brazil not available'

In [10]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [11]:
#df['review'] = df.review.apply(preprocessor)

In [12]:
def tokenizer(text):
    """Break ``text`` into a list of tokens on runs of whitespace."""
    tokens = text.split()
    return tokens

In [13]:
tokenizer(preprocessor(df.loc[1,'review'][:52]))

['ok', 'so', 'i', 'really', 'like', 'kris', 'kristofferson', 'and', 'his']

In [14]:
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()
def tokenizer_porter(text):
    """Split ``text`` on whitespace and stem each token with ``porter``.

    Returns the stems as a list, in the order the tokens appear.
    """
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

In [15]:
tokenizer_porter(preprocessor(df.loc[1,'review'][:52]))

['ok', 'so', 'i', 'realli', 'like', 'kri', 'kristofferson', 'and', 'hi']

In [16]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/bgibson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [18]:
[w for w in tokenizer_porter(preprocessor(df.loc[1,'review'][:52])) if w not in stop]

['ok', 'realli', 'like', 'kri', 'kristofferson', 'hi']

---

## Train Model

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Hold out a test split using scikit-learn's default proportions. Fixing
# random_state makes the split -- and every score computed from it --
# repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df.review, df.sentiment, random_state=0)

In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# The reviews in df are still raw (markup tags, mixed case) because the
# notebook never applies preprocessor to the column, so the cleaning is done
# inside the vectorizer. Running it in the pipeline also means the same
# cleaning is applied automatically to any text passed to predict/score.
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=preprocessor)

# TF-IDF features feeding a logistic-regression classifier; the step names
# 'vect' and 'clf' are the prefixes used when setting parameters by name.
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0, solver='liblinear'))])

In [None]:
# Settings common to both halves of the search.
shared_grid = {
    'vect__ngram_range': [(1,1)],
    'vect__stop_words': [stop,None],
    'vect__tokenizer': [tokenizer,tokenizer_porter],
    'clf__penalty': ['l1','l2'],
    'clf__C': [1.0,10.0,100.0],
}

# First grid: the vectorizer's own idf/norm settings.
# Second grid: the same search with idf weighting and normalisation disabled.
param_grid = [
    dict(shared_grid),
    dict(shared_grid, vect__use_idf=[False], vect__norm=[None]),
]

# 5-fold cross-validated search scored on accuracy, using all available cores.
gs_lr_tfidf = GridSearchCV(
    lr_tfidf,
    param_grid,
    scoring='accuracy',
    cv=5,
    verbose=2,
    n_jobs=-1,
)
gs_lr_tfidf.fit(X_train,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  8.2min


In [None]:
# Report the winning parameter combination and its cross-validated accuracy.
print(f'Best parameter set: {gs_lr_tfidf.best_params_}')
# The format spec belongs inside the braces; outside it is printed literally.
print(f'CV Accuracy: {gs_lr_tfidf.best_score_:0.3f}')

In [None]:
# Score the best pipeline found by the grid search on the held-out test split.
clf = gs_lr_tfidf.best_estimator_
# The f prefix is required; without it the template text itself is printed.
print(f'Test Accuracy: {clf.score(X_test, y_test):0.3f}')