In [1]:
from sklearn.datasets import load_files
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the IMDb reviews; load_files turns each sub-folder of the directory into a class.
# Listing the two labelled classes explicitly means any additional sub-folder under
# aclImdb/train or aclImdb/test (e.g. an unlabelled 'unsup' directory) is ignored
# instead of silently becoming a third class.
REVIEW_CLASSES = ['neg', 'pos']
reviews_train = load_files('aclImdb/train', categories=REVIEW_CLASSES)
reviews_test = load_files('aclImdb/test', categories=REVIEW_CLASSES)

In [3]:
# Split the loaded bunch into the raw documents and their labels.
text_train = reviews_train.data
y_train = reviews_train.target

# Quick look: container type, corpus size, and one sample review.
for item in (type(text_train), len(text_train), text_train[1]):
    print(item)

<class 'list'>
25000
b'Words can\'t describe how bad this movie is. I can\'t explain it by writing only. You have too see it for yourself to get at grip of how horrible a movie really can be. Not that I recommend you to do that. There are so many clich\xc3\xa9s, mistakes (and all other negative things you can imagine) here that will just make you cry. To start with the technical first, there are a LOT of mistakes regarding the airplane. I won\'t list them here, but just mention the coloring of the plane. They didn\'t even manage to show an airliner in the colors of a fictional airline, but instead used a 747 painted in the original Boeing livery. Very bad. The plot is stupid and has been done many times before, only much, much better. There are so many ridiculous moments here that i lost count of it really early. Also, I was on the bad guys\' side all the time in the movie, because the good guys were so stupid. "Executive Decision" should without a doubt be you\'re choice over this one

In [4]:
# The reviews contain literal HTML line breaks; swap them for spaces before tokenizing.
# The patterns are bytes literals because the documents are replaced byte-wise.
text_train = [review.replace(b'<br />', b' ') for review in text_train]

# Balanced dataset
class_counts = np.bincount(y_train)
print('Number of pos/neg reviews:', class_counts)

Number of pos/neg reviews: [12500 12500]


In [5]:
# Unpack the held-out split into documents and labels.
text_test = reviews_test.data
y_test = reviews_test.target
print(len(text_test))
print('Number of pos/neg reviews (test):', np.bincount(y_test))

# Replace literal HTML line breaks with spaces, as done for the training reviews.
text_test = [review.replace(b'<br />', b' ') for review in text_test]

25000
Number of pos/neg reviews (test): [12500 12500]


Bag of words is a text representation strategy that discards the structure of the text (such as word order).

It only counts how often each word appears in each text of the corpus (dataset).

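1. Tokenize: split each document into words (tokens)
2. Build vocabulary: collect every token that appears in the corpus and assign each one an index
3. Encode: for each document, count how often each vocabulary word appears in it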

In [6]:
# Example bag of words
bards_words = [
    "The fool doth think he is wise,",
    "but the wise man knows himself to be a fool",
]

# Fitting tokenizes the two sentences and builds the vocabulary
vect = CountVectorizer().fit(bards_words)
vocabulary = vect.vocabulary_
print('Vocabulary size:', len(vocabulary))
print('Vocab content:', vocabulary)

Vocabulary size: 13
Vocab content: {'the': 9, 'fool': 3, 'doth': 2, 'think': 10, 'he': 4, 'is': 6, 'wise': 12, 'but': 1, 'man': 8, 'knows': 7, 'himself': 5, 'to': 11, 'be': 0}


In [7]:
# Turn both sentences into rows of token counts over the vocabulary fitted above.
bag_of_words = vect.transform(bards_words) # encode to bag of words representation
# Convert to a dense array for display; last expression, so it is the cell's output.
bag_of_words.toarray()

array([[0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1],
       [1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1]])

In [8]:
# Bag of words imdb
vect = CountVectorizer()
# Tokenize, build the vocabulary and encode in one pass over the corpus.
# A separate fit() followed by transform() would tokenize every review twice
# and produce the same matrix.
X_train = vect.fit_transform(text_train)
print(repr(X_train))

<25000x74849 sparse matrix of type '<class 'numpy.int64'>'
	with 3431196 stored elements in Compressed Sparse Row format>


In [9]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists; it returns a NumPy array, so convert
# to a plain list of str to keep the printed slices in list form.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out().tolist()
else:  # older scikit-learn without the *_out API
    feature_names = vect.get_feature_names()

print('Number of features: {}\n'.format(len(feature_names)))
print('First 20 features: {}\n'.format(feature_names[:20]))
print('20010 to 20030 features: {}\n'.format(feature_names[20010:20030]))
print('Every 2000th feature: {}'.format(feature_names[::2000]))

Number of features: 74849

First 20 features: ['00', '000', '0000000000001', '00001', '00015', '000s', '001', '003830', '006', '007', '0079', '0080', '0083', '0093638', '00am', '00pm', '00s', '01', '01pm', '02']

20010 to 20030 features: ['dratted', 'draub', 'draught', 'draughts', 'draughtswoman', 'draw', 'drawback', 'drawbacks', 'drawer', 'drawers', 'drawing', 'drawings', 'drawl', 'drawled', 'drawling', 'drawn', 'draws', 'draza', 'dre', 'drea']

Every 2000th feature: ['00', 'aesir', 'aquarian', 'barking', 'blustering', 'bête', 'chicanery', 'condensing', 'cunning', 'detox', 'draper', 'enshrined', 'favorit', 'freezer', 'goldman', 'hasan', 'huitieme', 'intelligible', 'kantrowitz', 'lawful', 'maars', 'megalunged', 'mostey', 'norrland', 'padilla', 'pincher', 'promisingly', 'receptionist', 'rivals', 'schnaas', 'shunning', 'sparse', 'subset', 'temptations', 'treatises', 'unproven', 'walkman', 'xylophonist']


In [10]:
# 5-fold cross-validated accuracy of logistic regression on the bag-of-words matrix.
logreg = LogisticRegression(solver='lbfgs', max_iter=100000)
scores = cross_val_score(logreg, X_train, y_train, cv=5)
print('Mean CV accuracy:', format(np.mean(scores), '.2f'))

Mean CV accuracy: 0.88


In [11]:
# Encode the test reviews with the vocabulary fitted on the training reviews.
X_test = vect.transform(text_test)

# Train on the full training matrix, then report accuracy on the test matrix.
lr = LogisticRegression(solver='lbfgs', max_iter=100000)
lr.fit(X_train, y_train)
test_accuracy = lr.score(X_test, y_test)
print('Test score:', format(test_accuracy, '.2f'))

Test score: 0.87


In [12]:
# Tune C
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}

# Try every candidate C with 5-fold cross-validation on the training matrix.
base_model = LogisticRegression(solver='lbfgs', max_iter=100000)
grid = GridSearchCV(base_model, param_grid, cv=5)
grid.fit(X_train, y_train)

print('Best CV score:', format(grid.best_score_, '.2f'))
print('Best parameters:', grid.best_params_)

Best CV score: 0.89
Best parameters: {'C': 0.1}


In [13]:
# Encode the test reviews with the current vectorizer and score the fitted grid search on them.
X_test = vect.transform(text_test)
tuned_test_accuracy = grid.score(X_test, y_test)
print('Test score:', format(tuned_test_accuracy, '.2f'))

Test score: 0.88


In [14]:
# Set minimum number of documents token needs to appear in
vect = CountVectorizer(min_df=5)

# Tokenize, build vocab and encode in one pass; a separate fit() followed by
# transform() would tokenize the corpus twice and produce the same matrix.
X_train = vect.fit_transform(text_train)
print(repr(X_train))

# Reduced number of features
# NOTE: any X_test encoded with the previous vectorizer no longer matches this
# vocabulary; re-run vect.transform(text_test) before scoring a model fitted on this X_train.

<25000x27271 sparse matrix of type '<class 'numpy.int64'>'
	with 3354014 stored elements in Compressed Sparse Row format>


In [15]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists; it returns a NumPy array, so convert
# to a plain list of str to keep the printed slices in list form.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out().tolist()
else:  # older scikit-learn without the *_out API
    feature_names = vect.get_feature_names()

print('Number of features: {}\n'.format(len(feature_names)))
print('First 20 features: {}\n'.format(feature_names[:20]))
print('20010 to 20030 features: {}\n'.format(feature_names[20010:20030]))
print('Every 1000th feature: {}'.format(feature_names[::1000]))

Number of features: 27271

First 20 features: ['00', '000', '007', '00s', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '100', '1000', '100th', '101', '102', '103']

20010 to 20030 features: ['repentance', 'repercussions', 'repertoire', 'repetition', 'repetitions', 'repetitious', 'repetitive', 'rephrase', 'replace', 'replaced', 'replacement', 'replaces', 'replacing', 'replay', 'replayable', 'replayed', 'replaying', 'replays', 'replete', 'replica']

Every 1000th feature: ['00', 'alternatively', 'baked', 'bothersome', 'centipede', 'complicity', 'cutlery', 'disgraceful', 'elton', 'fatal', 'gaining', 'hamburgers', 'ideals', 'ivory', 'leering', 'martin', 'moxy', 'opportunist', 'picasso', 'prudish', 'repartee', 'sas', 'silvers', 'standup', 'talkative', 'trend', 'verisimilitude', 'wreaking']


In [16]:
# Repeat the C search (same param_grid, 5 folds) on the current X_train.
estimator = LogisticRegression(solver='lbfgs', max_iter=100000)
grid = GridSearchCV(estimator, param_grid, cv=5).fit(X_train, y_train)

print('Best CV score:', format(grid.best_score_, '.2f'))
print('Best parameters:', grid.best_params_)

# Fewer features didn't improve accuracy

Best CV score: 0.89
Best parameters: {'C': 0.1}
