<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-train-data" data-toc-modified-id="Load-train-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load train data</a></span></li><li><span><a href="#Data-cleaning" data-toc-modified-id="Data-cleaning-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data cleaning</a></span></li><li><span><a href="#Load-test-data" data-toc-modified-id="Load-test-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Load test data</a></span></li><li><span><a href="#Bags-of-words" data-toc-modified-id="Bags-of-words-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Bags of words</a></span><ul class="toc-item"><li><span><a href="#Bag-of-Words-for-Movie-Reviews" data-toc-modified-id="Bag-of-Words-for-Movie-Reviews-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Bag-of-Words for Movie Reviews</a></span></li></ul></li><li><span><a href="#Logistic-regression" data-toc-modified-id="Logistic-regression-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Logistic regression</a></span><ul class="toc-item"><li><span><a href="#Model-performance" data-toc-modified-id="Model-performance-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Model performance</a></span></li></ul></li><li><span><a href="#Model-optimization" data-toc-modified-id="Model-optimization-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Model optimization</a></span></li></ul></div>

# Load train data

In [None]:
from sklearn.datasets import load_files

# Read the training reviews from disk. load_files returns a bunch that holds
# the documents (.data) and their labels (.target).
reviews_train = load_files("data/aclImdb/train/")
text_train = reviews_train.data
y_train = reviews_train.target

# Quick sanity check: container type, number of documents, and one sample review.
print("type of text_train: {}".format(type(text_train)))
print("length of text_train: {}".format(len(text_train)))
print("text_train[1]:\n{}".format(text_train[1]))

# Data cleaning

In [None]:
# Replace the HTML line-break markup in every training review with a single space.
text_train = [review.replace(b"<br />", b" ") for review in text_train]

# Load test data

In [None]:
# np.bincount is used below; import numpy in this cell so it runs on a fresh
# kernel instead of relying on leftover kernel state.
import numpy as np

# Read the test reviews from disk; documents and labels come from the
# returned bunch's .data and .target attributes.
reviews_test = load_files("data/aclImdb/test/")
text_test, y_test = reviews_test.data, reviews_test.target
print("Number of documents in test data: {}".format(len(text_test)))
# bincount tallies how many documents carry each integer class label.
print("Samples per class (test): {}".format(np.bincount(y_test)))
# Replace the HTML line-break markup in every test review with a single space.
text_test = [doc.replace(b"<br />", b" ") for doc in text_test]

# Bags of words

In [None]:
# Two-sentence toy corpus used to illustrate the bag-of-words representation.
bards_words = [
    "The fool doth think he is wise,",
    "but the wise man knows himself to be a fool",
]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Create a CountVectorizer with default settings and fit it on the toy corpus.
vect = CountVectorizer()
# Left as the cell's final bare expression so the notebook displays its value.
vect.fit(bards_words)

In [None]:
# Report how many distinct tokens were learned and show the fitted vocabulary_ mapping.
print(
    "Vocabulary size: {}".format(len(vect.vocabulary_)),
    "Vocabulary content:\n {}".format(vect.vocabulary_),
    sep="\n",
)

In [None]:
# To create the bag-of-words representation of the toy corpus, call the
# fitted vectorizer's transform method.
bag_of_words = vect.transform(bards_words)
# Print the repr of the result rather than its contents.
print("bag_of_words: {}".format(repr(bag_of_words)))

In [None]:
# toarray() converts the sparse matrix into a "dense" NumPy array that also
# stores all the 0 entries, so the actual content can be inspected.
print("Dense representation of bag_of_words:\n{}".format(bag_of_words.toarray()))

## Bag-of-Words for Movie Reviews

In [None]:
# Build the vocabulary from the training reviews, then encode those same
# reviews as the training matrix.
vect = CountVectorizer()
vect.fit(text_train)
X_train = vect.transform(text_train)
print("X_train:\n{}".format(repr(X_train)))

In [None]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer get_feature_names_out() and fall back to the old
# method only on installations that predate it.
if hasattr(vect, "get_feature_names_out"):
    # get_feature_names_out() returns an array; convert it to a list so the
    # printed slices keep the plain-list formatting of the old API.
    feature_names = list(vect.get_feature_names_out())
else:
    feature_names = vect.get_feature_names()

# Sample the vocabulary at the start, in the middle, and at regular intervals.
print("Number of features: {}".format(len(feature_names)))
print("First 20 features:\n{}".format(feature_names[:20]))
print("Features 20010 to 20030:\n{}".format(feature_names[20010:20030]))
print("Every 2000th feature:\n{}".format(feature_names[::2000]))

# Logistic regression

In [None]:
# np.mean is used below; import numpy in this cell so it runs on a fresh
# kernel instead of relying on leftover kernel state.
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Baseline: 5-fold cross-validation of a default LogisticRegression on the
# bag-of-words training matrix, reporting the mean of the fold scores.
scores = cross_val_score(LogisticRegression(), X_train, y_train, cv=5)
print("Mean cross-validation accuracy: {:.2f}".format(np.mean(scores)))

In [None]:
from sklearn.model_selection import GridSearchCV
# Candidate values for LogisticRegression's C parameter, each evaluated with
# 5-fold cross-validation on the training matrix.
param_grid = {"C": [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    cv=5,
).fit(X_train, y_train)

print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

## Model performance

In [None]:
# Encode the test reviews with the already-fitted vectorizer, then print the
# fitted grid search's score on the test set.
X_test = vect.transform(text_test)
test_score = grid.score(X_test, y_test)
print("{:.2f}".format(test_score))

# Model optimization

In [None]:
# Refit the vectorizer with min_df=5 and re-encode the training reviews.
# Note that this rebinds both vect and X_train for the cells below.
vect = CountVectorizer(min_df=5)
vect.fit(text_train)
X_train = vect.transform(text_train)
print("X_train with min_df: {}".format(repr(X_train)))

In [None]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer get_feature_names_out() and fall back to the old
# method only on installations that predate it.
if hasattr(vect, "get_feature_names_out"):
    # get_feature_names_out() returns an array; convert it to a list so the
    # printed slices keep the plain-list formatting of the old API.
    feature_names = list(vect.get_feature_names_out())
else:
    feature_names = vect.get_feature_names()

# Sample the vocabulary at the start, in the middle, and at regular intervals.
print("First 50 features:\n{}".format(feature_names[:50]))
print("Features 20010 to 20030:\n{}".format(feature_names[20010:20030]))
print("Every 700th feature:\n{}".format(feature_names[::700]))

In [None]:
# Repeat the 5-fold grid search on the current X_train, reusing the
# param_grid defined in an earlier cell.
grid = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    cv=5,
).fit(X_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))