# Solutions V: Bag of Words

In [1]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Read the IMDB reviews file (the "small" variant, going by the file name).
# The file is read with explicit zip decompression.
reviews = pd.read_csv(
    "../../0_data/imdb/imdb_reviews_small.csv",
    compression="zip",
)
# Cell output: (number of rows, number of columns).
reviews.shape

In [3]:
# Boolean masks selecting the rows of each predefined split
# (the "dataset" column marks every review as "Train" or "Test").
train_mask = reviews["dataset"] == "Train"
test_mask = reviews["dataset"] == "Test"

# Construct X, y for the training set: review content as input, label as target.
# A single .loc[rows, column] selection avoids temporarily binding X_train to
# the whole DataFrame and avoids chained indexing.
X_train = reviews.loc[train_mask, "content"]
y_train = reviews.loc[train_mask, "label"]

# Construct X, y for the test set in the same way.
X_test = reviews.loc[test_mask, "content"]
y_test = reviews.loc[test_mask, "label"]

In [None]:
# Label distribution in the train set: number of reviews per label value.
# Used to check how balanced the classes are before training.
y_train.value_counts()

In [None]:
# Label distribution in the test set: number of reviews per label value.
# Compare with the train distribution shown in the previous cell.
y_test.value_counts()

### Vectorize using word counts

In [None]:
# Create a CountVectorizer (with lowercasing enabled) and fit it on the
# training reviews only, so the vocabulary is learned without the test set.
vectorizer = CountVectorizer(lowercase=True)
# The fitted vectorizer is the last expression, so it is shown as cell output.
vectorizer.fit(X_train)

In [None]:
# Number of words in the fitted vocabulary (= number of model features).
len(vectorizer.vocabulary_)

In [None]:
# Reduce the vocabulary by removing English stop words and requiring a
# minimum frequency (min_df=25). This rebinds `vectorizer`, replacing the
# unrestricted vectorizer fitted above.
vectorizer = CountVectorizer(lowercase=True, stop_words="english", min_df=25)
# Fit on the training reviews only; the fitted vectorizer is the cell output.
vectorizer.fit(X_train)

In [None]:
# Vocabulary size after the stop-word and minimum-frequency filtering:
# a large reduction compared with the unrestricted vocabulary!
len(vectorizer.vocabulary_)

In [10]:
# Turn both splits into word-count matrices using the vocabulary that was
# fitted on the training reviews (the vectorizer is not refitted here).
Xt_train, Xt_test = map(vectorizer.transform, (X_train, X_test))

### Bag of words model

In [None]:
# Create and fit a RandomForest classifier on the word-count features.
# A fixed random_state makes the fitted forest, and therefore the accuracies
# computed from it, reproducible across kernel restarts.
# n_jobs=-1 trains the trees in parallel.
model = RandomForestClassifier(n_jobs=-1, random_state=42)
model.fit(Xt_train, y_train)

In [None]:
# Predict on the data the model was fitted on, then score those predictions.
# This is the train-set accuracy, to be compared with the test-set accuracy.
train_predictions = model.predict(Xt_train)
accuracy_score(y_train, train_predictions)

In [None]:
# Predict on the held-out test reviews, then score those predictions.
# This is the test-set accuracy of the current model.
test_predictions = model.predict(Xt_test)
accuracy_score(y_test, test_predictions)

### TF-IDF Vectorizer

In [14]:
# Set up a TF-IDF vectorizer with the same settings as the reduced
# CountVectorizer above (lowercasing, English stop words, min_df=25),
# so the two representations can be compared like for like.
tfidf = TfidfVectorizer(lowercase=True, stop_words="english", min_df=25)

In [15]:
# Convert the data to TF-IDF features.
# The vectorizer is fitted on the training reviews only; the test reviews are
# transformed with that fitted vectorizer.
# NOTE: this rebinds Xt_train / Xt_test, replacing the word-count matrices
# created in the bag-of-words section.
Xt_train = tfidf.fit_transform(X_train)
Xt_test = tfidf.transform(X_test)

In [None]:
# Create and fit a fresh RandomForest classifier on the TF-IDF features.
# Instantiating a new estimator here keeps this cell from depending on an
# estimator object left over from the bag-of-words section.
# A fixed random_state makes the reported accuracy reproducible.
model = RandomForestClassifier(n_jobs=-1, random_state=42)
model.fit(Xt_train, y_train)

In [None]:
# Predict on the TF-IDF-encoded test reviews, then score those predictions.
# This is the test-set accuracy of the model fitted on TF-IDF features.
tfidf_test_predictions = model.predict(Xt_test)
accuracy_score(y_test, tfidf_test_predictions)