In [2]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import nltk, re, numpy as np, pandas as pd, collections
from nltk.corpus import movie_reviews

In [5]:
# Fetch the NLTK `movie_reviews` corpus used by the rest of the notebook;
# quiet=True keeps the downloader from printing its progress log.
nltk.download('movie_reviews', quiet=True)

True

In [26]:
# Load the corpus: one space-joined string per review plus a 0/1 label.
# Everything is derived from `fileids` so that documents, labels and the fold
# ids (also computed from `fileids`) share one and the same ordering.
fileids = movie_reviews.fileids()
words = [movie_reviews.words(text_id) for text_id in fileids]
texts = [' '.join(text) for text in words]
# Label is 1 when the first category listed for the file is 'pos', otherwise 0.
labels = np.array([1 if movie_reviews.categories(text_id)[0] == 'pos' else 0 for text_id in fileids])

In [27]:
# Normalise every review: lowercase it, then squeeze each run of whitespace
# down to a single space.
docs = list(map(lambda text: re.sub(r"\s+", " ", text.lower()), texts))

In [24]:
# Sanity check: report the corpus size and the class split
# (labels are 1 for 'pos' and 0 otherwise, so the sum counts positives).
print(f"Loaded {len(docs)} docs – {labels.sum()} pos, {len(labels)-labels.sum()} neg")


Loaded 2000 docs – 1000 pos, 1000 neg


In [28]:
# Manually define the cross-validation folds from the corpus file names

def fold_id(fid: str, n_folds: int = 10) -> int:
    """Map a corpus file id to its cross-validation fold.

    The fold is the three-digit number that follows ``cv`` in the file id
    (for example ``123`` in ``pos/cv123_4567.txt``) taken modulo ``n_folds``.

    Args:
        fid: File id containing a ``cv<3 digits>_`` component.
        n_folds: Number of folds to distribute the files over (default 10).

    Returns:
        The fold number, in ``range(n_folds)``.

    Raises:
        ValueError: If ``n_folds`` is not positive, or if ``fid`` contains no
            ``cv<3 digits>_`` component to derive the fold from.
    """
    if n_folds < 1:
        raise ValueError(f"n_folds must be a positive integer, got {n_folds!r}")
    match = re.search(r'cv(\d{3})_', fid)
    if match is None:
        # Fail with the offending id instead of an AttributeError on None.
        raise ValueError(f"cannot derive a fold from file id {fid!r}: no 'cvNNN_' component found")
    return int(match.group(1)) % n_folds
# Fold number of every document, in the same order as `fileids`.
fold_idx = np.array(list(map(fold_id, fileids)))
# Verify balance
print(collections.Counter(fold_idx))
# Build explicit (train, test) splits list: fold k is held out for testing
# and every other fold is used for training.
test_masks = [fold_idx == k for k in range(10)]
cv_splits = [(np.where(~held_out)[0], np.where(held_out)[0]) for held_out in test_masks]


Counter({np.int64(0): 200, np.int64(1): 200, np.int64(2): 200, np.int64(3): 200, np.int64(4): 200, np.int64(5): 200, np.int64(6): 200, np.int64(7): 200, np.int64(8): 200, np.int64(9): 200})


In [45]:
# Binary unigram-presence features: no stop-word filtering, single tokens only.
vec_presence = CountVectorizer(binary=True, stop_words=None, ngram_range=(1,1))

# Baseline reproduction
models = {
    'NaiveBayes': MultinomialNB(alpha=1.0),
    'MaxEnt'    : LogisticRegression(max_iter=2000, C=1e4, solver='lbfgs'),
    # random_state pins the shuffling done by the dual solver so the SVM
    # scores are repeatable from run to run.
    'SVM'       : LinearSVC(C=1, max_iter=5000, random_state=0)
}

# Score every classifier on the same predefined folds.
# baseline_res maps model name -> (mean accuracy, std of accuracy) over the folds.
baseline_res = {}
for name, clf in models.items():
    pipe   = Pipeline([('vec', vec_presence), ('clf', clf)])
    scores = cross_val_score(pipe, docs, labels, cv=cv_splits, scoring='accuracy', n_jobs=-1)
    baseline_res[name] = (scores.mean(), scores.std())
    print(f"{name:10s}  mean={scores.mean():.3f}  std={scores.std():.3f}")


NaiveBayes  mean=0.821  std=0.034
MaxEnt      mean=0.865  std=0.026
SVM         mean=0.849  std=0.036


In [53]:
# Experiment 1: binary unigram presence with English stop words removed

vec_presence  = CountVectorizer(binary=True, stop_words='english', ngram_range=(1,1))

models = {
    'NaiveBayes': MultinomialNB(alpha=1.0),
    'MaxEnt'    : LogisticRegression(max_iter=2000, C=1e4, solver='lbfgs'),
    # random_state pins the shuffling done by the dual solver so the SVM
    # scores are repeatable from run to run.
    'SVM'       : LinearSVC(C=1, max_iter=5000, random_state=0)
}

# NOTE(review): this rebinds `baseline_res`, so results stored under that name
# by an earlier cell are discarded; it now holds this experiment's
# name -> (mean accuracy, std of accuracy) over the folds.
baseline_res = {}
for name, clf in models.items():
    pipe   = Pipeline([('vec', vec_presence), ('clf', clf)])
    scores = cross_val_score(pipe, docs, labels, cv=cv_splits, scoring='accuracy', n_jobs=-1)
    baseline_res[name] = (scores.mean(), scores.std())
    print(f"{name:10s}  mean={scores.mean():.3f}  std={scores.std():.3f}")


NaiveBayes  mean=0.822  std=0.032
MaxEnt      mean=0.862  std=0.021
SVM         mean=0.847  std=0.025


In [54]:
# Experiment 2: English stop words removed + idf weighting
# binary=True clips term counts to 0/1 before weighting, so the features are
# idf-weighted presence rather than frequency-based tf-idf.

vec_presence  = TfidfVectorizer(binary=True, stop_words='english', ngram_range=(1,1))

models = {
    'NaiveBayes': MultinomialNB(alpha=1.0),
    'MaxEnt'    : LogisticRegression(max_iter=2000, C=1e4, solver='lbfgs'),
    # random_state pins the shuffling done by the dual solver so the SVM
    # scores are repeatable from run to run.
    'SVM'       : LinearSVC(C=1, max_iter=5000, random_state=0)
}

# NOTE(review): this rebinds `baseline_res`, so results stored under that name
# by an earlier cell are discarded; it now holds this experiment's
# name -> (mean accuracy, std of accuracy) over the folds.
baseline_res = {}
for name, clf in models.items():
    pipe   = Pipeline([('vec', vec_presence), ('clf', clf)])
    scores = cross_val_score(pipe, docs, labels, cv=cv_splits, scoring='accuracy', n_jobs=-1)
    baseline_res[name] = (scores.mean(), scores.std())
    print(f"{name:10s}  mean={scores.mean():.3f}  std={scores.std():.3f}")


NaiveBayes  mean=0.842  std=0.025
MaxEnt      mean=0.879  std=0.019
SVM         mean=0.877  std=0.022


In [56]:
# Experiment 3: English stop words removed + idf weighting + n-grams up to length 3
# ngram_range=(1,3) extracts unigrams, bigrams and trigrams, not trigrams alone.
vec_presence  = TfidfVectorizer(binary=True, stop_words='english', ngram_range=(1,3))

# NOTE(review): C differs from the earlier experiment cells (MaxEnt 1e4 -> 1,
# SVM 1 -> 10), so the score change here reflects the regularisation change as
# well as the added n-grams - confirm this is intended.
models = {
    'NaiveBayes': MultinomialNB(alpha=1.0),
    'MaxEnt'    : LogisticRegression(max_iter=2000, C=1, solver='lbfgs'),
    # random_state pins the shuffling done by the dual solver so the SVM
    # scores are repeatable from run to run.
    'SVM'       : LinearSVC(C=10, max_iter=5000, random_state=0)
}

# NOTE(review): this rebinds `baseline_res`, so results stored under that name
# by an earlier cell are discarded; it now holds this experiment's
# name -> (mean accuracy, std of accuracy) over the folds.
baseline_res = {}
for name, clf in models.items():
    pipe   = Pipeline([('vec', vec_presence), ('clf', clf)])
    scores = cross_val_score(pipe, docs, labels, cv=cv_splits, scoring='accuracy', n_jobs=-1)
    baseline_res[name] = (scores.mean(), scores.std())
    print(f"{name:10s}  mean={scores.mean():.3f}  std={scores.std():.3f}")

NaiveBayes  mean=0.841  std=0.020
MaxEnt      mean=0.847  std=0.028
SVM         mean=0.861  std=0.021
