In [2]:
from sklearn.datasets import load_files
import os

In [3]:
# Root directory of the extracted aclImdb dataset; later cells read its
# `train` and `test` subdirectories (each with `pos` / `neg` folders).
# The location can be overridden with the IMDB_DATA_PATH environment variable
# so the notebook is runnable on machines other than the author's; when the
# variable is unset the original hard-coded location is used.
PATH_TO_DATA = os.environ.get(
    'IMDB_DATA_PATH',
    '/Users/mzulliev/education/ods_mlcourse/data/aclImdb',
)

In [4]:
# Shell escape: print the total on-disk size of the dataset directory
# (`-s` = one summary line, `-h` = human-readable units).
!du -hs $PATH_TO_DATA

487M	/Users/mzulliev/education/ods_mlcourse/data/aclImdb


In [5]:
# Shell escape: on-disk size of the train and test subdirectories separately.
!du -hs $PATH_TO_DATA/train
!du -hs $PATH_TO_DATA/test

365M	/Users/mzulliev/education/ods_mlcourse/data/aclImdb/train
121M	/Users/mzulliev/education/ods_mlcourse/data/aclImdb/test


In [6]:
%%time
train_reviews = load_files(os.path.join(PATH_TO_DATA, 'train'), categories=['pos', 'neg'])
test_reviews = load_files(os.path.join(PATH_TO_DATA, 'test'), categories=['pos', 'neg'])

CPU times: user 366 ms, sys: 1.09 s, total: 1.45 s
Wall time: 5.62 s


In [7]:
import numpy as np

In [8]:
# Count how many documents fall into each class label, per split.
# The second line must inspect the *test* targets; it previously re-used
# train_reviews.target, so the test split was never actually checked.
print("Train class distribution:", np.bincount(train_reviews.target))
print("Test class distribution:", np.bincount(test_reviews.target))

Train class distribution: [12500 12500]
Test class distribution: [12500 12500]


In [9]:
from bs4 import BeautifulSoup

# Replace every raw document with its markup-free text (BeautifulSoup's
# built-in 'html.parser'), for the train split first and then the test split.
train_reviews.data, test_reviews.data = (
    [BeautifulSoup(raw_doc, 'html.parser').get_text() for raw_doc in bunch.data]
    for bunch in (train_reviews, test_reviews)
)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: unigrams and bigrams, English stop words removed,
# vocabulary capped at 50000 entries.
cv = CountVectorizer(
    ngram_range=(1, 2),
    stop_words="english",
    max_features=50000,
)

# The vocabulary is learned from the training documents only; the test
# documents are merely projected onto it.
X_train_sparse = cv.fit_transform(train_reviews.data)
X_test_sparse = cv.transform(test_reviews.data)

# Sanity check: both matrices must share the same number of columns.
print("Shape of training data:", X_train_sparse.shape)
print("Shape of test data:", X_test_sparse.shape)
print("Vocabulary size:", len(cv.vocabulary_))

Shape of training data: (25000, 50000)
Shape of test data: (25000, 50000)
Vocabulary size: 50000


In [11]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score

In [12]:
# Class labels for each split, as provided by load_files.
y_train = train_reviews.target
y_test = test_reviews.target

In [13]:
# Two classifiers to compare, both seeded with random_state=17.
# Batch logistic regression fitted with the 'saga' solver.
logit = LogisticRegression(
    solver="saga",
    max_iter=1000,
    random_state=17,
    n_jobs=-1,
)

# Linear model trained by stochastic gradient descent with the log loss.
sgd_logit = SGDClassifier(
    loss="log_loss",
    alpha=0.0001,
    learning_rate="optimal",
    max_iter=10000,
    random_state=17,
    n_jobs=-1,
)

In [14]:
%%time
# Fit the LogisticRegression model on the sparse count matrix and the training
# labels; the %%time cell magic (which must stay on the first line) reports
# how long the fit takes.
print("Training LogisticRegression...")
logit.fit(X_train_sparse, y_train)

Training LogisticRegression...
CPU times: user 17 s, sys: 31.1 ms, total: 17 s
Wall time: 17.1 s




In [15]:
%%time
# Fit the SGDClassifier on the same training matrix and labels; %%time (first
# line of the cell) reports the cost so it can be compared with the
# LogisticRegression fit.
print("Training SGDClassifier...")
sgd_logit.fit(X_train_sparse, y_train)

Training SGDClassifier...
CPU times: user 169 ms, sys: 11.1 ms, total: 180 ms
Wall time: 187 ms


In [16]:
# Predict test labels with each fitted model (LogisticRegression first,
# then SGDClassifier).
logit_pred, sgd_pred = (
    fitted_model.predict(X_test_sparse) for fitted_model in (logit, sgd_logit)
)

# Report the fraction of test reviews classified correctly by each model.
print(
    "LogisticRegression accuracy (CountVectorizer):",
    accuracy_score(y_test, logit_pred),
)
print(
    "SGDClassifier accuracy (CountVectorizer):",
    accuracy_score(y_test, sgd_pred),
)

LogisticRegression accuracy (CountVectorizer): 0.87376
SGDClassifier accuracy (CountVectorizer): 0.86632
