In [5]:
import os
import sys
import tarfile
import time


# URL of the gzipped dataset archive and the local filename it is
# downloaded to (the same filename is opened later for extraction).
source = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
target = 'aclImdb_v1.tar.gz'


def reporthook(count, block_size, total_size):
    """Write a one-line download progress report to stdout.

    Intended to be passed as the report hook of ``urlretrieve``.

    Parameters
    ----------
    count : int
        Number of blocks transferred so far. A value of ``0`` marks the
        start of the download: it (re)starts the timer kept in the global
        ``start_time`` and prints nothing.
    block_size : int
        Size of one block in bytes.
    total_size : int
        Total size of the download in bytes. Values <= 0 are treated as
        "size unknown" and the percentage is reported as 0.
    """
    global start_time
    if count == 0:
        start_time = time.time()
        return
    duration = time.time() - start_time
    progress_size = int(count * block_size)
    progress_mb = progress_size / 1024.**2
    # A coarse clock can report zero elapsed time for the first blocks;
    # report a speed of 0 instead of dividing by zero.
    speed = progress_mb / duration if duration > 0 else 0.
    if total_size > 0:
        # The last block is usually partial, so count * block_size can
        # overshoot the real size; cap the display at 100%.
        percent = min(100., progress_size * 100. / total_size)
    else:
        percent = 0.
    sys.stdout.write("\r%d%% | %d MB | %.2f MB/s | %d sec elapsed" %
                     (percent, progress_mb, speed, duration))
    sys.stdout.flush()


# Download the archive only when neither the extracted dataset directory
# nor a previously downloaded archive is present.
if not os.path.isdir('aclImdb') and not os.path.isfile(target):

    if (sys.version_info < (3, 0)):
        # Python 2 exposes urlretrieve directly on ``urllib``.
        import urllib
        urlretrieve = urllib.urlretrieve

    else:
        import urllib.request
        urlretrieve = urllib.request.urlretrieve

    # Download under a temporary name and rename on success, so that an
    # interrupted transfer never leaves a truncated file at `target`
    # (which the guard above would mistake for a complete archive).
    partial_target = target + '.part'
    try:
        urlretrieve(source, partial_target, reporthook)
        os.rename(partial_target, target)
    finally:
        # Only still present if the download or the rename failed.
        if os.path.exists(partial_target):
            os.remove(partial_target)

100% | 80 MB | 7.79 MB/s | 10 sec elapsed

In [8]:
# Unpack the downloaded archive unless the dataset directory already exists.
if not os.path.isdir('aclImdb'):

    with tarfile.open(target, 'r:gz') as tar:
        # Where extraction filters are available, use the 'data' filter:
        # it refuses members that would be written outside the destination
        # directory, and passing a filter explicitly avoids the
        # DeprecationWarning newer interpreters emit for a bare extractall().
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(filter='data')
        else:
            tar.extractall()

  tar.extractall()


In [9]:
%pip install pyprind



In [10]:
import pyprind
import pandas as pd
import os

# change the `basepath` to the directory of the
# unzipped movie dataset

basepath = 'aclImdb'

# Sub-directory name -> numeric sentiment label.
labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)
data_list = []  # one [review text, sentiment label] row per file
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        # sorted() makes the row order independent of the order in which
        # the filesystem happens to list the files.
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file),
                      'r', encoding='utf-8') as infile:
                txt = infile.read()
            data_list.append([txt, labels[l]])
            pbar.update()

# Build the frame in one step from the collected rows. Concatenating onto
# an empty placeholder DataFrame is unnecessary (and deprecated behaviour
# in recent pandas), and the column names are already set here.
df = pd.DataFrame(data_list, columns=['review', 'sentiment'])

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:01


In [11]:

import numpy as np

# Shuffle the rows reproducibly: seed NumPy's global RNG, draw a random
# ordering of the index labels, and reorder the frame to match it.
np.random.seed(0)
df = df.reindex(index=np.random.permutation(df.index))

In [12]:

df.to_csv('movie_data.csv', index=False, encoding='utf-8')

In [13]:
import pandas as pd

df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [14]:
df.shape

(50000, 2)

In [15]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus of three sentences; the third repeats words from the first
# two, so some of its counts exceed one.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two',
])

# Learn the vocabulary and build the document-term count matrix.
count = CountVectorizer()
bag = count.fit_transform(docs)

In [16]:

print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [17]:
print(bag.toarray())

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [19]:

np.set_printoptions(precision=2)

In [18]:
from sklearn.feature_extraction.text import TfidfTransformer

# tf-idf with smoothed idf and L2-normalised rows, applied to the raw
# term counts of the toy corpus.
tfidf = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)
print(
    tfidf.fit_transform(count.fit_transform(docs)).toarray()
)

[[0.         0.43370786 0.         0.55847784 0.55847784 0.
  0.43370786 0.         0.        ]
 [0.         0.43370786 0.         0.         0.         0.55847784
  0.43370786 0.         0.55847784]
 [0.50238645 0.44507629 0.50238645 0.19103892 0.19103892 0.19103892
  0.29671753 0.25119322 0.19103892]]


In [20]:
# Worked example: tf-idf of the term "is" in the third toy document.
n_docs = 3   # documents in the toy corpus
tf_is = 3    # occurrences of "is" in the third document
# Smoothed idf = ln((1 + n_docs) / (1 + df)); "is" appears in all three
# documents, so df = 3 and the idf is ln(1) = 0.
idf_is = np.log((1 + n_docs) / (1 + 3))
tfidf_is = (idf_is + 1) * tf_is
print('tf-idf of term "is" = %.2f' % tfidf_is)

tf-idf of term "is" = 3.00


In [21]:

# Same transform without row normalisation; keep only the last document's
# row so it can be compared with the manual calculation.
tfidf = TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)
raw_tfidf = (
    tfidf
    .fit_transform(count.fit_transform(docs))
    .toarray()[-1]
)
raw_tfidf

array([3.39, 3.  , 3.39, 1.29, 1.29, 1.29, 2.  , 1.69, 1.29])

In [22]:
# Manual L2 normalisation: divide the vector by its Euclidean length.
l2_tfidf = raw_tfidf / np.sqrt((raw_tfidf**2).sum())
l2_tfidf

array([0.5 , 0.45, 0.5 , 0.19, 0.19, 0.19, 0.3 , 0.25, 0.19])

In [23]:

df.loc[0, 'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [24]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

  emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
  text = (re.sub('[\W]+', ' ', text.lower()) +


In [25]:

preprocessor(df.loc[0, 'review'][-50:])

'is seven title brazil not available'

In [26]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [27]:
df['review'] = df['review'].apply(preprocessor)

In [28]:
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens


def tokenizer_porter(text):
    """Split ``text`` on whitespace and return ``porter.stem`` of each token."""
    stems = []
    for word in text.split():
        stems.append(porter.stem(word))
    return stems

In [29]:
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [30]:
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [31]:
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [32]:
from nltk.corpus import stopwords

# English stop-word list, then a quick demo: stem a sentence and drop
# every stem that appears in the list.
stop = stopwords.words('english')
[
    token
    for token in tokenizer_porter('a runner likes running and runs a lot')[-10:]
    if token not in stop
]

['runner', 'like', 'run', 'run', 'lot']

**Training a logistic regression model for document classification**

In [33]:
# Split by position: the first 25,000 rows for training, the rest for
# testing. `.loc` slices by label and includes BOTH endpoints, so
# `.loc[:25000]` / `.loc[25000:]` would place the row labelled 25000 in
# both sets (train/test leakage) and give 25,001 training rows.
X_train = df['review'].iloc[:25000].values
y_train = df['sentiment'].iloc[:25000].values
X_test = df['review'].iloc[25000:].values
y_test = df['sentiment'].iloc[25000:].values

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

# Lower-casing and clean-up were already applied to the review column by
# `preprocessor`, so the vectorizer's own preprocessing is switched off.
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

# Two sub-grids: the first keeps the vectorizer's default tf-idf weighting,
# the second switches idf and normalisation off (raw term frequencies).
param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]

# The grid searches both 'l1' and 'l2' penalties. The default solver in
# recent scikit-learn releases (lbfgs) does not support 'l1', so every
# 'l1' candidate would fail to fit; liblinear supports both penalties.
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0,
                                                solver='liblinear'))])

# 5-fold cross-validated accuracy over the whole grid, using all cores.
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)

In [35]:

# When the notebook runs on the Travis continuous-integration platform
# (detected through the TRAVIS environment variable), make the grid search
# log more verbosely and replace the train/test arrays with small subsets
# so the run finishes quickly. This is meant for testing/debugging only.

if 'TRAVIS' in os.environ:
    gs_lr_tfidf.verbose=2
    # NOTE: `.loc` slices are label-inclusive, so with a default integer
    # index each of these subsets holds 251 rows, not 250.
    X_train = df.loc[:250, 'review'].values
    y_train = df.loc[:250, 'sentiment'].values
    X_test = df.loc[25000:25250, 'review'].values
    y_test = df.loc[25000:25250, 'sentiment'].values

In [None]:
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [None]:
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

1. Why is it important to remove HTML tags and punctuation before modeling?
HTML tags and punctuation add noise and tokens that do not carry semantic meaning for most NLP models. Removing them reduces vocabulary size, avoids spurious tokens (like <br>), and improves signal-to-noise ratio — leading to better, faster models and clearer feature importance.

2. How do emoticons contribute to sentiment analysis?
Emoticons are strong, compact sentiment signals (e.g., :-) positive, :( negative). They can override ambiguous wording and help models detect sentiment when text is neutral or sarcastic. Keeping and extracting emoticons preserves this direct emotional cue.

3. Compare logistic regression performance using different vectorizers.

CountVectorizer (Bag-of-Words): Simple, interpretable counts; works well for frequent discriminative tokens but ignores how important a term is across the corpus.

TfidfVectorizer: Downweights common words and upweights distinctive words — often yields better generalization for sentiment tasks.

HashingVectorizer: Memory-efficient and good for streaming/large data (no fit step), but not invertible — you can’t easily map hashed features back to words. Logistic regression + TF-IDF usually gives the best accuracy; Hashing + SGD is better when the dataset is too large to fit in memory.

4. What problem does out-of-core learning solve?
Out-of-core learning (incremental learning) lets you train on datasets that are larger than memory by streaming batches from disk and updating the model incrementally (e.g., partial_fit). It solves memory limits and enables training on very large corpora.

5. What do LDA topics represent?
LDA discovers latent topics as probability distributions over words. Each topic represents a theme: words that frequently co-occur across documents. A document is modeled as a mixture of topics. Topics are not labeled — interpret them by inspecting their top words.

6. Visualizations to include (deliverables)

visualizations/tfidf_hist.png — histogram of non-zero TF-IDF values (gives idea of sparsity / value range).

visualizations/topic_top_words.png — top-n words per LDA topic (visual representation of discovered topics).

Optionally: t-SNE / PCA of TF-IDF vectors colored by sentiment to show separability (code below).