In [6]:
# A way to unzip folders
import tarfile
# Acquired the movie dataset :D
with tarfile.open('aclImdb_v1.tar.gz', 'r:gz') as tar: tar.extractall()

  with tarfile.open('aclImdb_v1.tar.gz', 'r:gz') as tar: tar.extractall()


In [4]:
# Reading the movie reviews into a pandas DataFrame Object
# Python Progress Indicator (PyPrind)
import pyprind
import pandas as pd
import os

# change the 'basepath' to directory of the
# unzipped movie dataset

basepath = 'aclImdb'

#Define sentiment labels
labels = {'pos': 1, 'neg': 0}
# Initialize progress bar
pbar = pyprind.ProgBar(50000)
#Dataframe pandas
data = []
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file),'r', encoding='utf-8') as infile:
                txt = infile.read()
            # Store as a list
            data.append([txt, labels[l]])
            pbar.update()
# Convert to dataframe once at the end
df = pd.DataFrame(data, columns=['review', 'sentiment'])



0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:54


In [5]:
# Shuffle Dataframe using the Permutation Function
import numpy as np

np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv', index=False, encoding='utf-8')

#Reading the CSV
df = pd.read_csv('movie_data.csv', encoding='utf-8')
# Printing out the first three
df.head(3)


Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [6]:
# Checking if DataFrame contains all 50,000 rows:
df.shape

(50000, 2)

#### Bag-Of-Words Model
##### Represents text as numerical feature vectors
##### 1. We create a vocabulary of unique tokens—for example, words—from the entire set of documents.
##### 2. We construct a feature vector from each document that contains the counts of how often each word occurs in the particular document.

In [7]:
# Transforming words into feature vectors
import numpy as np
# CountVectorizer takes an array of text data and constructs the bag-of-words model
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer()
docs = np.array(['The sun is shining',
                 'The weather is sweet',
                 'The sun is shining, the weather is sweet,'
                 'and one and one is two'])
bag = count.fit_transform(docs)

print(count.vocabulary_)
print(bag.toarray())

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}
[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


#### Term Frequency-inverse document frequency
##### tf-idf(t,d) = tf(t,d) x idf(t,d)

In [8]:
# TfidfTransformer, takes the raw term frequencies from the CountVectorizer class as input and transforms them into tf-idfs
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(use_idf=True,
                         norm='l2',
                         smooth_idf=True)
np.set_printoptions(precision=2)
print(tfidf.fit_transform(count.fit_transform(docs))
      .toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


In [9]:
# Displaying the last 50 characters from the first document
df.loc[0, 'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [10]:
# Cleaning the Dataset
# Removing the punctuation marks
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub('[\W]+', ' ', text.lower()) + 
            ' '.join(emoticons).replace('-', ''))
    return text

preprocessor(df.loc[0, 'review'[-50:]])
preprocessor("</a>This :) is :(a test:-)!")

  emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
  text = (re.sub('[\W]+', ' ', text.lower()) +


'this is a test :) :( :)'

In [11]:
# Apply preprocessor function to all the movie reviews in the Dataframe
df['review'] = df['review'].apply(preprocessor)

In [12]:
# Processing documents into tokens
# Splitting the cleaned documents at their whitespace characters
def tokenizer(text):
    return text.split()
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

##### Word Stemming - which is the process of transforming a word into its roof form
##### Allows the mapping of related words to the same stem

In [13]:
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()
def tokenizer_porter(text):
    return[porter.stem(word) for word in text.split()]
tokenizer_porter('runners like running and thus they run')
# Reduces words to their root form "running" = "run"

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

##### Stop-Word Removal - words that are extremely common in all sorts of text
##### They (probably) bear no useful information that can be used to distinguish between different classes of documents

In [14]:
# Removing Stop-Words from the movie Reviews
# Use the set of 127 (omg) English stop-words
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes'
                             ' running and runs a lot')[-10:]
                             if w not in stop]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JanaE\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['runner', 'like', 'run', 'run', 'lot']

In [15]:
# Training a logistic regression model
# Classify the movie reviews into positive and negative

X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)
param_grid = [{'vect__ngram_range':[(1,1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
               {'vect__ngram_range':[(1,1)],
                'vect__stop_words': [stop, None],
                'vect__tokenizer': [tokenizer, tokenizer_porter],
                'vect__use_idf':[False],
                'vect__norm':[None],
                'clf__penalty': ['l1', 'l2'],
                'clf__C': [1.0, 10.0, 100.0]}]
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0,
                                                solver='liblinear'))])
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5, verbose=2,
                           n_jobs=1)
gs_lr_tfidf.fit(X_train, y_train)

In [7]:
# Print the best Parameter Set
print('Best parameter set" %s ' % gs_lr_tfidf.best_params_)

NameError: name 'gs_lr_tfidf' is not defined

In [29]:
# Print the CV average Accuracy Scores from the training dataset and test set
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)
print('Test Accuracy: %.3f' % clf.score(X_test, y_test)) 

NameError: name 'gs_lr_tfidf' is not defined

##### Out-Of-Core Learning - Allows working with large datasets by fitting the classifier incrementally on smaller batches of a dataset
##### Stochastic gradient descent - An optimization algorithm that updates the model's weight using one example at a time

In [16]:
# Define tokenizer function that cleans the 
# unprocessed text data from movie_data.csv file

import numpy as np
import re
from nltk.corpus import stopwords

stop = stopwords.words('english')
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) \
        + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

  emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
  text = re.sub('[\W]+', ' ', text.lower()) \


In [17]:
# Define function stream_docs, that reads in and returns one document at a time
def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv) # skip header
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label
# Verify stream_docs
next(stream_docs(path='movie_data.csv'))

('"In 1974, the teenager Martha Moxley (Maggie Grace) moves to the high-class area of Belle Haven, Greenwich, Connecticut. On the Mischief Night, eve of Halloween, she was murdered in the backyard of her house and her murder remained unsolved. Twenty-two years later, the writer Mark Fuhrman (Christopher Meloni), who is a former LA detective that has fallen in disgrace for perjury in O.J. Simpson trial and moved to Idaho, decides to investigate the case with his partner Stephen Weeks (Andrew Mitchell) with the purpose of writing a book. The locals squirm and do not welcome them, but with the support of the retired detective Steve Carroll (Robert Forster) that was in charge of the investigation in the 70\'s, they discover the criminal and a net of power and money to cover the murder.<br /><br />""Murder in Greenwich"" is a good TV movie, with the true story of a murder of a fifteen years old girl that was committed by a wealthy teenager whose mother was a Kennedy. The powerful and rich f

In [44]:
# Function get_minibatch, that will take document stream from stream_docs function
# Returns a particular number of documents specified by the size parameter

def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

In [45]:
# HashingVectorizer is data-independent and makes use of the hashing trick
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)

clf = SGDClassifier(loss='log_loss', random_state=1)
doc_stream = stream_docs(path='movie_data.csv')


In [46]:
# Out-of-core learning:
import pyprind

pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])

for _ in range(45):
    # Each mini-batch consists of 1,000 documents
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    
    if not X_train:  # ✅ Properly indented
        break

    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:19


In [47]:
# Last 5,000 documents to evaluate the performance of the model
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.868


##### Topic modeling describes the board task of assigning topics to unlabeled text documents

In [28]:
# LatenDirichletAllocation class, categorizing the movie review dataset into different topics
import pandas as pd

df = pd.read_csv('movie_data.csv', encoding='utf-8')

In [29]:
# Use CountVectorizer to create the bag-of-words martix as input to the LDA
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english',
                        max_df = .1,
                        max_features = 5000)
X = count.fit_transform(df['review'].values)

In [30]:
# How to fit a LDA estimator to the bag-of-words matrix
# and infer the 10 different topics from the documents
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components = 10,
                                random_state=123,
                                learning_method = 'batch')
# Setting LDA to batch, lets the lda estimator do its estimation based on 
# all available training data in one iteration
X_topics = lda.fit_transform(X)

lda.components_.shape

(10, 5000)

In [32]:
# Analyze the results
n_top_words = 5
feature_names = count.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    print("Topic %d:" % (topic_idx + 1))
    print(" ".join([feature_names[i]
                    for i in topic.argsort()\
                        [:-n_top_words - 1 :-1]]))

Topic 1:
horror effects budget special gore
Topic 2:
guy worst money minutes stupid
Topic 3:
version action japanese english match
Topic 4:
book audience human feel documentary
Topic 5:
series tv episode shows episodes
Topic 6:
family woman father mother girl
Topic 7:
music musical role performance song
Topic 8:
war police men murder action
Topic 9:
script comedy role actor performance
Topic 10:
comedy original action watched fan


In [40]:
# Confirm that the categories make sens based on the reviews
# Plot three movies from the horror movie category
horror = X_topics[:, 5].argsort()[::-1]
for iter_idx, movie_idx in enumerate(horror[:3]):
    print('\nHorror movie #%d: ' % (iter_idx + 1))
    print(df['review'][movie_idx][:300], '...')


Horror movie #1: 
I don't know whether this film hits my heart the way it does because of the feelings of friendship, love, closeness to others or the warmth of that transformation Babette's cooking creates, but when the feast starts and for the rest of the movie, I choke up often. <br /><br />Yes, this is a feel-goo ...

Horror movie #2: 
The morbid Catholic writer Gerard Reve (Jeroen Krabbé) that is homosexual, alcoholic and has frequent visions of death is invited to give a lecture in the literature club of Vlissingen. While in the railway station in Amsterdam, he feels a non-corresponded attraction to a handsome man that embarks i ...

Horror movie #3: 
This was just another marvelous film of the Berlin Festival. But unlike "Yes", by Sally Potter, which I had seen some days before, where after leaving the cinema I felt a strong desire of wishing to embrace the whole world and was just happy to be alive, this time quite the opposite thing happened:  ...


In [41]:
# Pickle module
import pickle
import os

# Creating a movieclassifier directory, store the files and data of web application
# pkl_objects is the subdirectory to save the serialized Python objects
dest = os.path.join('movieclassifier', 'pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)
# First argument the object that is wanted to pickle
# Second argument, provide an open file object that the Python object will be written in
pickle.dump(stop,
            open(os.path.join(dest, 'stopwords.pkl'), 'wb'), # Setting it to binary mode using wb
            protocol=4) # choose the latest and most efficient pickle protocol 
pickle.dump(clf,
            open(os.path.join(dest, 'classifier.pkl'), 'wb'),
            protocol=4)