In [None]:
#All imports, as well as other important start-up code, are in cell 1
#Cell 1 MUST be run before any other cell will work
import pyprind
import pandas as pd
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import re
from nltk.stem.porter import PorterStemmer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
# Root directory of the extracted review dataset. The loop below expects the
# layout <basepath>/{test,train}/{pos,neg}/<one text file per review>.
basepath = 'aclImdb'


# Sub-directory name -> integer sentiment label.
labels = {'pos': 1, 'neg': 0}
# 50000 is the number of files the progress bar expects to see in total.
pbar = pyprind.ProgBar(50000)
# Rows are collected in a plain list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
rows = []
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        # Sorted so the row order (and therefore the seeded shuffle below)
        # does not depend on the filesystem's directory ordering.
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            rows.append([txt, labels[l]])
            pbar.update()
df = pd.DataFrame(rows, columns=['review', 'sentiment'])

# Shuffle the rows with a fixed seed and persist them so later cells can
# reload the data without re-reading every individual file.
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv', index=False, encoding='utf-8')

# Reload from disk; this also resets the index to 0..n-1 in shuffled order.
df = pd.read_csv('movie_data.csv', encoding='utf-8')

# Three toy sentences used to illustrate the bag-of-words representation.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining and the weather is sweet',
])
count = CountVectorizer()
# Learn the vocabulary from the toy sentences and build the count matrix.
bag = count.fit_transform(docs)
# Show the learned word -> column-index mapping.
print(count.vocabulary_)

In [None]:
# norm must be the string 'l2' (lowercase L + digit 2) to request L2 row
# normalisation; the previous value '12' (digits one-two) is not a valid norm.
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
np.set_printoptions(precision=2)
# Re-fit the count vectorizer on the toy docs and print the tf-idf weights.
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())


In [None]:
#Make sure cell 1 has been run
def preprocessor(text):
    """Clean one raw review string.

    Steps:
      1. Remove HTML-style tags (anything between ``<`` and ``>``).
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D``, ``:P`` from the
         tag-free text (case-sensitive, so only upper-case D / P match).
      3. Lower-case the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons, with the optional ``-`` nose
         removed, separated from the text and from each other by spaces.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text; unchanged from step 3 when no emoticon was found.
    """
    text = re.sub(r'<[^>]*>', '', text)
    # No space between the eyes and the optional nose: the earlier pattern
    # required a literal space there, so ':)' or ':-)' never matched.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    if emoticons:
        # Explicit separator so the first emoticon is not glued to the
        # preceding word when the text does not end in punctuation.
        cleaned = cleaned + ' ' + ' '.join(emoticons).replace('-', '')
    return cleaned
# Clean every review element-wise with preprocessor().
df['review'] = df['review'].map(preprocessor)

def tokenizer(text):
    """Break ``text`` into tokens at runs of whitespace and return them as a list."""
    tokens = text.split()
    return tokens

# Single shared stemmer instance reused by every call below.
porter = PorterStemmer()
def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each token."""
    return list(map(porter.stem, text.split()))

# English stop-word list from the NLTK 'stopwords' corpus (fetched by the
# nltk.download call in cell 1).
stop = stopwords.words('english')


In [None]:
# Positional slicing: the first 25,000 rows train, the remainder test.
# The earlier label-based df.loc[:25000] is end-inclusive, which put row
# 25000 into both the training and the test set.
x_train = df['review'].iloc[:25000].values
y_train = df['sentiment'].iloc[:25000].values
x_test = df['review'].iloc[25000:].values
y_test = df['sentiment'].iloc[25000:].values

# Lower-casing and preprocessing are disabled here because the reviews were
# already cleaned by preprocessor() in the previous cell.
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)
# Pipeline parameters are addressed as '<step name>__<parameter>' with a
# DOUBLE underscore; the regularisation strength is the upper-case 'C', and
# the penalties are the strings 'l1' / 'l2' (lowercase L, not the digit one).
param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]}
              ]
# liblinear is chosen because it supports both the l1 and the l2 penalty;
# the default solver rejects penalty='l1'.
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0,
                                                solver='liblinear'))])
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5, verbose=1,
                           n_jobs=1)
gs_lr_tfidf.fit(x_train, y_train)
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
print('CV Accuracy: %.3f' %gs_lr_tfidf.best_score_)

# Evaluate the best pipeline found by the search on the held-out rows.
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(x_test, y_test))

In [None]:
#New part from page 21
#can run this cell down 
import numpy as np
import re
from nltk.corpus import stopwords
# English stop-word list from the NLTK 'stopwords' corpus.
stop = stopwords.words('english')
def tokenizer(text):
    """Clean a raw review and return its tokens with stop words removed.

    Note: this redefines (and, once the cell has run, replaces) the
    whitespace-only ``tokenizer`` defined in an earlier cell.

    Steps: strip HTML-style tags, collect emoticons from the tag-free text,
    lower-case and collapse non-word runs to single spaces, append the
    emoticons (without the ``-`` nose), split on whitespace and drop every
    token found in ``stop``.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Remaining tokens in their original order.
    """
    text = re.sub(r'<[^>]*>', '', text)
    # Match on the original-case text: the pattern lists upper-case 'D' and
    # 'P', which could never match after lower-casing. The stray space that
    # used to sit between the eyes and the optional nose is removed as well,
    # otherwise ':)' and ':-)' are never found.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub(r'[\W]+', ' ', text.lower())
    if emoticons:
        # Explicit separator so the first emoticon is its own token.
        text = text + ' ' + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a two-column CSV file.

    The first line is treated as a header and skipped. Every other line is
    expected to end in ``,<digit>``: the final character is parsed as the
    integer label and everything before the separating comma is yielded
    verbatim as the text (CSV quoting is NOT undone).

    Parameters
    ----------
    path : str
        Path of the UTF-8 encoded CSV file.

    Yields
    ------
    tuple of (str, int)
        Raw text and its label, one pair per data line.

    Raises
    ------
    ValueError
        If the last character of a non-blank line is not a digit.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        # Skip the header. The default keeps an empty file from raising
        # StopIteration inside the generator, which Python turns into a
        # RuntimeError.
        next(handle, None)
        for line in handle:
            # Strip the line terminator first so a final line without a
            # trailing newline is sliced the same way as all the others.
            record = line.rstrip('\r\n')
            if not record:
                continue  # tolerate blank lines
            text, label = record[:-2], int(record[-1])
            yield text, label

def get_minibatch(doc_stream, size):
    """Pull exactly ``size`` ``(text, label)`` pairs from an iterator.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` pairs, e.g. ``stream_docs(...)``.
    size : int
        Number of pairs to collect.

    Returns
    -------
    tuple
        ``(docs, y)`` as two parallel lists, or ``(None, None)`` when the
        stream runs out before ``size`` pairs were read (pairs consumed in
        that partial attempt are discarded).
    """
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        # Was `None, none`, which raised NameError instead of signalling
        # exhaustion to the caller.
        return None, None
    return docs, y

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
# Stateless hashing vectorizer (no vocabulary to fit), so documents can be
# transformed batch by batch; tokenisation is delegated to tokenizer().
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)
# Logistic-regression loss trained by SGD. Current scikit-learn spells the
# loss 'log_loss' (formerly 'log') and the iteration count 'max_iter'
# (formerly 'n_iter').
clf = SGDClassifier(loss='log_loss', random_state=1, max_iter=1)
# Generator over the shuffled CSV; consumed incrementally by get_minibatch().
doc_stream = stream_docs(path='movie_data.csv')

# Out-of-core training: feed the classifier up to 45 mini-batches of 1000
# documents each, pulled one after another from doc_stream.
import pyprind
pbar = pyprind.ProgBar(45)
# partial_fit is given the full set of class labels explicitly on each call.
classes = np.array([0,1])
for _ in range(45):
    x_train, y_train = get_minibatch(doc_stream, size = 1000)
    # get_minibatch signals an exhausted stream with a falsy docs value.
    if not x_train:
        break
    # x_train is rebound from a list of raw strings to the hashed features.
    x_train = vect.transform(x_train)
    clf.partial_fit(x_train, y_train, classes = classes)
    pbar.update()
# The next 5000 documents from the same stream serve as the evaluation set.
# NOTE(review): if fewer than 5000 documents remain, get_minibatch does not
# return a batch and vect.transform below will fail - confirm the CSV size.
x_test, y_test = get_minibatch(doc_stream, size = 5000)
x_test = vect.transform(x_test)
print('Accuracy: %.3f' % clf.score(x_test, y_test))
# After scoring, the evaluation batch is also used to update the model.
clf = clf.partial_fit(x_test, y_test)


In [None]:
#another fresh section
import pandas as pd
# Reload the review table that cell 1 wrote to disk.
df = pd.read_csv('movie_data.csv', encoding='utf-8')

from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts with English stop words removed, capped at 5000
# features and with max_df set to 0.1.
count = CountVectorizer(
    max_features=5000,
    max_df=0.1,
    stop_words='english',
)
x = count.fit_transform(df['review'].values)
# The class name was misspelled in the import ('Dirichelt'), which raised
# ImportError before the model could be built.
from sklearn.decomposition import LatentDirichletAllocation
# The number of topics is passed as n_components (the older n_topics keyword
# is no longer accepted).
lda = LatentDirichletAllocation(n_components=10,
                                random_state=123,
                                learning_method='batch')
# Fit on the count matrix x; x_topics has one row per document and one
# column per topic.
x_topics = lda.fit_transform(x)
# Print the n_top_words highest-weighted vocabulary terms of every topic.
n_top_words = 5
# get_feature_names_out() replaces the removed get_feature_names().
feature_names = count.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    # The '%' operator was missing, so the string literal was being called
    # like a function (TypeError).
    print("Topic %d" % (topic_idx + 1))
    # argsort is ascending; the reversed slice takes the last n_top_words
    # indices, i.e. the largest weights first.
    top_indices = topic.argsort()[:-n_top_words - 1:-1]
    print(" ".join(feature_names[i] for i in top_indices))


In [None]:
#MUST RUN PREVIOUS CELL FIRST
# Document positions sorted by descending weight of topic column 5.
# NOTE(review): column 5 is assumed to be the horror topic - verify against
# the topic printout of the previous cell.
horror = x_topics[:, 5].argsort()[::-1]
for iter_idx, movie_idx in enumerate(horror[:3]):
    print('\nHorror movie #%d:' % (iter_idx + 1))
    # Was `movie_i`, an undefined name. Rows of x_topics are positional, so
    # the matching review is looked up by position.
    print(df['review'].iloc[movie_idx])