In [1]:
import sys
sys.path.append('/home/krish.mahajan/Documents/PythonMachineLearning/MLbook') 
%matplotlib notebook

## Extracting the  tar file

In [None]:
import tarfile 
with tarfile.open('/home/krish.mahajan/Documents/PythonMachineLearning/MLbook/data/aclImdb_v1.tar.gz','r:gz') as tar :
    tar.extractall()

In [2]:
import pyprind 
import pandas as pd 
import os 

In [None]:
basepath = '/home/krish.mahajan/Documents/PythonMachineLearning/MLbook/data//aclImdb'

## Preparing the dataset

In [None]:
lables = {'pos':1,'neg':0} 
pbar = pyprind.ProgBar(50000)
df = pd.DataFrame()
for s in ('test','train'): 
    for l in ('pos','neg'):
        path = os.path.join(basepath,s,l) 
        for file in os.listdir(path):
            with open(os.path.join(path,file),'r',encoding='utf-8') as infile:
                txt = infile.read() 
                df = df.append([[txt,lables[l]]] ,ignore_index = True)
                pbar.update() 
                
df.columns = ['review','sentiment']

## Export the CSV

In [None]:
import numpy as np 
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('/home/krish.mahajan/Documents/PythonMachineLearning/MLbook/data//aclImdb/movie_data.csv',index=False,encoding='utf-8')

In [3]:
df = pd.read_csv('/home/krish.mahajan/Documents/PythonMachineLearning/MLbook/data//aclImdb/movie_data.csv',encoding='utf-8')

In [4]:
df.head(3)

Unnamed: 0,review,sentiment
0,This was the very first kung fu movie that I h...,1
1,This was only the second version of the classi...,0
2,Maiden Voyage is just that. I'd like to say st...,0


## Transforming words into feature vectors

In [5]:
# Term FRequenxy 

import numpy as np 
from sklearn.feature_extraction.text import CountVectorizer 
count = CountVectorizer() 
docs = np.array(['The sun is shinning','The weather is sweet','The sun is shining and the weather is sweet'])
bag = count.fit_transform(docs)

In [6]:
print(count.vocabulary_)

{'weather': 7, 'sweet': 5, 'shinning': 3, 'sun': 4, 'shining': 2, 'the': 6, 'and': 0, 'is': 1}


In [7]:
print(bag.toarray())

[[0 1 0 1 1 0 1 0]
 [0 1 0 0 0 1 1 1]
 [1 2 1 0 1 1 2 1]]


In [8]:
# Term Frequency # Inverse Document Frequnecy 
from sklearn.feature_extraction.text import TfidfTransformer 
tfidf = TfidfTransformer(use_idf=True,norm='l2',smooth_idf=True)
np.set_printoptions(precision=2)
print((tfidf.fit_transform(bag)).toarray()) 

[[ 0.    0.39  0.    0.66  0.5   0.    0.39  0.  ]
 [ 0.    0.43  0.    0.    0.    0.56  0.43  0.56]
 [ 0.39  0.46  0.39  0.    0.3   0.3   0.46  0.3 ]]


## Cleaning text data

In [9]:
df.loc[0,'review'][-50:]

'hts and a decent story backround, this is for you!'

In [10]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    return text

In [11]:
preprocessor(df.loc[0,'review'[-50:]])

'this was the very first kung fu movie that i have ever seen the dubbing is not the greatest but alot better than some that i had seen the plot is much better than some that are made today it is gory at times but that is what gives it that special push academy award material is it not but if you like to watch fights and a decent story backround this is for you '

In [12]:
df.shape

(50000, 2)

In [13]:
## Applying the preprocessor function on whole document 
df['review'] = df['review'].apply(preprocessor)

# Stemming & Tokenization

In [14]:
from nltk.stem.porter import PorterStemmer

In [15]:
porter = PorterStemmer()

In [17]:
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [19]:
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

# Stopwords

In [21]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/krish.mahajan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [22]:
from nltk.corpus import stopwords 
stop = stopwords.words('english') 
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [29]:
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

def tokenizer(text):
    return text.split()


def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]


# Training a logistic regression model for document classification

In [24]:
X_train = df.loc[:25000,'review'].values 
y_train = df.loc[:25000,'sentiment'].values 
X_test = df.loc[25000:,'review'].values 
y_test = df.loc[25000:,'sentiment'].values

In [27]:
### GridSearchCV object to find the optimal set of parameters for LogisticRegression using 5-fold stratified cross-validation

from sklearn.model_selection  import GridSearchCV 
from sklearn.pipeline import Pipeline 
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_extraction.text import TfidfVectorizer 


In [32]:
## creating the pipeline

tfidf = TfidfVectorizer(strip_accents = None , lowercase = False, preprocessor = None) 
param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]

lr_tfidf = Pipeline([('vect',tfidf) , ('clf',LogisticRegression(random_state=0))]) 

gs_lr_tfidf = GridSearchCV(lr_tfidf,param_grid,scoring ='accuracy',cv=5,verbose=1,n_jobs =-1)

In [33]:
## Fitting the model 

gs_lr_tfidf.fit(X_train,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


KeyboardInterrupt: 

In [34]:
df.shape

(50000, 2)

In [44]:
df.groupby('sentiment')['sentiment'].count()

sentiment
0    25000
1    25000
Name: sentiment, dtype: int64

## Working with bigger data- online algorithms and out of core learning

In [45]:
import numpy as np 
import re 
from nltk.corpus import stopwords 

In [46]:
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

In [52]:
## generator function

def stream_docs(path):
    with open(path,'r',encoding='utf-8') as csv:
        next(csv) 
        for line in csv:
            text,label = line[:-3],int(line[-2]) 
            yield text ,label

In [51]:
next(stream_docs(path='/home/krish.mahajan/Documents/PythonMachineLearning/MLbook/data//aclImdb/movie_data.csv'))

"This was the very first kung fu movie that I have ever seen. The dubbing is not the greatest but alot better than some that I had seen. The plot is much better than some that are made today. It is gory at times but that is what gives it that special push. Academy award material is it not. But if you like to watch fights and a decent story backround, this is for you!",1



('"This was the very first kung fu movie that I have ever seen. The dubbing is not the greatest but alot better than some that I had seen. The plot is much better than some that are made today. It is gory at times but that is what gives it that special push. Academy award material is it not. But if you like to watch fights and a decent story backround, this is for you!"',
 1)

In [53]:
## minibatch function 

def get_minibatch(doc_stream,size):
    docs , y = [] , [] 
    try:
        for _ in range(size):
            text,label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None,None 
    return docs,y

In [54]:
from sklearn.feature_extraction.text import HashingVectorizer 
from sklearn.linear_model import SGDClassifier 


In [70]:
vect = HashingVectorizer(decode_error='ignore' ,n_features=2**21,preprocessor=None,tokenizer=tokenizer)
clf = SGDClassifier(loss='log',random_state=1,n_iter=1)
doc_stream = stream_docs(path='/home/krish.mahajan/Documents/PythonMachineLearning/MLbook/data//aclImdb/movie_data.csv')

In [66]:
import warnings
warnings.filterwarnings("ignore")

In [71]:
import pyprind
pbar = pyprind.ProgBar(15)
classes = np.array([0,1])

for _ in range(25):
    X_train , y_train = get_minibatch(doc_stream,size=1000) 
    if not X_train:  
        print("wrong")
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train,y_train,classes=classes) 
    pbar.update() 

0% [################              ] 100% | ETA: 00:00:18

In [72]:
## Testing accuracy on the remaining document 

X_test,y_test = get_minibatch(doc_stream,size=25000) 
X_test = vect.transform(X_test)
print('Accuracy: %.3f' %clf.score(X_test,y_test))

Accuracy: 0.872


In [69]:
## Fitting on the remaining data
clf = clf.partial_fit(X_test,y_test)

## Topic modeling with Latent Dirichlet Allocation

In [73]:
df = pd.read_csv('/home/krish.mahajan/Documents/PythonMachineLearning/MLbook/data//aclImdb/movie_data.csv',encoding='utf-8')

In [74]:
from sklearn.feature_extraction.text import CountVectorizer 
count = CountVectorizer(stop_words='english',max_df =.1,max_features=5000)
X = count.fit_transform(df['review'].values)

In [75]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_topics=10,random_state=123,learning_method='batch')
X_topics = lda.fit_transform(X)

In [76]:
lda.components_.shape

(10, 5000)

In [82]:
n_top_words = 5 
feature_names = count.get_feature_names() 
for topic_idx,topic in enumerate(lda.components_):
    print("Topic %d:" % (topic_idx+1)) 
    print(" ".join([feature_names[i] for i in topic.argsort()[ :-n_top_words-1:-1]]))

Topic 1:
worst minutes awful script stupid
Topic 2:
family mother father children girl
Topic 3:
american war dvd music tv
Topic 4:
human audience cinema art sense
Topic 5:
police guy car dead murder
Topic 6:
horror house sex woman girl
Topic 7:
role performance comedy actor plays
Topic 8:
series episode episodes tv season
Topic 9:
book version original effects fi
Topic 10:
action guy fight guys cool


In [84]:
horror = X_topics[:,5].argsort()[::-1]
for iter_idx , movie_idx in enumerate(horror[:3]):
    print('\nHorror movie #%d:' %( iter_idx + 1))
    print(df['review'][movie_idx][:300],'...')
    


Horror movie #1:
Once upon a time in a castle...... Two little girls are playing in the garden's castle. They are sisters. A blonde little girl (Kitty) and a brunette one (Evelyn). Evelyn steals Kitty's doll. Kitty pursues Evelyn. Running through long corridors, they reach the room where their grandfather, sitting o ...

Horror movie #2:
<br /><br />Horror movie time, Japanese style. Uzumaki/Spiral was a total freakfest from start to finish. A fun freakfest at that, but at times it was a tad too reliant on kitsch rather than the horror. The story is difficult to summarize succinctly: a carefree, normal teenage girl starts coming fac ...

Horror movie #3:
Before I talk about the ending of this film I will talk about the plot. Some dude named Gerald breaks his engagement to Kitty and runs off to Craven Castle in Scotland. After several months Kitty and her aunt venture off to Scottland. Arriving at Craven Castle Kitty finds that Gerald has aged and he ...
