In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


###  Create an nlp pipeline to 'Clean' review data

> Load input files and read reviews

> Tokenize

> Stop word removal

> Building a vocab

> Vectorisation 

>Classification


In [3]:
df = pd.read_csv('Desktop/Rnn challenges/Train.csv')

In [4]:
df.head()

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos


In [5]:
x=df['review'][1]

In [6]:
x

'http://video.google.com/videoplay?docid=211772166650071408&hl=en Distribution was tried.<br /><br />We opted for mass appeal.<br /><br />We want the best possible viewing range so, we forgo profit and continue our manual labor jobs gladly to entertain you for working yours.<br /><br />View Texas tale, please write about it... If you like it or not, if you like Alex or not, if you like Stuie, Texas or Texas tale... Just write about it.<br /><br />Your opinion rules.'

### Nltk

In [7]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import  PorterStemmer

In [8]:
sw = set(stopwords.words('english'))

In [9]:
tokenizer = RegexpTokenizer(r'\w+')

In [10]:
ps = PorterStemmer()

In [11]:
def getStemmedReview(reviews):
    review = reviews.lower()
    review = review.replace('</ br></ br>'," ")
    token = tokenizer.tokenize(review)
    new_tokens = [w for w in token if w not in sw]
    stemmed_tokens = [ps.stem(tok) for tok in new_tokens]
    cleaned = " ".join(stemmed_tokens)
    return cleaned

    

In [12]:
getStemmedReview(x)

'http video googl com videoplay docid 211772166650071408 hl en distribut tri br br opt mass appeal br br want best possibl view rang forgo profit continu manual labor job gladli entertain work br br view texa tale pleas write like like alex like stuie texa texa tale write br br opinion rule'

In [13]:
def getStemmedDocument(inputFile,outputFile):
    out = open(outputFile,'w',encoding='utf8')
    with open(inputFile,encoding='utf8') as f:
        reviews = f.readlines()
    for review in reviews:
        cleaned_review = getStemmedReview(review)
        print((cleaned_review),file=out)
    out.close()
        

In [14]:
x = df['review'].values

In [15]:
print(x.shape)

(40000,)


In [16]:
y = df['label'].values

In [17]:
y

array(['pos', 'pos', 'pos', ..., 'neg', 'pos', 'pos'], dtype=object)

In [20]:
x_test = pd.read_csv('Desktop/Rnn challenges/Test.csv')

In [21]:
x_test =x_test['review'].values

In [22]:
x_test.shape

(10000,)

### 1. Cleaning

In [23]:
def getStemmedReview(reviews):
    review = reviews.lower()
    review = review.replace('</ br></ br>'," ")
    token = tokenizer.tokenize(review)
    new_tokens = [w for w in token if w not in sw]
    stemmed_tokens = [ps.stem(tok) for tok in new_tokens]
    cleaned = " ".join(stemmed_tokens)
    return cleaned

    

In [24]:
x_clean = [getStemmedReview(i) for i in x]

In [25]:
print(x_clean[0])

matur intellig highli charg melodrama unbelivebl film china 1948 wei wei stun perform catylast love triangl simpli stun oppurun see magnific film take


In [103]:
df.shape

(40000, 2)

In [26]:
xt_clean = [getStemmedReview(i) for i in x_test]

In [27]:
print(xt_clean[0])

rememb old kung fu movi use watch friday saturday late night babysitt thought charg well movi play exactli like one movi patsi kensit biggest claim fame love interest mel gibson charact lethal weapon 2 perform one reason never made big terribl actress br br lethal weapon 2 thought cute cute enough check movi includ love music love danc anoth big let obvious impress either attract eye soul scream turn play anoth cheap predict role done badli br br movi kensit star comedienn good one either work club franc cut homeland make ear bleed luck even wors french govern want throw expir visa mayb caught act get marri casanova freiss luck predict begin terribl way give movi neg rate 1 10 star rate


### 2. Vectorization

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
cv = CountVectorizer(ngram_range=(1,2))
x_voc = cv.fit_transform(x_clean)
print(x_voc.shape)

(40000, 2252096)


In [30]:
#Vectorisation on the test set
xt_voc = cv.transform(xt_clean)
print(xt_voc.shape)

(10000, 2252096)


### MultinomialNB

In [31]:
from sklearn.naive_bayes import MultinomialNB

In [32]:
mnb = MultinomialNB()

In [33]:
mnb.fit(x_voc,y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [34]:
mnb.score(x_voc,y)

0.994325

In [35]:
pred = mnb.predict(xt_voc)

In [36]:
pred.shape

(10000,)

In [37]:
pred

array(['neg', 'neg', 'neg', ..., 'pos', 'pos', 'neg'], dtype='<U3')

In [38]:
df1 =pd.DataFrame(pred,columns=['label'])

In [39]:
df1

Unnamed: 0,label
0,neg
1,neg
2,neg
3,pos
4,pos
...,...
9995,neg
9996,pos
9997,pos
9998,pos


In [40]:
df1.to_csv('movie.csv')

In [136]:
df2 = pd.read_csv('movie.csv')

In [137]:
df2

Unnamed: 0.1,Unnamed: 0,label
0,0,neg
1,1,neg
2,2,neg
3,3,pos
4,4,pos
...,...,...
9995,9995,neg
9996,9996,pos
9997,9997,pos
9998,9998,pos


In [145]:
df2.rename(columns={'Unnamed: 0':'Id'},inplace=True) 

In [146]:
df2.head()

Unnamed: 0,Id,label
0,0,neg
1,1,neg
2,2,neg
3,3,pos
4,4,pos


In [149]:
df2.to_csv('movie2.csv',index=False)

### Bernoulli Nb

In [151]:
from sklearn.naive_bayes import BernoulliNB

In [152]:
bnb = BernoulliNB()

In [154]:
bnb.fit(x_voc,y)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [156]:
bnb.score(x_voc,y)

0.99275

In [158]:
pred1=bnb.predict(xt_voc)

In [159]:
df3 =pd.DataFrame(pred,columns=['label'])

In [160]:
df3.to_csv('movie3.csv')

In [161]:
df4 = pd.read_csv('movie3.csv')

In [162]:
df4.head()

Unnamed: 0.1,Unnamed: 0,label
0,0,neg
1,1,neg
2,2,neg
3,3,pos
4,4,pos


In [163]:
df4.rename(columns={'Unnamed: 0':'Id'},inplace=True) 

In [164]:
df4.head()

Unnamed: 0,Id,label
0,0,neg
1,1,neg
2,2,neg
3,3,pos
4,4,pos


In [165]:
df4.to_csv('movie4.csv',index=False)

In [175]:
#tfidf vectoriser
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(ngram_range=(1,2))

In [176]:
x_vec = tfidf.fit_transform(x_clean)

In [177]:
x_vec.shape

(40000, 2252096)

In [178]:
xt_vec = tfidf.transform(xt_clean)

In [179]:
xt_vec.shape

(10000, 2252096)

In [180]:
mnb1 = MultinomialNB()

In [181]:
mnb1.fit(x_vec,y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [182]:
mnb1.score(x_vec,y)

0.9798

In [256]:
sent = ["#Coding-Blocks is a great source to learn @machine_learning."]

In [261]:
def remove(s):
    
    token = tokenizer.tokenize(s.lower())
    new_tokens = [w for w in token if w not in sw]
    return new_tokens

In [262]:
d =remove(sent)

AttributeError: 'list' object has no attribute 'lower'

In [200]:
print(d)

['coding', 'blocks', 'great', 'source', 'learn', 'machine_learning']


In [205]:
cv = CountVectorizer(tokenizer=remove,ngram_range=(3,3))


In [206]:
x = cv.fit_transform(d).toarray()

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [209]:
def removestop(text,stopwords):
    useful_words = [w for w in text if w not in stopwords]
    return useful_words

In [210]:
def myTokenizer(doc):
    words = tokenizer.tokenize(doc.lower())
    
    #Remove stopwords
    words = removestop(words,sw)
    return words

In [211]:
word = myTokenizer(sent)

In [212]:
word

['coding', 'blocks', 'great', 'source', 'learn', 'machine_learning']

In [263]:
cv = CountVectorizer(tokenizer=myTokenizer,ngram_range=(3,3))

In [265]:
x = cv.fit_transform(sent).toarray()

In [268]:
cv.vocabulary_

{'coding blocks great': 1,
 'blocks great source': 0,
 'great source learn': 2,
 'source learn machine_learning': 3}

In [215]:
from nltk import word_tokenize
sent = "Hey! Welcome to Coding Blocks ?."
words = set(word_tokenize(sent))


In [216]:
words


{'!', '.', '?', 'Blocks', 'Coding', 'Hey', 'Welcome', 'to'}

In [218]:
ps.stem('quickly')

'quickli'

In [228]:
h = 'hello ,kishurai0.5@gmail.com.in'

In [229]:
token = RegexpTokenizer('[a-zA-Z0-9._]+@[a-zA-Z]+[.][a-zA-Z]+')

In [230]:
token.tokenize(h)

['kishurai0.5@gmail.com']

In [251]:
cv = CountVectorizer(tokenizer=remove,ngram_range=(2,2))

In [252]:
x = cv.fit_transform(sebt).toarray()

In [240]:
sebt =[ "Coding Blocks is a great source to learn machine learning"]

In [237]:
w = myTokenizer(sebt)

In [239]:
w

['coding', 'blocks', 'great', 'source', 'learn', 'machine', 'learning']

In [253]:
x

array([[1, 1, 1, 1, 1, 1]], dtype=int64)

In [254]:
x.shape

(1, 6)

In [255]:
cv.vocabulary_

{'coding blocks': 1,
 'blocks great': 0,
 'great source': 2,
 'source learn': 5,
 'learn machine': 3,
 'machine learning': 4}