In [1]:
# usual imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib notebook
from sklearn.cross_validation import train_test_split
# Each is a different implemntation of a text transform tool: Bag of Words & Tfidf
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

#### Let's first play with Yelp data. Earlier, we performed sentiment analysis on this dataset and were able to achieve 80% accuracy using Random Forest.  Let's check and see if we can beat that with our new tools! For this practice project you shall refer to our earlier codes i.e. [notebook 1](https://github.com/ga-students/SF-DAT-20/blob/master/Code/Lecture13.ipynb) and [notebook 2](https://github.com/ga-students/SF-DAT-20/blob/master/Code/Lecture13-Practice-Solution.ipynb)

In [4]:
# let's load data and put it in a dataframe
rows = []
with open('/Users/benstan/Desktop/GA-DS/SF-DAT-20-MASTER/Data/yelp_labelled.txt') as f:
    for i,line in enumerate(f.readlines()):
        row = (line.split('\n')[0]).split('\t')
        if row[1] == '':
            row[1] = np.nan
        else:
            row[1] = int(row[1])
        rows.append(row)

In [5]:
Yelp_data = pd.DataFrame(rows,columns=['text','sentiment'])
Yelp_data.dropna(inplace = True)
Yelp_data.head()

Unnamed: 0,text,sentiment
0,Wow... Loved this place.,1
3,Crust is not good.,0
4,Not tasty and the texture was just nasty.,0
10,Stopped by during the late May bank holiday of...,1
11,The selection on the menu was great and so wer...,1


#### Split data to 80% training and 20% test set. 

In [6]:
from sklearn.cross_validation import train_test_split
X = Yelp_data.text.copy()
y = Yelp_data.sentiment.copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .25)

#### Here are few libararies we do need from here on

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split

#### Use Pipeline and define CountVectorizer() as 'vect' and MultiNomial Naive Bayes as your 'clf' - classifier. Then set your parameters to

'vect__min_df':[1,2,3,5,10], 

'vect__max_df':[50,100,150,200,500,1000,1200], 

'clf__alpha':[0,0.1,0.2,0.5,.8,1]


In [15]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('clf', MultinomialNB())])

In [16]:
parameters = {'vect__min_df':[1,2,3,5,10],
              'vect__max_df':[50,100,150,200,500,1000,1200],
              'clf__alpha':[0,0.1,0.2,0.5,.8,1]}

#### Using GridSearchCV find the best parameters and use it to calculate test error. Did you beat Random Forest?

In [17]:
gs_clf = GridSearchCV(text_clf, parameters)

In [18]:
fit_grid = gs_clf.fit(X_train,y_train)

In [19]:
fit_grid.score(X_test,y_test)

0.80400000000000005

#### Use Pipeline and define CountVectorizer() as 'vect' and Bernoulli Naive Bayes as your 'clf' - classifier. Then set your parameters to

'vect__min_df':[1,2,3,5,10], 

'vect__max_df':[50,100,150,200,500,1000,1200], 

'clf__alpha':[0,0.1,0.2,0.5,.8,1]


In [20]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('clf', BernoulliNB())])

In [21]:
parameters = {'vect__min_df':[1,2,3,5,10],
              'vect__max_df':[50,100,150,200,500,1000,1200],
              'clf__alpha':[0,0.1,0.2,0.5,.8,1]}

#### Using GridSearchCV find the best parameters and use it to calculate test error. Did you beat Random Forest?


In [22]:
gs_clf = GridSearchCV(text_clf, parameters)

In [23]:
fit_grid = gs_clf.fit(X_train,y_train)

In [28]:
fit_grid.score(X_test,y_test)

0.78800000000000003

#### What parameters are chosen by GridSearchCV?

In [25]:
fit_grid.best_params_

{'clf__alpha': 0.5, 'vect__max_df': 200, 'vect__min_df': 1}

#### Now it's time for a new dataset! Let's play with SMS dataset. We would like to develop a model by which filter spam/ham text messages. Let's explore this dataset first.

In [29]:
url = "https://raw.githubusercontent.com/ga-students/SF-DAT-20/master/Data/SMSSpamCollection.tsv"
col_names = ['label', 'message']
smsData = pd.read_csv(url, sep='\t', header = 0,names=col_names)
smsData.head(5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [30]:
smsData.shape

(5572, 2)

#### Repeat the procedure you applied on Yelp data on SMS data. Can you get better results by using Bernoulli Naive Bayes or MultiNomial Naive Bayes? What is the best score on test set using best tuning parameters?

In [32]:
X = smsData.message.copy()
y = smsData.label.copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .25)

In [33]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('clf', MultinomialNB())])

In [34]:
parameters = {'vect__min_df':[1,2,3,5,10],
              'vect__max_df':[50,100,150,200,500,1000,1200],
              'clf__alpha':[0,0.1,0.2,0.5,.8,1]}

In [35]:
gs_clf = GridSearchCV(text_clf, parameters)

In [36]:
fit_grid = gs_clf.fit(X_train,y_train)

In [37]:
fit_grid.score(X_test,y_test)

0.98277099784637478

Accuracy for multinomial is 98.3%

In [38]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('clf', BernoulliNB())])

In [39]:
parameters = {'vect__min_df':[1,2,3,5,10],
              'vect__max_df':[50,100,150,200,500,1000,1200],
              'clf__alpha':[0,0.1,0.2,0.5,.8,1]}

In [40]:
gs_clf = GridSearchCV(text_clf, parameters)

In [41]:
fit_grid = gs_clf.fit(X_train,y_train)

In [42]:
fit_grid.score(X_test,y_test)

0.98707824838478109

Accuracy for Bernoulli is 98.7%

Answer: Bernoulli was slightly better in this case

#### Print out misclassified instances in your test set. 

In [43]:
X_test[fit_grid.predict(X_test) != y_test]

5449    Latest News! Police station toilet stolen, cop...
3422    Welcome! Please reply with your AGE and GENDER...
2823    ROMCAPspam Everyone around should be respondin...
788     Ever thought about living a good life with a p...
5046    We have sent JD for Customer Service cum Accou...
68      Did you hear about the new "Divorce Barbie"? I...
4527    I want some cock! My hubby's away, I need a re...
1663    Hi if ur lookin 4 saucy daytime fun wiv busty ...
4144    In The Simpsons Movie released in July 2007 na...
5427    Santa Calling! Would your little ones like a c...
2247    Hi ya babe x u 4goten bout me?' scammers getti...
2248    Back 2 work 2morro half term over! Can U C me ...
1458    CLAIRE here am havin borin time & am now alone...
3460    Not heard from U4 a while. Call me now am here...
1638    0A$NETWORKS allow companies to bill for SMS, s...
869     Hello. We need some posh birds and chaps to us...
4249    accordingly. I repeat, just text the word ok o...
5370    dating