In [19]:
%matplotlib inline
import pandas as pd
import numpy as np
from textblob import TextBlob
import matplotlib as mpl
import matplotlib.pyplot as plt
import csv
import _pickle as cPickle
from scipy.io import loadmat
from sklearn.svm import SVC
import seaborn as sns
sns.set_context('notebook')
sns.set_style('white')

import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import (
    train_test_split, 
    StratifiedKFold, 
    cross_val_score, 
    GridSearchCV,
    learning_curve
)
from sklearn.metrics import classification_report, f1_score, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier 
from sklearn.naive_bayes import MultinomialNB


In [20]:
df=pd.read_csv("Training.txt",sep="\t", names=['liked','text'],encoding="utf-8");
df.head(3)

Unnamed: 0,liked,text
0,1,India is developing countries
1,1,The Da Vinci Code book is just awesome.
2,1,this was the first clive cussler i've ever rea...


In [21]:
print(len(df))

6931


In [22]:
df.groupby('liked').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
liked,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,2975,559,I hate Harry Potter.,85
1,3956,744,I love Harry Potter.,167


In [23]:
def tokens(review):
    return TextBlob(review).words

In [24]:
df.head().text.apply(tokens)

0                   [India, is, developing, countries]
1      [The, Da, Vinci, Code, book, is, just, awesome]
2    [this, was, the, first, clive, cussler, i, 've...
3             [i, liked, the, Da, Vinci, Code, a, lot]
4             [i, liked, the, Da, Vinci, Code, a, lot]
Name: text, dtype: object

In [25]:
TextBlob("ready was not a good movie").tags
#nltk.help.upenn_tagset('JJ')

[('ready', 'NN'),
 ('was', 'VBD'),
 ('not', 'RB'),
 ('a', 'DT'),
 ('good', 'JJ'),
 ('movie', 'NN')]

In [26]:
def to_lemmas(review):
    wordss = TextBlob(review.lower()).words
    # for each word, take its "base form" = lemma 
    return [word.lemma for word in wordss]

df.text.head().apply(to_lemmas)

0                     [india, is, developing, country]
1      [the, da, vinci, code, book, is, just, awesome]
2    [this, wa, the, first, clive, cussler, i, 've,...
3             [i, liked, the, da, vinci, code, a, lot]
4             [i, liked, the, da, vinci, code, a, lot]
Name: text, dtype: object

In [27]:
from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()
lmtzr.lemmatize('octopi')

'octopus'

In [28]:
bow_transformer = CountVectorizer(analyzer=to_lemmas).fit(df['text'])
print(len(bow_transformer.vocabulary_))

2114


In [29]:
review1=df['text'][3]
print(review1)
#to check 3rd document/review in collection/database

i liked the Da Vinci Code a lot.


In [30]:
bow=bow_transformer.transform([review1])
print(bow)
bow.shape

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 8 stored elements and shape (1, 2114)>
  Coords	Values
  (0, 42)	1
  (0, 369)	1
  (0, 458)	1
  (0, 950)	1
  (0, 1123)	1
  (0, 1152)	1
  (0, 1838)	1
  (0, 1977)	1


(1, 2114)

In [34]:
print(bow_transformer.get_feature_names()[372])
#to check 372nd word in collection

AttributeError: 'CountVectorizer' object has no attribute 'get_feature_names'

In [35]:
review_bow = bow_transformer.transform(df['text'])
print( 'sparse matrix shape:', review_bow.shape)
print('number of non-zeros:', review_bow.nnz) #learn this
print( 'sparsity: %.2f%%' % (100.0 * review_bow.nnz))

sparse matrix shape: (6931, 2114)
number of non-zeros: 71297
sparsity: 7129700.00%


In [36]:
tfidf_transformer =TfidfTransformer().fit(review_bow)
review_tfidf = tfidf_transformer.transform(review_bow)
review_tfidf.shape

(6931, 2114)

In [37]:
text_train, text_test, liked_train, liked_test = train_test_split(df['text'], df['liked'], test_size=0.2)
print(len(text_train), len(text_test), len(text_train) , len(text_test))

5544 1387 5544 1387


In [38]:
pipeline_svm = Pipeline([
    ('bow', CountVectorizer(analyzer=to_lemmas)),
    ('tfidf', TfidfTransformer()),
    ('classifier', SVC()),
])

In [39]:
# pipeline parameters to automatically explore and tune
param_svm = [
  {'classifier__C': [1, 10, 100, 1000], 'classifier__kernel': ['linear']},
  {'classifier__C': [1, 10, 100, 1000], 'classifier__gamma': [0.001, 0.0001], 'classifier__kernel': ['rbf']},
]

In [40]:
grid_svm = GridSearchCV(
    pipeline_svm, #object used to fit the data
    param_grid=param_svm, 
    refit=True,  # fit using all data, on the best detected classifier
    n_jobs=-1,  # number of cores to use for parallelization; -1 for "all cores" i.e. to run on all CPUs
    scoring='accuracy',#optimizing parameter
    cv=StratifiedKFold(liked_train, n_folds=5),
)

TypeError: StratifiedKFold.__init__() got an unexpected keyword argument 'n_folds'

In [41]:
%time classifier = grid_svm.fit(text_train, liked_train) # find the best combination from param_svm
print(classifier.grid_scores_)

NameError: name 'grid_svm' is not defined

NameError: name 'classifier' is not defined

In [42]:

print(classification_report(liked_test, classifier.predict(text_test)))

NameError: name 'classifier' is not defined

In [None]:
print(classifier.predict(["the vinci code is awesome"])[0])

In [None]:
print(classifier.predict(["the vinci code is bad"])[0])

In [None]:
def gaussKernel(x1, x2, sigma):
    ss=np.power(sigma,2)
    norm= (x1-x2).T.dot(x1-x2)
    return np.exp(-norm/(2*ss))
x1 = np.array([1, 2, 1])
x2 = np.array([0, 4, -1])
sigma = 2
gaussKernel(x1,x2,sigma)