# Importing various libraries

In [6]:
%matplotlib inline
import pandas as pd
import numpy as np
from textblob import TextBlob
import matplotlib as mpl
import matplotlib.pyplot as plt
import csv
import _pickle as cPickle
from scipy.io import loadmat
from sklearn.svm import SVC
import seaborn as sns
sns.set_context('notebook')
sns.set_style('white')

import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer ,TfidfVectorizer,TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import learning_curve
from sklearn.naive_bayes import MultinomialNB


In [7]:
df=pd.read_csv("Training.txt",sep="\t", names=['liked','text'],encoding="utf-8");
df.head(3)

Unnamed: 0,liked,text
0,1,India is developing countries
1,1,The Da Vinci Code book is just awesome.
2,1,this was the first clive cussler i've ever rea...


In [8]:
print(len(df))

6931


Total no of reviews.

In [9]:
df.groupby('liked').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
liked,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,2975,559,I hate Harry Potter.,85
1,3956,744,I love Harry Potter.,167


# Data preprocessing

In [10]:
def tokens(review):
    return TextBlob(review).words

In [11]:
df.head().text.apply(tokens)

0                   [India, is, developing, countries]
1      [The, Da, Vinci, Code, book, is, just, awesome]
2    [this, was, the, first, clive, cussler, i, 've...
3             [i, liked, the, Da, Vinci, Code, a, lot]
4             [i, liked, the, Da, Vinci, Code, a, lot]
Name: text, dtype: object

Function tokens() is created to parse data/review into words.

In [13]:
blob = TextBlob("ready was not a good movie")
#nltk.help.upenn_tagset('JJ')

In [14]:
def to_lemmas(review):
    wordss = TextBlob(review.lower()).words
    # for each word, take its "base form" = lemma 
    return [word.lemma for word in wordss]

df.text.head().apply(to_lemmas)

0                     [india, is, developing, country]
1      [the, da, vinci, code, book, is, just, awesome]
2    [this, wa, the, first, clive, cussler, i, 've,...
3             [i, liked, the, da, vinci, code, a, lot]
4             [i, liked, the, da, vinci, code, a, lot]
Name: text, dtype: object

In [15]:
from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()
lmtzr.lemmatize('octopi')
#nltk

'octopus'

# Converting text data into vectors 

In [16]:
bow_transformer = CountVectorizer(analyzer=to_lemmas).fit(df['text'])
print(len(bow_transformer.vocabulary_))

2114


In [17]:
review1=df['text'][3]
print(review1)
#to check 3rd document/review in collection/database

i liked the Da Vinci Code a lot.


In [18]:
bow=bow_transformer.transform([review1])
print(bow)
bow.shape

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 8 stored elements and shape (1, 2114)>
  Coords	Values
  (0, 42)	1
  (0, 369)	1
  (0, 458)	1
  (0, 950)	1
  (0, 1123)	1
  (0, 1152)	1
  (0, 1838)	1
  (0, 1977)	1


(1, 2114)

In [20]:
print(bow_transformer.get_feature_names_out()[372])

code-other


In [21]:
review_bow = bow_transformer.transform(df['text'])
print( 'sparse matrix shape:', review_bow.shape)
print('number of non-zeros:', review_bow.nnz) #learn this
print( 'sparsity: %.2f%%' % (100.0 * review_bow.nnz))

sparse matrix shape: (6931, 2114)
number of non-zeros: 71297
sparsity: 7129700.00%


# Tf-idf Vectorizer

TF (Term Frequency) berarti frekuensi kemunculan suatu kata dalam sebuah dokumen, sedangkan TF-IDF (Term Frequency-Inverse Document Frequency) adalah hasil perkalian antara frekuensi kata tersebut dengan nilai kebalikannya terhadap jumlah dokumen yang mengandung kata tersebut.

Tujuan dari penggunaan TF-IDF dibandingkan dengan hanya menggunakan CountVectorizer adalah untuk mengurangi pengaruh kata-kata yang terlalu sering muncul dalam keseluruhan korpus (yang biasanya kurang informatif) dan meningkatkan bobot kata-kata yang jarang muncul namun lebih bermakna secara kontekstual.

In [22]:
tfidf_transformer =TfidfTransformer().fit(review_bow)
review_tfidf = tfidf_transformer.transform(review_bow)
review_tfidf.shape

(6931, 2114)

CountVectorizer menghasilkan output berupa frekuensi dari berbagai kata dalam korpus kita. Hasil ini kemudian diteruskan ke metode transform dari TfidfTransformer.

Metode ini berfungsi untuk mengubah matriks frekuensi kata (count matrix) menjadi representasi yang sudah dinormalisasi dalam bentuk TF atau TF-IDF.

In [24]:
text_train, text_test, liked_train, liked_test = train_test_split(df['text'], df['liked'], test_size=0.2)
print(len(text_train), len(text_test), len(text_train) , len(text_test))


5544 1387 5544 1387


Dataset yang telah diunduh kemudian dibagi menjadi data pelatihan dan data pengujian dengan rasio 80:20.

In [25]:
pipeline_svm = Pipeline([
    ('bow', CountVectorizer(analyzer=to_lemmas)),
    ('tfidf', TfidfTransformer()),
    ('classifier', SVC()),
])

Pipeline secara berurutan menerapkan serangkaian transformasi dan estimator akhir.
Langkah-langkah di tengah pipeline harus berupa transformasi, yaitu harus memiliki metode fit dan transform.
Sementara estimator akhir hanya perlu memiliki metode fit.

Tujuan dari pipeline adalah untuk menggabungkan beberapa tahapan pemrosesan data dan pelatihan model ke dalam satu kesatuan alur kerja, yang juga dapat divalidasi silang (cross-validation) secara bersamaan sambil menyetel berbagai parameter pada tiap tahapannya.

In [26]:
# pipeline parameters to automatically explore and tune
param_svm = [
  {'classifier__C': [1, 10, 100, 1000], 'classifier__kernel': ['linear']},
  {'classifier__C': [1, 10, 100, 1000], 'classifier__gamma': [0.001, 0.0001], 'classifier__kernel': ['rbf']},
]


SVM

In [28]:
grid_svm = GridSearchCV(
    pipeline_svm,
    param_grid=param_svm,
    refit=True,
    n_jobs=-1,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=5),
)

Pencarian menyeluruh (exhaustive search) dilakukan terhadap nilai-nilai parameter tertentu untuk suatu estimator.

CV adalah singkatan dari cross-validation (validasi silang).
Mempelajari parameter dari sebuah fungsi prediksi dan mengujinya pada data yang sama merupakan kesalahan metodologis, karena hal itu akan selalu menghasilkan akurasi 100%. Oleh karena itu, data pelatihan dan data pengujian harus dibedakan.

In [30]:
for mean_score, params in zip(classifier.cv_results_['mean_test_score'], classifier.cv_results_['params']):
    print(f"{mean_score:.3f} for {params}")


0.991 for {'classifier__C': 1, 'classifier__kernel': 'linear'}
0.992 for {'classifier__C': 10, 'classifier__kernel': 'linear'}
0.992 for {'classifier__C': 100, 'classifier__kernel': 'linear'}
0.992 for {'classifier__C': 1000, 'classifier__kernel': 'linear'}
0.568 for {'classifier__C': 1, 'classifier__gamma': 0.001, 'classifier__kernel': 'rbf'}
0.568 for {'classifier__C': 1, 'classifier__gamma': 0.0001, 'classifier__kernel': 'rbf'}
0.973 for {'classifier__C': 10, 'classifier__gamma': 0.001, 'classifier__kernel': 'rbf'}
0.568 for {'classifier__C': 10, 'classifier__gamma': 0.0001, 'classifier__kernel': 'rbf'}
0.988 for {'classifier__C': 100, 'classifier__gamma': 0.001, 'classifier__kernel': 'rbf'}
0.973 for {'classifier__C': 100, 'classifier__gamma': 0.0001, 'classifier__kernel': 'rbf'}
0.992 for {'classifier__C': 1000, 'classifier__gamma': 0.001, 'classifier__kernel': 'rbf'}
0.988 for {'classifier__C': 1000, 'classifier__gamma': 0.0001, 'classifier__kernel': 'rbf'}


In [31]:
print(classification_report(liked_test, classifier.predict(text_test)))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       579
           1       0.99      1.00      0.99       808

    accuracy                           0.99      1387
   macro avg       0.99      0.99      0.99      1387
weighted avg       0.99      0.99      0.99      1387



In [32]:
print(classifier.predict(["the vinci code is awesome"])[0])

1


In [33]:
print(classifier.predict(["the vinci code is bad"])[0])

0


In [34]:
def gaussKernel(x1, x2, sigma):
    ss=np.power(sigma,2)
    norm= (x1-x2).T.dot(x1-x2)
    return np.exp(-norm/(2*ss))
x1 = np.array([1, 2, 1])
x2 = np.array([0, 4, -1])
sigma = 2
gaussKernel(x1,x2,sigma)

0.32465246735834974