In [122]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer, HashingVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score,confusion_matrix

In [92]:
data = pd.read_csv('dataset/train.csv')

In [93]:
data.shape

(38932, 5)

In [94]:
data.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy


In [95]:
data.tail()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
38927,id49253,We arrived late at night and walked in to a ch...,Edge,Desktop,happy
38928,id49254,The only positive impression is location and p...,InternetExplorer,Mobile,not happy
38929,id49255,Traveling with friends for shopping and a show...,Firefox,Mobile,not happy
38930,id49256,The experience was just ok. We paid extra for ...,Chrome,Desktop,not happy
38931,id49257,The Westin is a wonderfully restored grande da...,Mozilla,Desktop,happy


In [96]:
data.sample(2)

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
18342,id28668,My girlfriend and I stayed at the BW Pioneer S...,Internet Explorer,Mobile,happy
31871,id42197,Do not stay in this hotel. The room I stayed i...,Edge,Desktop,not happy


In [97]:
data.describe()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
count,38932,38932,38932,38932,38932
unique,38932,38932,11,3,2
top,id39850,Convenient location next to BWI with complimen...,Firefox,Desktop,happy
freq,1,1,7367,15026,26521


In [98]:
data.columns

Index(['User_ID', 'Description', 'Browser_Used', 'Device_Used', 'Is_Response'], dtype='object')

In [99]:
data['Is_Response'].value_counts()

happy        26521
not happy    12411
Name: Is_Response, dtype: int64

In [100]:
data['Is_Response'] = data['Is_Response'].map({'happy' : 'baik', 'not happy' : 'buruk'})

In [101]:
data['Is_Response'].value_counts()

baik     26521
buruk    12411
Name: Is_Response, dtype: int64

In [102]:
X=data.Description
y= data.Is_Response

In [103]:
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size=0.1, random_state=225)

In [104]:
print('Banyak data x_train :',len(x_train))
print('Banyak data x_test  :',len(x_test))
print('Banyak data y_train :',len(y_train))
print('Banyak data y_test  :',len(y_test))

Banyak data x_train : 35038
Banyak data x_test  : 3894
Banyak data y_train : 35038
Banyak data y_test  : 3894


In [105]:
tok = WordPunctTokenizer()
pat1 = r'@[A-Za-z0-9_]+'
pat2 = r'https?://[^ ]+'
combine_pat = r'|'.join((pat1,pat2))
www_pat = r'www.[^ ]+'


stopwrod = ['up','with','to','the','of','in','is']
stopword_user = set(stopwrod)

In [106]:
def proses_teks(teks):
    soup = BeautifulSoup(teks, 'lxml')
    souped = soup.get_text()
    try:
        teks = souped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        teks = souped
    teks_bersih= re.sub("[^a-zA-Z0-9]", " ",(re.sub(www_pat, '', re.sub(combined_pat, '', teks)).lower()))
    teks_bersih= ' '.join([word for word in teks_bersih.split() if word not in stopword_user])
    return (" ".join([x for x in tok.tokenize(teks_bersih) if len(x) > 1])).strip()

In [82]:
x=[]
for teks in data.Description:
    x.append(proses_teks(teks))

In [107]:
clean_text=pd.DataFrame({'clean_text':x})

In [109]:
data=pd.concat([data,clean_text],axis=1)

In [115]:
#vectorize
cvec=CountVectorizer()
tvec=TfidfVectorizer()
hvec=HashingVectorizer()

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [116]:
#mode
clf1 = RandomForestClassifier()
clf2 = LogisticRegression()
clf3 = BernoulliNB()
clf4 = SVC()

In [119]:
model= Pipeline([('vectorizer',tvec)
                 ,('classifier',clf1)])

model.fit(x_train,y_train)

Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tr...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [120]:
hasil =  model.predict(x_test)

In [133]:
tn, fp, fn, tp = confusion_matrix(hasil,y_test).ravel()
tn, fp, fn, tp

(2462, 792, 109, 531)

In [134]:
confusion_matrix(hasil,y_test)

array([[2462,  792],
       [ 109,  531]], dtype=int64)

In [124]:
accuracy_score(hasil,y_test)

0.768618387262455

In [131]:
confusion_matrix?

[1;31mSignature:[0m [0mconfusion_matrix[0m[1;33m([0m[0my_true[0m[1;33m,[0m [0my_pred[0m[1;33m,[0m [0mlabels[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m [0msample_weight[0m[1;33m=[0m[1;32mNone[0m[1;33m)[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Compute confusion matrix to evaluate the accuracy of a classification

By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}`
is equal to the number of observations known to be in group :math:`i` but
predicted to be in group :math:`j`.

Thus in binary classification, the count of true negatives is
:math:`C_{0,0}`, false negatives is :math:`C_{1,0}`, true positives is
:math:`C_{1,1}` and false positives is :math:`C_{0,1}`.

Read more in the :ref:`User Guide <confusion_matrix>`.

Parameters
----------
y_true : array, shape = [n_samples]
    Ground truth (correct) target values.

y_pred : array, shape = [n_samples]
    Estimated targets as returned by a classifier.

labels : array, shape = [n_classes], optional
