In [21]:
from __future__ import division
import pandas as pd
import numpy as np

import nltk
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/kiraliang/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [22]:
data_path = "../conversations/dsat/voip-0f41c16f2f-20130325_192310.txt"

In [23]:
with open (data_path) as f:
    data = f.read()

In [24]:
print(data)

[SYSTEM] Hello , welcome to the Cambridge restaurant system? You can ask for restaurants by area , price range or food type . How may I help you?
[USER]   i'm looking for a restaurant that serves german food
[SYSTEM] You are looking for a german restaurant right?
[USER]   no
[SYSTEM] Let me confirm , You are looking for a restaurant in the moderate price range right?
[USER]   yes
[SYSTEM] I'm sorry but there is no restaurant serving german food
[USER]   how about free
[SYSTEM] I'm sorry but there is no restaurant serving german food
[USER]   a restaurant serving french food
[SYSTEM] You are looking for a german restaurant right?
[USER]   thank you goodbye



### function definition

In [25]:
def read_file(file):
    with open (file) as f:
        lines = [line.rstrip('\n') for line in f]
    id_name = file.split("/")[-1].split(".txt")[0]
    return id_name,lines

In [26]:
def number_repetition(lines):
    new_list =[]
    for line in lines:
        if "SYSTEM" in line:
            new_list.append(line.strip())
    #create dictionary and count the repetition of sentence (Feature 1)
    new_dict = {}
    for li in new_list:
        if li in new_dict:
            new_dict[li] += 1
        else:
            new_dict[li] = 1
    num_rep = sum([value for value in new_dict.values() if value > 1])
    #Percentage of the repetition sentence in the entire conversation (Feature 3)
    num_rep_per = num_rep/len(new_list)
    
    return num_rep,num_rep_per, len(lines)



In [27]:
def pos_neg_conv(file):
    # get everything
    
    dict_conv_only = create_user_conv(file)
    compound_score, tot_pos_sen, tot_neg_sen = get_sentiment(dict_conv_only)
    return compound_score, tot_pos_sen, tot_neg_sen
    
    
def get_sentiment(dict_conv_only):
    # generate the compound sentiment score and store in conversation

    for sentence in dict_conv_only.keys():
        compound_sentence = generatesentiment(sentence)
        # add compound score to sentence 
        dict_conv_only[sentence] = compound_sentence
    # add total compound score of sentence to conversation
    compound_score = sum(dict_conv_only.values())/len(dict_conv_only.values())
    tot_pos_sen = len([x for x in dict_conv_only.values() if x > 0])/len(dict_conv_only.values())
    tot_neg_sen = len([x for x in dict_conv_only.values() if x < 0])/len(dict_conv_only.values())
    return compound_score, tot_pos_sen, tot_neg_sen
    
    
def create_user_conv(lines):
    #create a list per conversation of only USER sentences 
    # and 'thankyou goodbye left out'
    conv_only_user = []
    new_list =[]
    for line in lines:
        # only USER input
        if "USER" in line:
            # Remove the word '[USER]'
            line = line.replace("[USER]", "")
            line = line.strip()
            new_list.append(line)
    # Remove 'thank you good bye' if in last sentences
    for i,j in dic.items():
        new_list[-1] = new_list[-1].replace(i, j)
        new_list[-2] = new_list[-2].replace(i, j)
    conv_only_user.append(new_list)

    # convert to dict 
    # with every conversation a dict with sentences as keys
    # and compound pos neg score (as 0)
    for conv in conv_only_user:
        new_dict2 = {}
        for sentence in conv:
            new_dict2[sentence] = 0
    return new_dict2       
        
def generatesentiment(sentence):
    # function for sentiment analysis on sentences
    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(sentence)
    return ss['compound']

#### Repetition of the chatbot / 4 features

In [226]:
# def number_repetition(file):
#     with open (file) as f:
#         lines = f.readlines()
#         new_list =[]
#         for line in lines:
#             if "SYSTEM" in line:
#                 new_list.append(line.strip())
#     #create dictionary and count the repetition of sentence (Feature 1)
#     new_dict = {}
#     for li in new_list:
#         if li in new_dict:
#             new_dict[li] += 1
#         else:
#             new_dict[li] = 1
#     num_rep = sum([value for value in new_dict.values() if value > 1])
    
#     #create a matrix to calculate repetition (Feature 2)
#     m = [[0]* len(new_list) for i in range(len(new_list))]
#     for i, x in enumerate(new_list):
#         for j, y in enumerate(new_list):
#             if x == y:
#                 m[i][j] = 1 
#     all_value = 0
#     for li in m:
#         a = sum(li)
#         all_value+=a
#     num_rep_exp = (all_value-len(new_list))/2
    
#     #Percentage of the repetition sentence in the entire conversation (Feature 3)
#     num_rep_per = num_rep/len(new_list)
#     #Number of sentences in the conversation (Feature 4)
#     len_conversation = len(lines)
    
#     return num_rep, num_rep_per,num_rep_exp,len_conversation

In [28]:
import glob
text_files_dsat = glob.glob("../conversations/dsat/*.txt")
text_files_sat = glob.glob("../conversations/sat/*.txt")

In [29]:
# sentences to delete
dic = {'thankyou':"", 'thank you':"", "goodbye":"", "good bye":"", 'bye':""}

In [30]:
with open(text_files_dsat[6]) as f:
    print(f.read())

[SYSTEM] Hello , welcome to the Cambridge restaurant system? You can ask for restaurants by area , price range or food type . How may I help you?
[USER]   yes can i have a restaurant who
[SYSTEM] What kind of food would you like?
[USER]   cheap
[SYSTEM] What kind of food would you like?
[USER]   european
[SYSTEM] Sorry there is no european restaurant in the cheap price range
[USER]   european food any price
[SYSTEM] Sorry there is no european restaurant in the cheap price range
[USER]   okay can i have european food
[SYSTEM] Sorry there is no european restaurant in the cheap price range
[USER]   any price range
[SYSTEM] Sorry there is no european restaurant in the cheap price range
[USER]   uh european food
[SYSTEM] Sorry there is no european restaurant in the cheap price range
[USER]   can i have european food
[SYSTEM] Sorry there is no european restaurant in the cheap price range
[USER]   can i have the in any price range
[SYSTEM] Sorry there is no european restaurant in the cheap pr

#### create feature dataframe for disatisfied dataset

In [31]:
columns = ['id','num_rep','num_rep_per','len_conversation','total_compound_conv', 'tot_pos_sen', 'tot_neg_sen', 'Is_satisfied']
df_rep_dsat = pd.DataFrame(index=range(len(text_files_dsat)), columns=columns)

In [32]:
for idx,file in enumerate(text_files_dsat):
    id_number,lines = read_file(file)
    num_rep, num_rep_per,len_conversation = number_repetition(lines)
    compound_score, tot_pos_sen, tot_neg_sen = pos_neg_conv(lines)
    df_rep_dsat.iloc[idx] = pd.Series({'id':id_number, 'num_rep':num_rep, 'num_rep_per':num_rep_per,
                                  'len_conversation':len_conversation,
                                       'total_compound_conv':compound_score, 'tot_pos_sen':tot_pos_sen, 
                                       'tot_neg_sen':tot_neg_sen, 'Is_satisfied':0})

In [33]:
df_rep_dsat.head()

Unnamed: 0,id,num_rep,num_rep_per,len_conversation,total_compound_conv,tot_pos_sen,tot_neg_sen,Is_satisfied
0,voip-dda7c88c6e-20130323_053612,2,0.142857,28,-0.0134429,0.142857,0.142857,0
1,voip-31de0daa7b-20130401_204621,11,0.55,40,0.0,0.0,0.0,0
2,voip-9819537952-20130327_023510,5,0.625,16,0.0206286,0.142857,0.142857,0
3,voip-e61fa89add-20130326_004919,0,0.0,6,0.0754333,0.333333,0.0,0
4,voip-7e07d8f0f5-20130328_190516,8,0.571429,28,0.0557667,0.25,0.0,0


#### create feature dataframe for satisfied dataset

In [34]:
columns = ['id','num_rep','num_rep_per','len_conversation','total_compound_conv', 'tot_pos_sen', 'tot_neg_sen', 'Is_satisfied']
df_rep_sat = pd.DataFrame(index=range(len(text_files_sat)), columns=columns)

In [35]:
for idx,file in enumerate(text_files_sat):
    index,lines = read_file(file)
    num_rep, num_rep_per,len_conversation = number_repetition(lines)
    compound_score, tot_pos_sen, tot_neg_sen = pos_neg_conv(lines)
    df_rep_sat.iloc[idx] = pd.Series({'id':index,'num_rep':num_rep, 'num_rep_per':num_rep_per,
                                  'len_conversation':len_conversation,
                                       'total_compound_conv':compound_score, 'tot_pos_sen':tot_pos_sen, 
                                       'tot_neg_sen':tot_neg_sen, 'Is_satisfied':1})

In [36]:
df_rep_sat.head()

Unnamed: 0,id,num_rep,num_rep_per,len_conversation,total_compound_conv,tot_pos_sen,tot_neg_sen,Is_satisfied
0,voip-22c938c8ba-20130325_130445,8,0.571429,28,-0.0104083,0.25,0.25,1
1,voip-9f989824fd-20130325_204229,0,0.0,16,0.07985,0.333333,0.0,1
2,voip-bde2721237-20130326_200505,0,0.0,10,0.01544,0.2,0.0,1
3,voip-a7ddefaeb3-20130328_173142,18,0.72,50,0.121847,0.368421,0.0526316,1
4,voip-db80a9e6df-20130328_230211,0,0.0,12,0.0505833,0.333333,0.0,1


In [37]:
print('Number of disatisfied files: {}'.format(len(df_rep_dsat)))
print('Number of satisfied files: {}'.format(len(df_rep_sat)))

Number of disatisfied files: 157
Number of satisfied files: 314


In [38]:
len(df_rep_dsat)/(len(df_rep_dsat)+len(df_rep_sat))

0.3333333333333333

In [39]:
len(df_rep_sat)/(len(df_rep_dsat)+len(df_rep_sat))

0.6666666666666666

#### concat 2 dataframe

In [40]:
new_df = pd.concat([df_rep_dsat,df_rep_sat])

#### read new dataset

In [2]:
import pandas as pd
features = pd.read_csv("features_tfidf.csv")

In [3]:
features.head()

Unnamed: 0,id,num_rep,num_rep_per,len_conversation,total_compound_conv,tot_pos_sen,tot_neg_sen,sum_tfidf,avg_tfidf,Is_satisfied
0,voip-dda7c88c6e-20130323_053612,2,0.142857,28,-0.013443,0.142857,0.142857,5.799495,0.087871,0
1,voip-31de0daa7b-20130401_204621,11,0.55,40,0.0,0.0,0.0,3.623657,0.068371,0
2,voip-9819537952-20130327_023510,5,0.625,16,0.020629,0.142857,0.142857,2.821653,0.100773,0
3,voip-e61fa89add-20130326_004919,0,0.0,6,0.075433,0.333333,0.0,3.31147,0.122647,0
4,voip-7e07d8f0f5-20130328_190516,8,0.571429,28,0.055767,0.25,0.0,4.426133,0.085118,0


#### oversampling - SMOTE

In [19]:
features['Is_satisfied'].value_counts()

1    314
0    157
Name: Is_satisfied, dtype: int64

In [30]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
X = features.iloc[:,1:-1]
y = features.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [31]:
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel())

In [32]:
print("After OverSampling, counts of label '1': {}".format(sum(y_train_res==1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res==0)))

After OverSampling, counts of label '1': 234
After OverSampling, counts of label '0': 234


#### train the data with oversampling

#### Logistic regression

In [33]:
clf = LogisticRegression(random_state=0)
clf.fit(X_train_res, y_train_res)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [34]:
y_pred = clf.predict(X_test)

In [35]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(y_test, y_pred, average='binary')

(0.8611111111111112, 0.775, 0.8157894736842106, None)

In [36]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.7627118644067796

In [65]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

[[28 10]
 [18 62]]


#### train the data without oversampling

In [45]:
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)
y_pred_new = classifier.predict(X_test)



In [10]:
#y_pred_new = classifier.predict_proba(X_test)
#y_pred_new2 = classifier.decision_function(X_test)

In [57]:
precision_recall_fscore_support(y_test, y_pred_new, average='binary')

(0.8409090909090909, 0.925, 0.8809523809523809, None)

In [58]:
accuracy_score(y_test, y_pred_new)

0.8305084745762712

#### tune the parameter

In [76]:
from sklearn.model_selection import GridSearchCV

In [129]:
parameters = {
    'C': np.linspace(1, 5, 5),
    'solver':['newton-cg','lbfgs','liblinear'],
    'class_weight':[None,'balanced']
             }
lr = LogisticRegression(random_state=0)
clf = GridSearchCV(lr, parameters, cv=5,n_jobs=3)
clf.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=3,
       param_grid={'C': array([1., 2., 3., 4., 5.]), 'solver': ['newton-cg', 'lbfgs', 'liblinear'], 'class_weight': [None, 'balanced']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [130]:
clf.best_params_

{'C': 4.0, 'class_weight': None, 'solver': 'liblinear'}

In [131]:
clf = GridSearchCV(lr, {'penalty':['l1','l2']}, cv=5,n_jobs=3)
clf.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=3,
       param_grid={'penalty': ['l1', 'l2']}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=0)

In [132]:
clf.best_params_

{'penalty': 'l2'}

#### train the model again with optimized parameters

In [133]:
clf = LogisticRegression(C=4,penalty='l2',solver='liblinear',random_state=0)
clf.fit(X_train, y_train)
y_pred_opt = clf.predict(X_test)

In [134]:
precision_recall_fscore_support(y_test, y_pred_opt, average='binary')

(0.8390804597701149, 0.9125, 0.8742514970059879, None)

In [135]:
accuracy_score(y_test, y_pred_opt)

0.8220338983050848

#### though gridsearch suggested C=4, but after training the model, the accuracy is higher to set C=1
C: Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization.

In [142]:
clf = LogisticRegression(C=1,penalty='l2',solver='liblinear',random_state=0)
clf.fit(X_train, y_train)
y_pred_final = clf.predict(X_test)
accuracy_score(y_test, y_pred_final)

0.8305084745762712

#### Coefficient of the features in the decision function.

In [143]:
clf.coef_ 

array([[-0.19954161, -1.22673667, -0.018489  , -0.71131447,  0.65721879,
        -0.01281874,  0.5190334 , -0.38626585]])

#### Intercept (a.k.a. bias) added to the decision function.

In [141]:
clf.intercept_

array([-0.10979743])

In [144]:
features.iloc[:,1:].drop("Is_satisfied", axis=1).apply(lambda x: x.corr(features.Is_satisfied))

num_rep               -0.477934
num_rep_per           -0.522149
len_conversation      -0.353808
total_compound_conv   -0.098092
tot_pos_sen            0.064181
tot_neg_sen           -0.022449
sum_tfidf              0.140452
avg_tfidf              0.026714
dtype: float64

In [12]:
from pandas import *
import numpy as np
from scipy.stats.stats import pearsonr
import itertools

In [14]:
correlations = {}
columns = features.columns[1:].tolist()

for col_a, col_b in itertools.combinations(columns, 2):
    correlations[col_a + '__' + col_b] = pearsonr(features.loc[:, col_a], features.loc[:, col_b])

result = DataFrame.from_dict(correlations, orient='index')
result.columns = ['PCC', 'p-value']

print(result.sort_index())

                                            PCC        p-value
avg_tfidf__Is_satisfied                0.026714   5.630495e-01
len_conversation__Is_satisfied        -0.353808   2.460829e-15
len_conversation__avg_tfidf           -0.550678   1.059853e-38
len_conversation__sum_tfidf            0.320102   1.105667e-12
len_conversation__tot_neg_sen          0.117055   1.100980e-02
len_conversation__tot_pos_sen          0.066764   1.479761e-01
len_conversation__total_compound_conv  0.141350   2.105281e-03
num_rep__Is_satisfied                 -0.477934   2.981528e-28
num_rep__avg_tfidf                    -0.354889   1.997778e-15
num_rep__len_conversation              0.858641  3.229487e-138
num_rep__num_rep_per                   0.856732  5.911338e-137
num_rep__sum_tfidf                     0.011395   8.051803e-01
num_rep__tot_neg_sen                   0.083387   7.060020e-02
num_rep__tot_pos_sen                   0.077346   9.361157e-02
num_rep__total_compound_conv           0.168678   2.356

#### Build the model with one variable

In [177]:
X = df_shuffle.iloc[:,:1]
y = df_shuffle.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [178]:
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [179]:
print('Accuracy of num_rep: {:.2f}'.format(classifier.score(X_test, y_test)))

Accuracy of num_rep: 0.70


In [175]:
X = df_shuffle.iloc[:,1:2]
y = df_shuffle.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
y_train = y_train.astype('int')
y_test = y_test.astype('int')
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)
print('Accuracy of num_rep_per: {:.2f}'.format(classifier.score(X_test, y_test)))

Accuracy of num_rep_per: 0.73




In [181]:
X = df_shuffle.iloc[:,2:3]
y = df_shuffle.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
y_train = y_train.astype('int')
y_test = y_test.astype('int')
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)
print('Accuracy of num_rep_exp: {:.2f}'.format(classifier.score(X_test, y_test)))

Accuracy of num_rep_exp: 0.72




In [184]:
X = df_shuffle.iloc[:,3:4]
y = df_shuffle.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
y_train = y_train.astype('int')
y_test = y_test.astype('int')
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)
print('Accuracy of len_conversation: {:.2f}'.format(classifier.score(X_test, y_test)))

Accuracy of len_conversation: 0.66


