In [1]:
from __future__ import division
import pandas as pd
import numpy as np

import nltk
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\manon\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [24]:
data_path = "../conversations/dsat/voip-0f41c16f2f-20130325_192310.txt"

In [25]:
with open (data_path) as f:
    data = f.read()

In [26]:
print(data)

[SYSTEM] Hello , welcome to the Cambridge restaurant system? You can ask for restaurants by area , price range or food type . How may I help you?
[USER]   i'm looking for a restaurant that serves german food
[SYSTEM] You are looking for a german restaurant right?
[USER]   no
[SYSTEM] Let me confirm , You are looking for a restaurant in the moderate price range right?
[USER]   yes
[SYSTEM] I'm sorry but there is no restaurant serving german food
[USER]   how about free
[SYSTEM] I'm sorry but there is no restaurant serving german food
[USER]   a restaurant serving french food
[SYSTEM] You are looking for a german restaurant right?
[USER]   thank you goodbye



### function definition

In [2]:
def read_file(file):
    with open (file) as f:
        lines = [line.rstrip('\n') for line in f]
    id_name = file.split("/")[-1].split(".txt")[0]
    return id_name,lines

In [3]:
def number_repetition(lines):
    new_list =[]
    for line in lines:
        if "SYSTEM" in line:
            new_list.append(line.strip())
    #create dictionary and count the repetition of sentence (Feature 1)
    new_dict = {}
    for li in new_list:
        if li in new_dict:
            new_dict[li] += 1
        else:
            new_dict[li] = 1
    num_rep = sum([value for value in new_dict.values() if value > 1])
    #Percentage of the repetition sentence in the entire conversation (Feature 3)
    num_rep_per = num_rep/len(new_list)
    
    return num_rep,num_rep_per, len(lines)



In [53]:
def pos_neg_conv(file):
    # get everything
    
    dict_conv_only = create_user_conv(file)
    compound_score, tot_pos_sen, tot_neg_sen = get_sentiment(dict_conv_only)
    return compound_score, tot_pos_sen, tot_neg_sen
    
    
def get_sentiment(dict_conv_only):
    # generate the compound sentiment score and store in conversation

    for sentence in dict_conv_only.keys():
        compound_sentence = generatesentiment(sentence)
        # add compound score to sentence 
        dict_conv_only[sentence] = compound_sentence
    # add total compound score of sentence to conversation
    compound_score = sum(dict_conv_only.values())/len(dict_conv_only.values())
    tot_pos_sen = len([x for x in dict_conv_only.values() if x > 0])/len(dict_conv_only.values())
    tot_neg_sen = len([x for x in dict_conv_only.values() if x < 0])/len(dict_conv_only.values())
    return compound_score, tot_pos_sen, tot_neg_sen
    
    
def create_user_conv(lines):
    #create a list per conversation of only USER sentences 
    # and 'thankyou goodbye left out'
    conv_only_user = []
    new_list =[]
    for line in lines:
        # only USER input
        if "USER" in line:
            # Remove the word '[USER]'
            line = line.replace("[USER]", "")
            line = line.strip()
            new_list.append(line)
    # Remove 'thank you good bye' if in last sentences
    for i,j in dic.items():
        new_list[-1] = new_list[-1].replace(i, j)
        new_list[-2] = new_list[-2].replace(i, j)
    conv_only_user.append(new_list)

    # convert to dict 
    # with every conversation a dict with sentences as keys
    # and compound pos neg score (as 0)
    for conv in conv_only_user:
        new_dict2 = {}
        for sentence in conv:
            new_dict2[sentence] = 0
    return new_dict2       
        
def generatesentiment(sentence):
    # function for sentiment analysis on sentences
    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(sentence)
    return ss['compound']

#### Repetition of the chatbot / 4 features

In [226]:
# def number_repetition(file):
#     with open (file) as f:
#         lines = f.readlines()
#         new_list =[]
#         for line in lines:
#             if "SYSTEM" in line:
#                 new_list.append(line.strip())
#     #create dictionary and count the repetition of sentence (Feature 1)
#     new_dict = {}
#     for li in new_list:
#         if li in new_dict:
#             new_dict[li] += 1
#         else:
#             new_dict[li] = 1
#     num_rep = sum([value for value in new_dict.values() if value > 1])
    
#     #create a matrix to calculate repetition (Feature 2)
#     m = [[0]* len(new_list) for i in range(len(new_list))]
#     for i, x in enumerate(new_list):
#         for j, y in enumerate(new_list):
#             if x == y:
#                 m[i][j] = 1 
#     all_value = 0
#     for li in m:
#         a = sum(li)
#         all_value+=a
#     num_rep_exp = (all_value-len(new_list))/2
    
#     #Percentage of the repetition sentence in the entire conversation (Feature 3)
#     num_rep_per = num_rep/len(new_list)
#     #Number of sentences in the conversation (Feature 4)
#     len_conversation = len(lines)
    
#     return num_rep, num_rep_per,num_rep_exp,len_conversation

In [34]:
import glob
text_files_dsat = glob.glob("./allconversations/dsat/*.txt")
text_files_sat = glob.glob("./allconversations/sat/*.txt")

In [41]:
# sentences to delete
dic = {'thankyou':"", 'thank you':"", "goodbye":"", "good bye":"", 'bye':""}

In [6]:
with open(text_files_dsat[6]) as f:
    print(f.read())

IndexError: list index out of range

#### create feature dataframe for disatisfied dataset

In [47]:
columns = ['id','num_rep','num_rep_per','len_conversation','total_compound_conv', 'tot_pos_sen', 'tot_neg_sen', 'Is_satisfied']
df_rep_dsat = pd.DataFrame(index=range(len(text_files_dsat)), columns=columns)

In [49]:
for idx,file in enumerate(text_files_dsat):
    id_number,lines = read_file(file)
    num_rep, num_rep_per,len_conversation = number_repetition(lines)
    compound_score, tot_pos_sen, tot_neg_sen = pos_neg_conv(lines)
    df_rep_dsat.iloc[idx] = pd.Series({'id':id_number, 'num_rep':num_rep, 'num_rep_per':num_rep_per,
                                  'len_conversation':len_conversation,
                                       'total_compound_conv':compound_score, 'tot_pos_sen':tot_pos_sen, 
                                       'tot_neg_sen':tot_neg_sen, 'Is_satisfied':0})

['', 'cheap', "i'm looking", 'in the west part of town', 'is there anything else', 'is there anything else', 'what options', 'in town', 'is it in', 'it in', 'what is the address', ' ']
{'': 0, 'cheap': 0, "i'm looking": 0, 'in the west part of town': 0, 'is there anything else': 0, 'what options': 0, 'in town': 0, 'is it in': 0, 'it in': 0, 'what is the address': 0, ' ': 0}
0.0 0.0 0.0
["i'm looking for a restaurant that serves german food", 'no', 'yes', 'how about free', 'a restaurant serving french food', ' ']
{"i'm looking for a restaurant that serves german food": 0, 'no': 0, 'yes': 0, 'how about free': 0, 'a restaurant serving french food': 0, ' ': 0}
0.10275000000000001 0.3333333333333333 0.16666666666666666
['hmm part', 'uh post', 'cheap restaurant', 'yes', 'chinese', ' can sorry', ' ']
{'hmm part': 0, 'uh post': 0, 'cheap restaurant': 0, 'yes': 0, 'chinese': 0, ' can sorry': 0, ' ': 0}
0.04638571428571429 0.14285714285714285 0.14285714285714285
['persian food', 'indian food', '

0.005433333333333336 0.08333333333333333 0.08333333333333333
['', 'cheap restaurant gastro pub food', "doesn't matter", 'gastro pub', 'cheap gastro pub', 'type of food', 'gastro pub food', 'gastro pub moderate', 'cheap restaurant gastro', 'type of food', 'gastro pub', 'gastro pub food', 'gastropub', 'gastro pub', ' ']
{'': 0, 'cheap restaurant gastro pub food': 0, "doesn't matter": 0, 'gastro pub': 0, 'cheap gastro pub': 0, 'type of food': 0, 'gastro pub food': 0, 'gastro pub moderate': 0, 'cheap restaurant gastro': 0, 'gastropub': 0, ' ': 0}
-0.0017363636363636364 0.0 0.09090909090909091
["i'm looking for a restaurant that serves bistro", 'yes', 'how about turkish food', 'canapes tea shop average', 'not castle with', 'turkish will do', 'any area', 'i do not yeah', "i'm looking for a like turkish type of restaurant", 'canapes food restaurant', "i'm looking for restaurant in any area that serves gastro full", 'how about the um', 'type of who', 'ok  ']
{"i'm looking for a restaurant that

['can i order call word world turkish', "let's start", 'reduce', 'ok', 'british food', "doesn't charge you school", 'parking full', 'thank you', 'restaurant', 'ok you school', 'thank you ok', 'thank you ok', 'thank you', 'a bar', ' ']
{'can i order call word world turkish': 0, "let's start": 0, 'reduce': 0, 'ok': 0, 'british food': 0, "doesn't charge you school": 0, 'parking full': 0, 'thank you': 0, 'restaurant': 0, 'ok you school': 0, 'thank you ok': 0, 'a bar': 0, ' ': 0}
0.11731538461538463 0.3076923076923077 0.0
['italian food', 'east', 'phone number', 'phone number', 'phone number', 'phone number', 'phone number', 'italian food', 'number', 'phone', 'are there', ' ']
{'italian food': 0, 'east': 0, 'phone number': 0, 'number': 0, 'phone': 0, 'are there': 0, ' ': 0}
0.02205714285714286 0.2857142857142857 0.0
['ok', 'ok', 'ok ok can i will', 'b three', 'what is it free', 'could i', "i don't phone this is the post code", 'ok', 'do you well any of them we start over in', "i don't", 'ok

['i want to find a restaurant', 'a cheaper bar', "i don't care", "no i'm looking for one", 'the cheap', 'pub', "i'm looking for a cheap restaurant in the west part of town", 'see', 'can i have the address and the', 'what', "what's the post code", "what's the address again", ' ']
{'i want to find a restaurant': 0, 'a cheaper bar': 0, "i don't care": 0, "no i'm looking for one": 0, 'the cheap': 0, 'pub': 0, "i'm looking for a cheap restaurant in the west part of town": 0, 'see': 0, 'can i have the address and the': 0, 'what': 0, "what's the post code": 0, "what's the address again": 0, ' ': 0}
-0.046638461538461545 0.07692307692307693 0.15384615384615385
['for restaurant in the centre', 'polynesian food', 'malaysian food', 'yes', 'british food', 'yes', 'british food', 'yes', 'british food', 'yes', ' ']
{'for restaurant in the centre': 0, 'polynesian food': 0, 'malaysian food': 0, 'yes': 0, 'british food': 0, ' ': 0}
0.06698333333333333 0.16666666666666666 0.0
['vietnamese food', 'cantone

['looking for a expensive restaurant that serves korean', "i'm sorry that be", 'is there expensive', 'is there one that serves free food', 'the phone number and post code please', ' ']
{'looking for a expensive restaurant that serves korean': 0, "i'm sorry that be": 0, 'is there expensive': 0, 'is there one that serves free food': 0, 'the phone number and post code please': 0, ' ': 0}
0.13586666666666666 0.3333333333333333 0.16666666666666666
['ok are restaurant that serves turkish food', 'is there any other restaurants turkish food', 'yes', 'are there any other restaurant that serves turkish', 'address the address and the post code please', ' ']
{'ok are restaurant that serves turkish food': 0, 'is there any other restaurants turkish food': 0, 'yes': 0, 'are there any other restaurant that serves turkish': 0, 'address the address and the post code please': 0, ' ': 0}
0.16935 0.5 0.0
['ah because find centre part of town that serves to we korean and now', "i'm not korean go delete area

-0.06507142857142857 0.14285714285714285 0.2857142857142857
['west', 'no', 'no', 'no', 'range', 'no', 'no', "don't care", 'french food', 'address', 'phone number', ' ']
{'west': 0, 'no': 0, 'range': 0, "don't care": 0, 'french food': 0, 'address': 0, 'phone number': 0, ' ': 0}
-0.0757875 0.125 0.25
['', "i don't care", 'town', 'address', 'turkish', 'moderate', 'restaurant', 'phone number', 'post code', ' ']
{'': 0, "i don't care": 0, 'town': 0, 'address': 0, 'turkish': 0, 'moderate': 0, 'restaurant': 0, 'phone number': 0, 'post code': 0, ' ': 0}
-0.031030000000000002 0.1 0.1
['ok cheap restaurant', 'east', 'address', 'food', 'there', ' ']
{'ok cheap restaurant': 0, 'east': 0, 'address': 0, 'food': 0, 'there': 0, ' ': 0}
0.04933333333333333 0.16666666666666666 0.0
['cheap restaurant', "don't care", ' ']
{'cheap restaurant': 0, "don't care": 0, ' ': 0}
-0.12916666666666668 0.0 0.3333333333333333
["i'm looking for a expensive restaurant", "that's good", "i'm looking for a expensive restau

0.14925 0.6666666666666666 0.0
['uh moderately priced restaurant that serves', 'indian', 'in any', 'i want uh asian oriental type', 'address', 'phone number', ' ']
{'uh moderately priced restaurant that serves': 0, 'indian': 0, 'in any': 0, 'i want uh asian oriental type': 0, 'address': 0, 'phone number': 0, ' ': 0}
0.02205714285714286 0.2857142857142857 0.0
['ah i would like to find a restaurant', 'can i have the restaurant', 'restaurant', 'irish food', 'double to restaurant then', '', '']
{'ah i would like to find a restaurant': 0, 'can i have the restaurant': 0, 'restaurant': 0, 'irish food': 0, 'double to restaurant then': 0, '': 0}
0.060200000000000004 0.16666666666666666 0.0
['expensive restaurant', 'south', 'expensive', 'french', 'italian', 'anything else', 'can i have the address', 'okay', ' ']
{'expensive restaurant': 0, 'south': 0, 'expensive': 0, 'french': 0, 'italian': 0, 'anything else': 0, 'can i have the address': 0, 'okay': 0, ' ': 0}
0.025144444444444445 0.111111111111

["i'm looking for a polynesian food", 'how about vietnamese', 'vietnamese', 'can i have the address', 'can i right okay ', ' ']
{"i'm looking for a polynesian food": 0, 'how about vietnamese': 0, 'vietnamese': 0, 'can i have the address': 0, 'can i right okay ': 0, ' ': 0}
0.03771666666666667 0.16666666666666666 0.0
['greek', "i'm looking for a restaurant in the west part of town to jamaican food", 'can need it is', 'give me can food', 'tell me towards then', 'what kind of that is that', 'what about type of food', 'what kind of food is that', 'sure to eat anything', 'see', 'what type of food', 'address', 'phone number', 'price range', 'okay  ']
{'greek': 0, "i'm looking for a restaurant in the west part of town to jamaican food": 0, 'can need it is': 0, 'give me can food': 0, 'tell me towards then': 0, 'what kind of that is that': 0, 'what about type of food': 0, 'what kind of food is that': 0, 'sure to eat anything': 0, 'see': 0, 'what type of food': 0, 'address': 0, 'phone number': 0

In [50]:
df_rep_dsat.head()

Unnamed: 0,id,num_rep,num_rep_per,len_conversation,total_compound_conv,tot_pos_sen,tot_neg_sen,Is_satisfied
0,dsat\voip-0241bbae39-20130327_195830,7,0.583333,24,0.0,0.0,0.0,0
1,dsat\voip-0f41c16f2f-20130325_192310,4,0.666667,12,0.10275,0.333333,0.166667,0
2,dsat\voip-0f41c16f2f-20130325_193723,4,0.571429,14,0.0463857,0.142857,0.142857,0
3,dsat\voip-0f41c16f2f-20130325_204340,0,0.0,10,0.0,0.0,0.0,0
4,dsat\voip-0f41c16f2f-20130402_005414,4,0.5,16,0.0,0.0,0.0,0


#### create feature dataframe for satisfied dataset

In [51]:
columns = ['id','num_rep','num_rep_per','len_conversation','total_compound_conv', 'tot_pos_sen', 'tot_neg_sen', 'Is_satisfied']
df_rep_sat = pd.DataFrame(index=range(len(text_files_sat)), columns=columns)

In [59]:
for idx,file in enumerate(text_files_sat):
    index,lines = read_file(file)
    num_rep, num_rep_per,len_conversation = number_repetition(lines)
    compound_score, tot_pos_sen, tot_neg_sen = pos_neg_conv(lines)
    df_rep_sat.iloc[idx] = pd.Series({'id':index,'num_rep':num_rep, 'num_rep_per':num_rep_per,
                                  'len_conversation':len_conversation,
                                       'total_compound_conv':compound_score, 'tot_pos_sen':tot_pos_sen, 
                                       'tot_neg_sen':tot_neg_sen, 'Is_satisfied':1})

In [60]:
df_rep_sat.head()

Unnamed: 0,id,num_rep,num_rep_per,len_conversation,total_compound_conv,tot_pos_sen,tot_neg_sen,Is_satisfied
0,sat\voip-00d76b791d-20130327_011609,0,0,14,0.1003,0.285714,0.0,1
1,sat\voip-00d76b791d-20130327_012331,0,0,10,0.01544,0.2,0.0,1
2,sat\voip-0241bbae39-20130327_190942,0,0,10,-0.0775,0.0,0.2,1
3,sat\voip-03c2655d43-20130327_194221,0,0,6,0.0257333,0.333333,0.0,1
4,sat\voip-03c2655d43-20130327_194616,0,0,6,0.0257333,0.333333,0.0,1


In [61]:
print('Number of disatisfied files: {}'.format(len(df_rep_dsat)))
print('Number of satisfied files: {}'.format(len(df_rep_sat)))

Number of disatisfied files: 157
Number of satisfied files: 314


#### concat 2 dataframe

In [236]:
new_df = pd.concat([df_rep_dsat,df_rep_sat])

#### shuffle the dataframe

In [237]:
df_shuffle = new_df.sample(frac=1).reset_index(drop=True)

In [203]:
df_shuffle.to_csv("features.csv",index=False)

#### Logistic regression

In [271]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [278]:
X = df_shuffle.iloc[:,:-1]
y = df_shuffle.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [279]:
y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [280]:
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [281]:
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

[[22 13]
 [14 69]]


In [282]:
y_pred_new = classifier.predict_proba(X_test)
y_pred_new2 = classifier.decision_function(X_test)

In [283]:
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(classifier.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.77


In [284]:
from pandas import *
import numpy as np
from scipy.stats.stats import pearsonr
import itertools

In [168]:
correlations = {}
columns = df_shuffle.columns.tolist()

for col_a, col_b in itertools.combinations(columns, 2):
    correlations[col_a + '__' + col_b] = pearsonr(df_shuffle.loc[:, col_a], df_shuffle.loc[:, col_b])

result = DataFrame.from_dict(correlations, orient='index')
result.columns = ['PCC', 'p-value']

print(result.sort_index())

                                     PCC        p-value
len_conversation__Is_satisfied -0.353808   2.460829e-15
num_rep__Is_satisfied          -0.477934   2.981528e-28
num_rep__len_conversation       0.858641  3.229487e-138
num_rep__num_rep_exp            0.758841   2.193505e-89
num_rep__num_rep_per            0.856732  5.911338e-137
num_rep_exp__Is_satisfied      -0.347836   7.670654e-15
num_rep_exp__len_conversation   0.603272   5.067019e-48
num_rep_per__Is_satisfied      -0.522149   2.668523e-34
num_rep_per__len_conversation   0.592383   5.972811e-46
num_rep_per__num_rep_exp        0.561749   1.586553e-40


#### Build the model with one variable

In [177]:
X = df_shuffle.iloc[:,:1]
y = df_shuffle.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [178]:
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [179]:
print('Accuracy of num_rep: {:.2f}'.format(classifier.score(X_test, y_test)))

Accuracy of num_rep: 0.70


In [175]:
X = df_shuffle.iloc[:,1:2]
y = df_shuffle.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
y_train = y_train.astype('int')
y_test = y_test.astype('int')
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)
print('Accuracy of num_rep_per: {:.2f}'.format(classifier.score(X_test, y_test)))

Accuracy of num_rep_per: 0.73




In [181]:
X = df_shuffle.iloc[:,2:3]
y = df_shuffle.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
y_train = y_train.astype('int')
y_test = y_test.astype('int')
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)
print('Accuracy of num_rep_exp: {:.2f}'.format(classifier.score(X_test, y_test)))

Accuracy of num_rep_exp: 0.72




In [184]:
X = df_shuffle.iloc[:,3:4]
y = df_shuffle.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
y_train = y_train.astype('int')
y_test = y_test.astype('int')
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)
print('Accuracy of len_conversation: {:.2f}'.format(classifier.score(X_test, y_test)))

Accuracy of len_conversation: 0.66


