In [1]:
from __future__ import division
import pandas as pd
import numpy as np
import math
import string

import nltk
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
nltk.download('stopwords')

import gensim
from gensim.models import KeyedVectors

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\manon\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\manon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### function definition

In [22]:
def read_file(file):
    with open (file) as f:
        lines = [line.rstrip('\n') for line in f]
    id_name = file.split("/")[-1].split(".txt")[0]
    id_name = id_name.split("\\")[-1]
    return id_name,lines

In [3]:
def number_repetition(lines):
    new_list =[]
    for line in lines:
        if "SYSTEM" in line:
            new_list.append(line.strip())
    #create dictionary and count the repetition of sentence (Feature 1)
    new_dict = {}
    for li in new_list:
        if li in new_dict:
            new_dict[li] += 1
        else:
            new_dict[li] = 1
    num_rep = sum([value for value in new_dict.values() if value > 1])
    #Percentage of the repetition sentence in the entire conversation (Feature 3)
    num_rep_per = num_rep/len(new_list)
    
    return num_rep,num_rep_per, len(lines)



In [4]:
def pos_neg_conv(file):
    # get everything
    
    dict_conv_only = create_user_conv(file)
    compound_score, tot_pos_sen, tot_neg_sen = get_sentiment(dict_conv_only)
    return compound_score, tot_pos_sen, tot_neg_sen
    
    
def get_sentiment(dict_conv_only):
    # generate the compound sentiment score and store in conversation

    for sentence in dict_conv_only.keys():
        compound_sentence = generatesentiment(sentence)
        # add compound score to sentence 
        dict_conv_only[sentence] = compound_sentence
    # add total compound score of sentence to conversation
    compound_score = sum(dict_conv_only.values())/len(dict_conv_only.values())
    tot_pos_sen = len([x for x in dict_conv_only.values() if x > 0])/len(dict_conv_only.values())
    tot_neg_sen = len([x for x in dict_conv_only.values() if x < 0])/len(dict_conv_only.values())
    return compound_score, tot_pos_sen, tot_neg_sen
    
    
def create_user_conv(lines):
    #create a list per conversation of only USER sentences 
    # and 'thankyou goodbye left out'
    conv_only_user = []
    new_list =[]
    for line in lines:
        # only USER input
        if "USER" in line:
            # Remove the word '[USER]'
            line = line.replace("[USER]", "")
            line = line.strip()
            new_list.append(line)
    # Remove 'thank you good bye' if in last sentences
    for i,j in dic.items():
        new_list[-1] = new_list[-1].replace(i, j)
        new_list[-2] = new_list[-2].replace(i, j)
    conv_only_user.append(new_list)

    # convert to dict 
    # with every conversation a dict with sentences as keys
    # and compound pos neg score (as 0)
    for conv in conv_only_user:
        new_dict2 = {}
        for sentence in conv:
            new_dict2[sentence] = 0
    return new_dict2       
        
def generatesentiment(sentence):
    # function for sentiment analysis on sentences
    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(sentence)
    return ss['compound']

#### Repetition of the chatbot / 4 features

In [5]:
import glob
text_files_dsat = glob.glob("./allconversations/dsat/*.txt")
text_files_sat = glob.glob("./allconversations/sat/*.txt")

In [6]:
# sentences to delete
dic = {'thankyou':"", 'thank you':"", "goodbye":"", "good bye":"", 'bye':""}

### Pre-processing for (google) Word2Vec

In [7]:
# Load Google's Word2Vec model
model = KeyedVectors.load_word2vec_format('./google_model/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [8]:
# Get the stop words
stop_words = stopwords.words('english')
stop_words.append("user")
stop_words.append("system")
stop_words = set(stop_words)


In [9]:
translator = str.maketrans('', '', string.punctuation)

# Helper function to tokenize a string
def tokenize_sentence(sentence):
    line = []
    sentence = sentence.strip()
    split = str(sentence).lower().translate(translator).split()
    for word in split:
        if word in stop_words:
            continue
        if word not in model.vocab:
            continue
        line.append(word) 
    return line
            

In [10]:
# generate Word2Vec score per sentence (tokenized words in list)            
def get_sentence_W2V_score(sentence):
    sentence_score = []
    for word in sentence:
        if word == '':
            continue
        word_score =(model[word])
        sentence_score.append(word_score)
   
    sentence_score = np.array(sentence_score)
    sentence_score = np.mean(sentence_score)
    return sentence_score
    


In [11]:

def generate_conversation_w2v_score(lines):  
    conversation_score = []
    for line in lines:
        line = tokenize_sentence(line)
        sentence_score = get_sentence_W2V_score(line)
        conversation_score.append(sentence_score)
    conversation_score = np.array(conversation_score)
    conversation_score = conversation_score[~np.isnan(conversation_score)]
    avg_conv_score = np.mean(conversation_score, axis=0)
    sum_conv_score = np.sum(conversation_score, axis=0)
    return avg_conv_score, sum_conv_score



In [12]:
columns = ['id','num_rep','num_rep_per','len_conversation','total_compound_conv', 'tot_pos_sen', 'tot_neg_sen', 'avg_w2v_score', 'sum_w2v_score', 'Is_satisfied']
df_rep_dsat = pd.DataFrame(index=range(len(text_files_dsat)), columns=columns)

In [24]:
for idx,file in enumerate(text_files_dsat):
    id_number,lines = read_file(file)
    num_rep, num_rep_per,len_conversation = number_repetition(lines)
    compound_score, tot_pos_sen, tot_neg_sen = pos_neg_conv(lines)
    avg_w2v_score, sum_w2v_score  = generate_conversation_w2v_score(lines)
    df_rep_dsat.iloc[idx] = pd.Series({'id':id_number, 'num_rep':num_rep, 'num_rep_per':num_rep_per,
                                      'len_conversation':len_conversation,
                                       'total_compound_conv':compound_score, 'tot_pos_sen':tot_pos_sen, 
                                       'tot_neg_sen':tot_neg_sen, 'avg_w2v_score': avg_w2v_score,
                                       'sum_w2v_score': sum_w2v_score, 
                                       'Is_satisfied':0})

In [25]:
df_rep_dsat.head()

Unnamed: 0,id,num_rep,num_rep_per,len_conversation,total_compound_conv,tot_pos_sen,tot_neg_sen,avg_w2v_score,sum_w2v_score,Is_satisfied
0,voip-0241bbae39-20130327_195830,7,0.583333,24,0.0,0.0,0.0,-0.0029501,-0.0619521,0
1,voip-0f41c16f2f-20130325_192310,4,0.666667,12,0.10275,0.333333,0.166667,-0.00621081,-0.068319,0
2,voip-0f41c16f2f-20130325_193723,4,0.571429,14,0.0463857,0.142857,0.142857,-0.0032949,-0.0461286,0
3,voip-0f41c16f2f-20130325_204340,0,0.0,10,0.0,0.0,0.0,-0.00535927,-0.0535927,0
4,voip-0f41c16f2f-20130402_005414,4,0.5,16,0.0,0.0,0.0,-0.00718453,-0.114952,0


#### create feature dataframe for satisfied dataset

In [26]:
columns = ['id','num_rep','num_rep_per','len_conversation','total_compound_conv', 'tot_pos_sen', 'tot_neg_sen', 'avg_w2v_score', 'sum_w2v_score',  'Is_satisfied']
df_rep_sat = pd.DataFrame(index=range(len(text_files_sat)), columns=columns)

In [27]:
for idx,file in enumerate(text_files_sat):
    id_number,lines = read_file(file)
    num_rep, num_rep_per,len_conversation = number_repetition(lines)
    compound_score, tot_pos_sen, tot_neg_sen = pos_neg_conv(lines)
    avg_w2v_score, sum_w2v_score = generate_conversation_w2v_score(lines)
    df_rep_sat.iloc[idx] = pd.Series({'id':id_number, 'num_rep':num_rep, 'num_rep_per':num_rep_per,
                                      'len_conversation':len_conversation,
                                       'total_compound_conv':compound_score, 'tot_pos_sen':tot_pos_sen, 
                                       'tot_neg_sen':tot_neg_sen, 'avg_w2v_score': avg_w2v_score,
                                       'sum_w2v_score': sum_w2v_score, 'Is_satisfied':1})

In [28]:
df_rep_sat.head()

Unnamed: 0,id,num_rep,num_rep_per,len_conversation,total_compound_conv,tot_pos_sen,tot_neg_sen,avg_w2v_score,sum_w2v_score,Is_satisfied
0,voip-00d76b791d-20130327_011609,0,0,14,0.1003,0.285714,0.0,-0.00802308,-0.1043,1
1,voip-00d76b791d-20130327_012331,0,0,10,0.01544,0.2,0.0,-0.00458558,-0.0412702,1
2,voip-0241bbae39-20130327_190942,0,0,10,-0.0775,0.0,0.2,-0.00563244,-0.0563244,1
3,voip-03c2655d43-20130327_194221,0,0,6,0.0257333,0.333333,0.0,-0.00702347,-0.0421408,1
4,voip-03c2655d43-20130327_194616,0,0,6,0.0257333,0.333333,0.0,-0.0080021,-0.0480126,1


In [252]:
print('Number of disatisfied files: {}'.format(len(df_rep_dsat)))
print('Number of satisfied files: {}'.format(len(df_rep_sat)))

Number of disatisfied files: 157
Number of satisfied files: 314


#### concat 2 dataframe

In [29]:
new_df = pd.concat([df_rep_dsat,df_rep_sat])

#### export dataframe to csv

In [15]:
#df_shuffle = new_df.sample(frac=1).reset_index(drop=True)

In [31]:
new_df.to_csv("features_w2v.csv",index=False)

#### Logistic regression

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [24]:
X = new_df.iloc[:,1:-1]
y = new_df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [25]:
y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [26]:
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [27]:
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

[[20 18]
 [ 4 76]]


In [28]:
y_pred_new = classifier.predict_proba(X_test)
y_pred_new2 = classifier.decision_function(X_test)

In [30]:
import json
dis_score = round(y_pred_new[0][0]*100,2)
sat_score = round(y_pred_new[0][1]*100,2)
result = {'sat_score': sat_score}
print(json.dumps(result))



{"sat_score": 61.16}


In [29]:
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(classifier.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.81


In [32]:
from pandas import *
import numpy as np
from scipy.stats.stats import pearsonr
import itertools

In [36]:
correlations = {}
columns = df_shuffle.columns.tolist()
columns.remove('id')


for col_a, col_b in itertools.combinations(columns, 2):
    correlations[col_a + '__' + col_b] = pearsonr(df_shuffle.loc[:, col_a], df_shuffle.loc[:, col_b])

result = DataFrame.from_dict(correlations, orient='index')
result.columns = ['PCC', 'p-value']

print(result.sort_index())

                                            PCC        p-value
len_conversation__Is_satisfied        -0.353808   2.460829e-15
len_conversation__tot_neg_sen          0.117055   1.100980e-02
len_conversation__tot_pos_sen          0.066764   1.479761e-01
len_conversation__total_compound_conv  0.141350   2.105281e-03
num_rep__Is_satisfied                 -0.477934   2.981528e-28
num_rep__len_conversation              0.858641  3.229487e-138
num_rep__num_rep_per                   0.856732  5.911338e-137
num_rep__tot_neg_sen                   0.083387   7.060020e-02
num_rep__tot_pos_sen                   0.077346   9.361157e-02
num_rep__total_compound_conv           0.168678   2.356180e-04
num_rep_per__Is_satisfied             -0.522149   2.668523e-34
num_rep_per__len_conversation          0.592383   5.972811e-46
num_rep_per__tot_neg_sen               0.048848   2.900849e-01
num_rep_per__tot_pos_sen               0.040176   3.843204e-01
num_rep_per__total_compound_conv       0.172967   1.617

In [33]:
correlations = {}
columns = new_df.columns.tolist()
columns.remove('id')

col_b ='Is_satisfied'

for col_a in columns:
    correlations[col_a + '__' + col_b] = pearsonr(new_df.loc[:, col_a], new_df.loc[:, col_b])

result = DataFrame.from_dict(correlations, orient='index')
result.columns = ['PCC', 'p-value']

print(result.sort_index())

                                        PCC       p-value
Is_satisfied__Is_satisfied         1.000000  0.000000e+00
avg_w2v_score__Is_satisfied       -0.041356  3.705093e-01
len_conversation__Is_satisfied    -0.353808  2.460829e-15
num_rep__Is_satisfied             -0.477934  2.981528e-28
num_rep_per__Is_satisfied         -0.522149  2.668523e-34
sum_w2v_score__Is_satisfied        0.277596  8.820802e-10
tot_neg_sen__Is_satisfied         -0.022449  6.269947e-01
tot_pos_sen__Is_satisfied          0.064181  1.643388e-01
total_compound_conv__Is_satisfied -0.098092  3.330986e-02


#### Build the model with one variable

In [177]:
X = df_shuffle.iloc[:,:1]
y = df_shuffle.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [178]:
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [179]:
print('Accuracy of num_rep: {:.2f}'.format(classifier.score(X_test, y_test)))

Accuracy of num_rep: 0.70


In [175]:
X = df_shuffle.iloc[:,1:2]
y = df_shuffle.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
y_train = y_train.astype('int')
y_test = y_test.astype('int')
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)
print('Accuracy of num_rep_per: {:.2f}'.format(classifier.score(X_test, y_test)))

Accuracy of num_rep_per: 0.73




In [181]:
X = df_shuffle.iloc[:,2:3]
y = df_shuffle.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
y_train = y_train.astype('int')
y_test = y_test.astype('int')
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)
print('Accuracy of num_rep_exp: {:.2f}'.format(classifier.score(X_test, y_test)))

Accuracy of num_rep_exp: 0.72




In [184]:
X = df_shuffle.iloc[:,3:4]
y = df_shuffle.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
y_train = y_train.astype('int')
y_test = y_test.astype('int')
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)
print('Accuracy of len_conversation: {:.2f}'.format(classifier.score(X_test, y_test)))

Accuracy of len_conversation: 0.66




In [39]:
# Average and SUM word embedding
X = new_df.iloc[:,7:9]
y = new_df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
y_train = y_train.astype('int')
y_test = y_test.astype('int')
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)
print('Accuracy of len_conversation: {:.2f}'.format(classifier.score(X_test, y_test)))

Accuracy of len_conversation: 0.68




Unnamed: 0,avg_w2v_score,sum_w2v_score
0,-0.0029501,-0.0619521
1,-0.00621081,-0.068319
2,-0.0032949,-0.0461286
3,-0.00535927,-0.0535927
4,-0.00718453,-0.114952
5,-0.00606689,-0.0364014
6,-0.00656565,-0.229798
7,-0.0042346,-0.0804575
8,-0.00542563,-0.244153
9,-0.00599499,-0.29975
