In [1]:
from __future__ import division
import os
import glob

import pandas as pd
import numpy as np

import nltk
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\manon\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


### Global variable definition 

In [2]:
file_directory = '../Untitled folder/allconversations/'
text_files_sat = glob.glob('../Untitled folder/allconversations/sat/*.txt')
text_files_dsat = glob.glob ('../Untitled folder/allconversations/dsat/*.txt')

In [3]:
# sentences to delete
dic = {'thankyou':"", 'thank you':"", "goodbye":"", "good bye":"", 'bye':""}

### Function definition

In [4]:
def number_repetition(file):
    with open (file) as f:
        lines = f.readlines()
        new_list =[]
        for line in lines:
            if "SYSTEM" in line:
                new_list.append(line.strip())
    #create dictionary and count the repetition of sentence (Feature 1)
    new_dict = {}
    for li in new_list:
        if li in new_dict:
            new_dict[li] += 1
        else:
            new_dict[li] = 1
    num_rep = sum([value for value in new_dict.values() if value > 1])
    
    #create a matrix to calculate repetition (Feature 2)
    m = [[0]* len(new_list) for i in range(len(new_list))]
    for i, x in enumerate(new_list):
        for j, y in enumerate(new_list):
            if x == y:
                m[i][j] = 1 
    all_value = 0
    for li in m:
        a = sum(li)
        all_value+=a
    num_rep_exp = (all_value-len(new_list))/2
    
    #Percentage of the repetition sentence in the entire conversation (Feature 3)
    num_rep_per = num_rep/len(lines)
    #Number of sentences in the conversation (Feature 4)
    len_conversation = len(lines)
    
    return num_rep, num_rep_per,num_rep_exp,len_conversation

In [5]:
def pos_neg_conv(file):
    # get everything
    
    dict_conv_only = create_user_conv(file)
    compound_score, tot_pos_sen, tot_neg_sen = get_sentiment(dict_conv_only)
    return compound_score, tot_pos_sen, tot_neg_sen
    
    
def get_sentiment(dict_conv_only):
    # generate the compound sentiment score and store in conversation

    for sentence in dict_conv_only.keys():
        compound_sentence = generatesentiment(sentence)
        # add compound score to sentence 
        dict_conv_only[sentence] = compound_sentence
    # add total compound score of sentence to conversation
    compound_score = sum(dict_conv_only.values())/len(dict_conv_only.values())
    tot_pos_sen = len([x for x in dict_conv_only.values() if x > 0])/len(dict_conv_only.values())
    tot_neg_sen = len([x for x in dict_conv_only.values() if x < 0])/len(dict_conv_only.values())
    return compound_score, tot_pos_sen, tot_neg_sen
    
    
def create_user_conv(file):
    #create a dict per conversation of only USER sentences 
    # and 'thankyou goodbye left out'
    conv_only_user = {}
    with open(file, 'r') as f:
        data =f.readlines()
        new_list =[]
        for line in data:
            # only USER input
            if "USER" in line:
                # Remove the word '[USER]'
                line = line.replace("[USER]", "")
                line = line.strip()
                new_list.append(line)
        #strip to get the conversation id 
        file = file.replace('../Untitled folder/allconversations/sat\\', "")   
        file = file.replace('.txt', "") 
        # Remove 'thank you good bye' if in last sentences
        for i,j in dic.items():
            new_list[-1] = new_list[-1].replace(i, j)
            new_list[-2] = new_list[-2].replace(i, j)
        conv_only_user[file] = new_list
        
        # convert to dict 
        # with every conversation a dict with sentences as keys
        # and compound pos neg score (as 0)
        for conv in conv_only_user.values():
            new_dict2 = {}
            for sentence in conv:
                new_dict2[sentence] = 0
        return new_dict2       
        
def generatesentiment(sentence):
    # function for sentiment analysis on sentences
    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(sentence)
    return ss['compound']


   

### create feature dataframe for disatified dataset

In [6]:
columns = ['num_rep','num_rep_per','num_rep_exp','len_conversation', 'total_compound_conv', 'tot_pos_sen', 'tot_neg_sen', 'Is_satisfied']
df_dsat = pd.DataFrame(index=range(len(text_files_dsat)), columns=columns)

In [7]:
for idx,file in enumerate(text_files_dsat):
    num_rep, num_rep_per,num_rep_exp,len_conversation = number_repetition(file)
    compound_score, tot_pos_sen, tot_neg_sen = pos_neg_conv(file)
    df_dsat.iloc[idx] = pd.Series({'num_rep':num_rep, 'num_rep_per':num_rep_per,
                                    'num_rep_exp':num_rep_exp,'len_conversation':len_conversation,
                                       'total_compound_conv':compound_score, 'tot_pos_sen':tot_pos_sen, 
                                       'tot_neg_sen':tot_neg_sen, 'Is_satisfied':0})

In [8]:
df_dsat.head()

Unnamed: 0,num_rep,num_rep_per,num_rep_exp,len_conversation,total_compound_conv,tot_pos_sen,tot_neg_sen,Is_satisfied
0,7,0.291667,21,24,0.0,0.0,0.0,0
1,4,0.333333,2,12,0.10275,0.333333,0.166667,0
2,4,0.285714,2,14,0.0463857,0.142857,0.142857,0
3,0,0.0,0,10,0.0,0.0,0.0,0
4,4,0.25,2,16,0.0,0.0,0.0,0


### create feature dataframe for satified dataset

In [9]:
columns = ['num_rep','num_rep_per','num_rep_exp','len_conversation', 'total_compound_conv', 'tot_pos_sen', 'tot_neg_sen', 'Is_satisfied']
df_sat = pd.DataFrame(index=range(len(text_files_sat)), columns=columns)

In [10]:
for idx,file in enumerate(text_files_sat):
    num_rep, num_rep_per,num_rep_exp,len_conversation = number_repetition(file)
    compound_score, tot_pos_sen, tot_neg_sen = pos_neg_conv(file)
    df_sat.iloc[idx] = pd.Series({'num_rep':num_rep, 'num_rep_per':num_rep_per,
                                    'num_rep_exp':num_rep_exp,'len_conversation':len_conversation,
                                       'total_compound_conv':compound_score, 'tot_pos_sen':tot_pos_sen, 
                                       'tot_neg_sen':tot_neg_sen, 'Is_satisfied':1})

In [11]:
df_dsat.head()

Unnamed: 0,num_rep,num_rep_per,num_rep_exp,len_conversation,total_compound_conv,tot_pos_sen,tot_neg_sen,Is_satisfied
0,7,0.291667,21,24,0.0,0.0,0.0,0
1,4,0.333333,2,12,0.10275,0.333333,0.166667,0
2,4,0.285714,2,14,0.0463857,0.142857,0.142857,0
3,0,0.0,0,10,0.0,0.0,0.0,0
4,4,0.25,2,16,0.0,0.0,0.0,0
