#### Turn a dialogue into a sequence: state -> action -> state -> action -> ... -> terminal state, reward

In [12]:
import glob
import pandas as pd
import numpy as np
from textblob import TextBlob
import pickle

In [13]:
def get_state(utter):
    """Encode one raw dialogue line as four binary features.

    The line is expected to look like ``<speaker>_<action>_HH:MM:SS:<text>``
    (the layout of the conversation logs); only ``<text>`` is analysed.

    Args:
        utter: One raw line of a conversation log.

    Returns:
        A list of four strings, each "0" or "1", in the order
        [question, positive sentiment, long, subjective].
    """
    # The timestamp contributes exactly three colons, so split at most three
    # times: a colon inside the utterance itself stays part of the text
    # instead of cutting the utterance down to its final fragment.
    text = utter.split(":", 3)[-1].strip("\n")
    checks = (is_question, is_positive, is_long, is_subjective)
    return [str(int(check(text))) for check in checks]
    
def is_question(utter):
    """Return True when *utter* ends with a question mark.

    An empty utterance is not a question. ``str.endswith`` is used rather
    than indexing the last character so empty input returns False instead
    of raising IndexError.
    """
    return utter.endswith("?")

def is_positive(utter):
    """Return True when TextBlob's polarity score for *utter* is above zero."""
    blob = TextBlob(utter)
    return blob.sentiment.polarity > 0
    
def is_subjective(utter):
    """Return True when TextBlob's subjectivity score for *utter* exceeds 0.5."""
    blob = TextBlob(utter)
    return blob.sentiment.subjectivity > 0.5
    
def is_long(utter, threshold=20):
    """Return True when *utter* has more than *threshold* characters.

    Args:
        utter: The utterance text.
        threshold: Character count that must be exceeded; defaults to 20,
            the cut-off previously hard-coded here.
    """
    return len(utter) > threshold

def set_reward(df, lastline):
    """Write the ratings from a conversation's final line into the last row of *df*.

    Args:
        df: DataFrame whose last row receives the scores; modified in place.
        lastline: Text such as ``"overall=2,start=2,"`` -- comma-separated
            ``category=score`` pairs, optionally followed by a trailing
            comma and newline.

    Returns:
        The same DataFrame object that was passed in.

    Raises:
        ValueError: If a pair does not contain exactly one "=" or its score
            is not an integer.
    """
    last_row = df.index[-1]
    for rating in lastline.strip("\n").strip(",").split(","):
        category, score = rating.split("=")
        # DataFrame.set_value was deprecated in pandas 0.21 and removed in
        # 1.0; .at is the supported label-based scalar setter.
        df.at[last_row, category] = int(score)
    return df

In [15]:
# Parse every conversation log into a state/action/reward table, keyed by
# the log's file path.
filenames = glob.glob("../data/300_convo/*")
parsed_dict = {}
for filename in filenames:
    # Use a context manager so each file is closed as soon as it is read
    # instead of leaking one handle per conversation.
    with open(filename, "r", errors="replace") as f:
        lines = f.readlines()
    df_sa = pd.DataFrame(columns=["Start", "Question", "Sentiment", "Length", "Subjective", "Previous", 
                                  "Action", "overall", "start", "interupt", "engaing", "return"])
    # The first line is the bot's opening move and is handled separately below.
    bot_lines = [line for line in lines[1:] if "Bot_" in line]
    user_lines = [line for line in lines if "_None_" in line]

    # initial state/action/reward
    action = lines[0].split("_")[1]
    df_sa.loc[0] = ["1","0","0","0","0","None",action,0,0,0,0,0] 
    previous_action = action
    
    # intermediate state/action/reward: each user utterance is the state the
    # following bot action responds to. The last user line has no bot reply,
    # so it is dropped from the pairing.
    for row, (user, bot) in enumerate(zip(user_lines[:-1], bot_lines), start=1):
        action = bot.split("_")[1]
        df_sa.loc[row] = ["0"] + get_state(user) + [previous_action, action] + [0]*5
        previous_action = action
        
    # terminal state/reward
    # Append after the last existing row. Deriving the index from the loop
    # variable would fail (or reuse a stale value from the previous file)
    # when a conversation yields no user/bot pairs.
    df_sa.loc[len(df_sa)] = ["T"]*6 + ["None"] + [0]*5
    df_sa = set_reward(df_sa.copy(), lines[-1])
    
    parsed_dict[filename] = df_sa

In [17]:
# Persist the parsed tables; the context manager guarantees the pickle file
# is flushed and closed even if dumping raises.
with open("../data/parsed_data.pkl", "wb") as f:
    pickle.dump(parsed_dict, f)

In [18]:
# Show the raw text of the last file handled by the parsing loop. It is
# opened with the same errors="replace" policy the parser uses, so a file
# that parsed successfully cannot fail to decode here, and the handle is
# closed afterwards.
with open(filename, "r", errors="replace") as f:
    print(f.read())

Bot_elaborate_20:56:22:You gotta tell me more.
Yue Cao_None_20:56:31:blablablablablablabla
Bot_elaborate_20:56:31:Can you elaborate on that?
Yue Cao_None_20:56:37:Yes
Bot_neutral_20:56:37:haha
Yue Cao_None_20:56:40:haha
Bot_elaborate_20:56:40:Huh?
Yue Cao_None_20:56:44:Huh?
Bot_neutral_20:56:44:Well okay.
Yue Cao_None_20:56:47:Good
Bot_neutral_20:56:47:LOL
Yue Cao_None_20:56:50:LOL
Bot_affirmative_20:56:50:That's hilarious!
Yue Cao_None_20:56:58:That's hilarious
Bot_opinion_20:56:58:Nothing isn't better or worse than anything. Nothing is just nothing.
Yue Cao_None_20:57:14:You a right
Bot_neutral_20:57:14:Really?
Yue Cao_None_20:57:20:right
overall=2,start=2,interupt=2,engaing=4,return=3,


In [19]:
# Display the table built for the last file handled by the parsing loop.
df_sa

Unnamed: 0,Start,Question,Sentiment,Length,Subjective,Previous,Action,overall,start,interupt,engaing,return
0,1,0,0,0,0,,elaborate,0,0,0,0,0
1,0,0,0,1,0,elaborate,elaborate,0,0,0,0,0
2,0,0,0,0,0,elaborate,neutral,0,0,0,0,0
3,0,0,1,0,0,neutral,elaborate,0,0,0,0,0
4,0,1,0,0,0,elaborate,neutral,0,0,0,0,0
5,0,0,1,0,1,neutral,neutral,0,0,0,0,0
6,0,0,1,0,1,neutral,affirmative,0,0,0,0,0
7,0,0,1,0,1,affirmative,opinion,0,0,0,0,0
8,0,0,1,0,1,opinion,neutral,0,0,0,0,0
9,T,T,T,T,T,T,,2,2,2,4,3


In [23]:
# Look up the parsed table for one specific conversation file.
parsed_dict["../data/300_convo/wei_2017-05-05_10.txt"]

Unnamed: 0,Start,Question,Sentiment,Length,Subjective,Previous,Action,overall,start,interupt,engaing,return
0,1,0,0,0,0,,question,0,0,0,0,0
1,0,0,1,1,0,question,elaborate,0,0,0,0,0
2,0,0,1,1,1,elaborate,affirmative,0,0,0,0,0
3,0,1,0,1,0,affirmative,opinion,0,0,0,0,0
4,0,0,1,1,1,opinion,opinion,0,0,0,0,0
5,0,0,1,1,0,opinion,affirmative,0,0,0,0,0
6,T,T,T,T,T,T,,5,5,5,4,5
