#### Turn a dialogue into state -> action -> state -> action -> ... -> terminal state, reward

In [128]:
import glob
import pandas as pd
import numpy as np
from textblob import TextBlob
import pickle

In [135]:
def get_state(utter):
    """Encode one utterance as four binary features.

    Args:
        utter: A log line of the form
            ``<speaker>_<action>_<HH>:<MM>:<SS>:<text>``.

    Returns:
        A list ``[question, sentiment, subjectivity, length]`` of 0/1 ints,
        produced by ``is_question``, ``is_positive``, ``is_subjective`` and
        ``is_long`` applied to the message text.
    """
    # The timestamp contributes exactly three colons; capping the split there
    # keeps colons that belong to the message text itself instead of
    # truncating the message to whatever follows its last colon.
    text = utter.split(":", 3)[-1].strip("\n")
    checks = (is_question, is_positive, is_subjective, is_long)
    return [int(check(text)) for check in checks]
    
def is_question(utter):
    """Return True when *utter* ends with a question mark.

    An empty utterance is reported as not a question instead of raising
    ``IndexError``.
    """
    return utter.endswith("?")

def is_positive(utter):
    """Return True when TextBlob's sentiment polarity for *utter* is above zero."""
    return TextBlob(utter).sentiment.polarity > 0
    
def is_subjective(utter):
    """Return True when TextBlob's subjectivity score for *utter* is above zero."""
    return TextBlob(utter).sentiment.subjectivity > 0
    
def is_long(utter, threshold=20):
    """Return True when *utter* has more than *threshold* characters.

    Args:
        utter: The utterance text.
        threshold: Length that must be strictly exceeded; defaults to 20.
    """
    return len(utter) > threshold

def parse_reward(lastline):
    """Parse the rating line that closes a conversation log.

    The line is a comma-separated list of ``name=value`` pairs, e.g.
    ``overall=2,start=2,interupt=2,engaing=4,return=3,``.

    Args:
        lastline: The rating line; a trailing newline and surrounding commas
            are ignored.

    Returns:
        A list of five ints ordered as overall, start, interupt, engaing,
        return.

    Raises:
        IndexError: If one of the five categories is absent from the line.
        ValueError: If a category's value is not an integer.
    """
    # Category names keep the spelling used in the log files.
    categories = ("overall", "start", "interupt", "engaing", "return")

    ratings = {}
    for item in lastline.strip("\n").strip(",").split(","):
        parts = item.split("=")
        if len(parts) < 2:
            continue  # not a name=value pair
        # Match on the exact key so that e.g. "restart=9" is never read as
        # "start"; the first occurrence of a key wins.
        ratings.setdefault(parts[0].strip(), parts[1])

    missing = [name for name in categories if name not in ratings]
    if missing:
        raise IndexError(
            "rating line %r lacks categories: %s" % (lastline, ", ".join(missing))
        )
    return [int(ratings[name]) for name in categories]

In [140]:
# Parse every conversation log into a state/action/reward table, keyed by the
# log's file path.
filenames = glob.glob("../data/300_convo/*")
parsed_dict = {}
for filename in filenames:
    # Close each log as soon as it is read instead of leaking one handle per file.
    with open(filename, "r", errors="replace") as log_file:
        lines = log_file.readlines()
    df_sa = pd.DataFrame(columns=["Start", "Question", "Sentiment", "Subjectivity", "Length",
                                  "Action", "overall", "start", "interupt", "engaing", "return"])
    # The first line is handled separately as the opening action, so bot
    # replies are collected from the second line onwards.
    bot_lines = [line for line in lines[1:] if "Bot_" in line]
    user_lines = [line for line in lines if "_None_" in line]

    # initial state/action/reward
    df_sa.loc[0] = [1, 0, 0, 0, 0, lines[0].split("_")[1], 0, 0, 0, 0, 0]

    # intermediate state/action/reward: each user utterance is the state and
    # the bot reply that follows it is the action; the final user utterance
    # has no reply and is dropped.
    for i, (user, bot) in enumerate(zip(user_lines[:-1], bot_lines)):
        df_sa.loc[i+1] = [0] + get_state(user) + [bot.split("_")[1]] + [0] * 5

    # terminal state/reward: append after the last existing row. Indexing by
    # the row count (rather than the loop variable) stays correct when a
    # conversation has no intermediate turns, where `i` would be undefined or
    # left over from the previous file.
    df_sa.loc[len(df_sa)] = ["T"] * 5 + ["None"] + parse_reward(lines[-1])
    parsed_dict[filename] = df_sa

In [143]:
# Persist the parsed tables; the context manager guarantees the pickle file
# is flushed and closed even if serialization fails.
with open("../data/parsed_data.pkl", "wb") as pkl_file:
    pickle.dump(parsed_dict, pkl_file)

In [141]:
# Show the raw log of the last file processed. It is opened with the same
# errors="replace" policy as the parsing loop so undecodable bytes cannot
# raise here, and the handle is closed afterwards.
with open(filename, "r", errors="replace") as log_file:
    print(log_file.read())

Bot_elaborate_20:56:22:You gotta tell me more.
Yue Cao_None_20:56:31:blablablablablablabla
Bot_elaborate_20:56:31:Can you elaborate on that?
Yue Cao_None_20:56:37:Yes
Bot_neutral_20:56:37:haha
Yue Cao_None_20:56:40:haha
Bot_elaborate_20:56:40:Huh?
Yue Cao_None_20:56:44:Huh?
Bot_neutral_20:56:44:Well okay.
Yue Cao_None_20:56:47:Good
Bot_neutral_20:56:47:LOL
Yue Cao_None_20:56:50:LOL
Bot_affirmative_20:56:50:That's hilarious!
Yue Cao_None_20:56:58:That's hilarious
Bot_opinion_20:56:58:Nothing isn't better or worse than anything. Nothing is just nothing.
Yue Cao_None_20:57:14:You a right
Bot_neutral_20:57:14:Really?
Yue Cao_None_20:57:20:right
overall=2,start=2,interupt=2,engaing=4,return=3,


In [142]:
# Display the state/action/reward table built for the last file processed by
# the parsing loop.
df_sa

Unnamed: 0,Start,Question,Sentiment,Subjectivity,Length,Action,overall,start,interupt,engaing,return
0,1,0,0,0,0,elaborate,0,0,0,0,0
1,0,0,0,0,1,elaborate,0,0,0,0,0
2,0,0,0,0,0,neutral,0,0,0,0,0
3,0,0,1,1,0,elaborate,0,0,0,0,0
4,0,1,0,0,0,neutral,0,0,0,0,0
5,0,0,1,1,0,neutral,0,0,0,0,0
6,0,0,1,1,0,affirmative,0,0,0,0,0
7,0,0,1,1,0,opinion,0,0,0,0,0
8,0,0,1,1,0,neutral,0,0,0,0,0
9,T,T,T,T,T,,2,2,2,4,3
