#### Turn a dialogue into a sequence: state -> action -> state -> action -> ... -> terminal state, reward

In [128]:
import glob
import pandas as pd
import numpy as np
from textblob import TextBlob
import pickle

In [125]:
def get_state(utter):
    """Encode one transcript line as a four-feature state vector.

    Args:
        utter: A transcript line such as ``abc_None_23:55:54:Not sure``
            (``<speaker>_<action>_<HH:MM:SS>:<text>``), or bare text.

    Returns:
        list: ``[question, sentiment, subjectivity, length]`` where
        ``question`` and ``length`` are the 0/1 results of ``is_question``
        and ``is_long``, and ``sentiment`` / ``subjectivity`` are whatever
        ``is_positive`` / ``is_subjective`` return for the text.
    """
    # The timestamp contributes exactly three colons, so cap the split at
    # three: splitting on every colon would keep only the tail of a message
    # that itself contains ":" (e.g. "Note: I liked it").
    text = utter.split(":", 3)[-1].strip("\n")
    return [
        int(is_question(text)),
        is_positive(text),
        is_subjective(text),
        int(is_long(text)),
    ]
    
def is_question(utter):
    """Return True when *utter* ends with a question mark.

    Trailing whitespace is ignored, and an empty utterance is reported as
    "not a question" instead of raising ``IndexError``.
    """
    return utter.rstrip().endswith("?")

def is_positive(utter):
    """Bucket the TextBlob polarity of *utter* into a three-way sentiment.

    Returns:
        int: 1 when polarity is above 0.2, -1 when it is below -0.2,
        and 0 for anything in between.
    """
    polarity = TextBlob(utter).sentiment.polarity
    # At most one comparison can be true, so the difference is -1, 0 or 1.
    return int(polarity > 0.2) - int(polarity < -0.2)
    
def is_subjective(utter):
    """Flag whether *utter* reads as subjective.

    TextBlob reports subjectivity on a 0.0 (objective) to 1.0 (subjective)
    scale, so there is no negative bucket here: a ``< -0.2`` test could
    never be true.

    Returns:
        int: 1 when subjectivity is above 0.2, otherwise 0.
    """
    subjectivity = TextBlob(utter).sentiment.subjectivity
    return 1 if subjectivity > 0.2 else 0
    
def is_long(utter, threshold=20):
    """Return True when *utter* has more than *threshold* characters.

    Args:
        utter: The utterance text.
        threshold: Character count that must be exceeded for the utterance
            to count as long. Defaults to 20.
    """
    return len(utter) > threshold

def parse_reward(lastline):
    """Parse the ratings line that ends a transcript.

    Args:
        lastline: Comma-separated ``name=value`` pairs, e.g.
            ``"overall=4,start=4,interupt=4,engaing=4,return=4,\\n"``.
            Trailing commas, newlines and unrelated fields are tolerated.

    Returns:
        list[int]: Ratings in the fixed order
        ``overall, start, interupt, engaing, return``.

    Raises:
        ValueError: If a category is missing or its value is not an integer.
    """
    # Spellings mirror the keys used in the data files; do not "correct" them.
    categories = ("overall", "start", "interupt", "engaing", "return")

    # Index the fields once by exact name. Substring matching would let a
    # field such as "restart=9" be picked up as the "start" rating.
    ratings = {}
    for field in lastline.strip().split(","):
        name, sep, value = field.partition("=")
        if sep:
            # setdefault keeps the first occurrence of a repeated name.
            ratings.setdefault(name.strip(), value)

    missing = [name for name in categories if name not in ratings]
    if missing:
        raise ValueError(
            "ratings line is missing %s: %r" % (", ".join(missing), lastline)
        )
    return [int(ratings[name]) for name in categories]

In [126]:
# Parse every transcript under ../data/300_convo into a per-file
# state/action/reward table, keyed by the transcript's path.
filenames = glob.glob("../data/300_convo/*")
parsed_dict = {}
for filename in filenames:
    # `with` closes the handle promptly; undecodable bytes are replaced
    # rather than aborting the whole run.
    with open(filename, "r", errors="replace") as transcript:
        # Drop blank lines so lines[0] is the opening bot line and lines[-1]
        # is the ratings line even when the file has stray empty lines.
        lines = [line for line in transcript if line.strip()]
    df_sa = pd.DataFrame(columns=["Start", "Question", "Sentiment", "Subjectivity", "Length", 
                                  "Action", "overall", "start", "interupt", "engaing", "return"])
    # The opening bot line is excluded: its action belongs to the initial row.
    bot_lines = [i for i in lines[1:] if "Bot_" in i]
    user_lines = [i for i in lines if "_None_" in i]

    # initial state/action/reward
    df_sa.loc[0] = [1,0,0,0,0,lines[0].split("_")[1],0,0,0,0,0] 

    # intermediate state/action/reward: each user reply is the state, the
    # bot utterance that follows it is the action.
    # Rows are labelled with len(df_sa) so the index never depends on a loop
    # counter that may be unset (or left over from the previous file) when
    # there are no intermediate turns.
    for user, bot in zip(user_lines[:-1], bot_lines):
        df_sa.loc[len(df_sa)] = [0] + get_state(user) + [bot.split("_")[1]] + [0] * 5

    # terminal state/reward
    df_sa.loc[len(df_sa)] = [0] + get_state(user_lines[-1]) + ["None"] + parse_reward(lines[-1])
    parsed_dict[filename] = df_sa

In [129]:
# Persist the parsed tables; the `with` block guarantees the pickle is
# flushed and the handle closed before the cell finishes.
with open("../data/parsed_data.pkl", "wb") as pkl_file:
    pickle.dump(parsed_dict, pkl_file)

In [133]:
# Show the raw transcript for whatever `filename` currently holds. It is read
# with the same errors="replace" policy used when parsing, so a file that
# parsed cleanly cannot fail to decode here, and the handle is closed.
with open(filename, "r", errors="replace") as transcript:
    print(transcript.read())

Bot_question_23:52:05:What plot would you want to happen differently?
abc_None_23:55:54:Not sure
Bot_elaborate_23:55:54:What?
abc_None_23:56:20:not sure what plot I want to happen differently
Bot_affirmative_23:56:20:Awesome
abc_None_23:56:33:Thanks!
Bot_affirmative_23:56:33:That's neat!
abc_None_23:56:49:alright!
Bot_opinion_23:56:49:Life is not a song, sweetling. Someday you may learn that, to your sorrow.
abc_None_00:00:23:May the force be with you
Bot_neutral_00:00:23:haha
abc_None_00:00:30:hohoho
Bot_question_00:00:30:What is true happiness?
abc_None_00:02:51:Bot becomes more smart
overall=4,start=4,interupt=4,engaing=4,return=4,


In [131]:
# Display the state/action/reward table.
# NOTE(review): `df` is not assigned in any cell shown here; the "Unnamed: 0"
# column in the output suggests it was read back from a CSV elsewhere. Confirm.
df

Unnamed: 0,Start,Question,Sentiment,Subjectivity,Length,Action,overall,start,interupt,engaing,return
0,1,0,0,0,0,question,0,0,0,0,0
1,0,0,-1,1,0,elaborate,0,0,0,0,0
2,0,0,0,1,1,affirmative,0,0,0,0,0
3,0,0,1,0,0,affirmative,0,0,0,0,0
4,0,0,0,0,0,opinion,0,0,0,0,0
5,0,0,0,0,1,neutral,0,0,0,0,0
6,0,0,0,0,0,question,0,0,0,0,0
7,0,0,1,1,1,,4,4,4,4,4
