Following this Q-Learning algorithm:
![Q-learnng](./q_learning.png)

### Learning beginning of sentence
state: {"Start": [0,1]}  
actions: question, opinion, elaborate, affirm, negate, neutral  
reward: start score

In [396]:
import pandas as pd
from pprint import pprint as pp
import itertools
import pickle
import functools

%autosave 40

Autosaving every 40 seconds


In [116]:
dialogs = pickle.load(open("../data/parsed_data.pkl", "rb"))

In [366]:
class RL_BEGIN(object):
    def __init__(self, states=[], reward=[], gamma=0.9, alpha=0.5):
        self.states = states
        self.actions = ["question", "opinion", "elaborate", "affirmative", "negative", "neutral"]
        self.reward = reward
        self.alpha = alpha
        self.gamma = gamma
        self.q = self.initialize(states, self.actions)

    def initialize(self, states, actions):
        df = pd.DataFrame(columns=["State","Action","Value"],
                          data=list(itertools.product(["1"], actions,[0])))
        return df
    
    def update_q(self, dialogue, n_iters=1):
        """ update q table based on one dialogue, 
            returns: the intermediate q values during iteration, only works on first example
            in this RL_BEGIN class, dialogues only have a start and reward, middle is removed
        """
        dialogue_short = dialogue.drop(dialogue.index[1:-1])
        dialogue_short.reset_index(drop=True, inplace=True)
        q_mid = self.q.copy()
        q_mid.rename(columns={"Value": "iter0"}, inplace=True)
        for i in range(n_iters):
            q_mid["iter{0}".format(i+1)] = q_mid["iter{0}".format(i)].copy()
            for turn in dialogue_short.index:
                # get values of current state
                action = dialogue_short.loc[turn]["Action"]
                states = "".join(list(dialogue_short.loc[turn][self.states]))
                if "T" in states:
                    break
                q = float(self.q[(self.q.Action==action)&(self.q.State==states)]["Value"])
                reward = np.mean(list(dialogue_short.loc[turn][self.reward]))
                
                # get values of next state
                next_turn = dialogue_short.loc[turn+1]
                next_states = "".join(list(next_turn[self.states]))
                next_max_q = self.find_max_q(next_states, i)
                if "T" in next_states:
                    reward = np.mean(list(next_turn[self.reward]))
                    
                # update q based on current reward, current q and next state q
                q = q + self.alpha * (reward + self.gamma * next_max_q - q)
                self.q.loc[(self.q.Action==action)&(self.q.State==states), "Value"] = q
                q_mid.loc[(q_mid.Action==action)&(q_mid.State==states), "iter{0}".format(i+1)] = q
        return q_mid
    
    
    def find_max_q(self, next_states, iteration):
        if "T" in next_states:
            return 0
        else:
            q = self.q[(self.q.State==next_states)]
            return max(list(q["Value"]))

In [367]:
starter_RL = RL_BEGIN(states=["Start"], reward=["start"])
q_mid = starter_RL.update_q(example, n_iters=10)

In [368]:
example[["Start", "Action", "start"]]

Unnamed: 0,Start,Action,start
0,1,opinion,0
1,0,elaborate,0
2,0,question,0
3,0,opinion,0
4,0,question,0
5,0,question,0
6,0,affirmative,0
7,0,neutral,0
8,0,negative,0
9,0,affirmative,0


In [369]:
starter_RL.q

Unnamed: 0,State,Action,Value
0,1,question,0.0
1,1,opinion,3.996094
2,1,elaborate,0.0
3,1,affirmative,0.0
4,1,negative,0.0
5,1,neutral,0.0


In [363]:
q_mid

Unnamed: 0,Beginning,Action,iter0,iter1,iter2,iter3,iter4,iter5,iter6,iter7,iter8,iter9,iter10
0,1,question,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,opinion,0,2.0,3.0,3.5,3.75,3.875,3.9375,3.96875,3.984375,3.992188,3.996094
2,1,elaborate,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,affirmative,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,negative,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1,neutral,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [278]:
starter_RL = RL_BEGIN(states=["Start"], reward=["start"],
                actions=["question", "opinion", "elaborate", "affirmative", "negative", "neutral"])
for key, value in dialogs.items():
    print(".", end="")
    starter_RL.update_q(value, n_iters=10)

.......................................................................................................................................................................................................................................................................................................................................

In [279]:
starter_RL.q

Unnamed: 0,State,Action,Value
0,1,question,5.0
1,1,opinion,2.999024
2,1,elaborate,1.998048
3,1,affirmative,0.000978
4,1,negative,1.0
5,1,neutral,0.999999


### Learning question/answer
state: "Question: 0/1" 
actions: question, opinion, elaborate, affirm, negate, neutral  
reward: intertupt score

In [371]:
class RL_QA(object):
    def __init__(self, states=[], reward=[], gamma=0.9, alpha=0.5):
        self.states = states
        self.actions = ["question", "opinion", "elaborate", "affirmative", "negative", "neutral"]
        self.reward = reward
        self.alpha = alpha
        self.gamma = gamma
        self.q = self.initialize(states, self.actions)

    def initialize(self, states, actions):
        df = pd.DataFrame(columns=["State","Action","Value"],
                          data=list(itertools.product(["0", "1"], actions,[0])))
        return df
    
    def update_q(self, dialogue, n_iters=1):
        """ update q table based on one dialogue, 
            returns: the intermediate q values during iteration, only works on first example
        """
        q_mid = self.q.copy()
        q_mid.rename(columns={"Value": "iter0"}, inplace=True)
        for i in range(n_iters):
            q_mid["iter{0}".format(i+1)] = q_mid["iter{0}".format(i)].copy()
            for turn in dialogue.index:
                # get values of current state
                action = dialogue.loc[turn]["Action"]
                states = "".join(list(dialogue.loc[turn][self.states]))
                if "T" in states:
                    break
                q = float(self.q[(self.q.Action==action)&(self.q.State==states)]["Value"])
                reward = np.mean(list(dialogue.loc[turn][self.reward]))
                
                # get values of next state
                next_turn = dialogue.loc[turn+1]
                next_states = "".join(list(next_turn[self.states]))
                next_max_q = self.find_max_q(next_states, i)
                if "T" in next_states:
                    reward = np.mean(list(next_turn[self.reward]))
                    
                # update q based on current reward, current q and next state q
                q = q + self.alpha * (reward + self.gamma * next_max_q - q)
                self.q.loc[(self.q.Action==action)&(self.q.State==states), "Value"] = q
                q_mid.loc[(q_mid.Action==action)&(q_mid.State==states), "iter{0}".format(i+1)] = q
#             print(self.q)
        return q_mid
    
    
    def find_max_q(self, next_states, iteration):
        if "T" in next_states:
            return 0
        else:
            q = self.q[(self.q.State==next_states)]
            return max(list(q["Value"]))
        
def find_examples_w_user_question(dialogs):
    for filename, df in dialogs.items():
        if "1" in list(df["Question"]) and df.iloc[-1]["interupt"] > 3:
            return df

In [347]:
example = find_examples_w_user_question(dialogs)
question_RL = RL_QA(states=["Question"], reward=["interupt"])
q_mid = question_RL.update_q(example, n_iters=10)

In [348]:
example[["Question", "Action", "interupt"]]

Unnamed: 0,Question,Action,interupt
0,0,opinion,0
1,1,elaborate,0
2,0,question,0
3,0,opinion,0
4,0,question,0
5,0,question,0
6,0,affirmative,0
7,0,neutral,0
8,0,negative,0
9,0,affirmative,0


In [349]:
question_RL.q

Unnamed: 0,State,Action,Value
0,0,question,3.42356
1,0,opinion,3.286644
2,0,elaborate,0.0
3,0,affirmative,3.807296
4,0,negative,3.228435
5,0,neutral,3.228435
6,1,question,0.0
7,1,opinion,0.0
8,1,elaborate,3.398353
9,1,affirmative,0.0


In [350]:
q_mid

Unnamed: 0,State,Action,iter0,iter1,iter2,iter3,iter4,iter5,iter6,iter7,iter8,iter9,iter10
0,0,question,0,0.0,1.575,2.52,2.993484,3.221466,3.330141,3.38181,3.406359,3.41802,3.42356
1,0,opinion,0,0.0,0.9,1.755,2.36925,2.763675,3.002003,3.14061,3.219126,3.262755,3.286644
2,0,elaborate,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,affirmative,0,2.0,2.95,3.40125,3.615594,3.717407,3.765768,3.78874,3.799651,3.804834,3.807296
4,0,negative,0,0.0,0.855,1.688625,2.298347,2.69484,2.936611,3.078172,3.158772,3.203737,3.228435
5,0,neutral,0,0.0,0.855,1.688625,2.298347,2.69484,2.936611,3.078172,3.158772,3.203737,3.228435
6,1,question,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1,opinion,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1,elaborate,0,0.0,0.9,1.7775,2.419313,2.836673,3.09117,3.240181,3.325023,3.372355,3.398353
9,1,affirmative,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [351]:
question_RL = RL_QA(states=["Question"], reward=["interupt"])
for key, value in dialogs.items():
    print(".", end="")
    question_RL.update_q(value, n_iters=10)

.......................................................................................................................................................................................................................................................................................................................................

In [352]:
question_RL.q

Unnamed: 0,State,Action,Value
0,0,question,4.480888
1,0,opinion,4.033994
2,0,elaborate,4.033066
3,0,affirmative,4.035225
4,0,negative,4.481397
5,0,neutral,2.871396
6,1,question,3.238329
7,1,opinion,4.480069
8,1,elaborate,2.001952
9,1,affirmative,4.481025


### Learning Everything (except for Beginning)
state: Question: 0/1, Sentiment: 0/1, Subjectivity: 0/1, Length: 0/1   
actions: question, opinion, elaborate, affirm, negate, neutral  
reward: overall score

In [428]:
class RL(object):
    def __init__(self, reward=[], gamma=0.9, alpha=0.5):
        self.states = ["Question", "Sentiment", "Subjectivity", "Length"]
        self.actions = ["question", "opinion", "elaborate", "affirmative", "negative", "neutral"]
        self.reward = reward
        self.alpha = alpha
        self.gamma = gamma
        self.q = self.initialize(self.states, self.actions)

    def initialize(self, states, actions):
        df = pd.DataFrame(columns = self.states + ["Action","Value"],
                          data=list(itertools.product([0,1],[0,1],[0,1],[0,1],actions,[0])))
        return df
    
    def update_q(self, dialogue, n_iters=1):
        """ update q table based on one dialogue, 
            returns: the intermediate q values during iteration, only works on first example
        """
        q_mid = self.q.copy()
        q_mid.rename(columns={"Value": "iter0"}, inplace=True)
        for i in range(n_iters):
            q_mid["iter{0}".format(i+1)] = q_mid["iter{0}".format(i)].copy()
            for turn in dialogue.index:
                # get values of current state
                action = dialogue.loc[turn]["Action"]
                states = dialogue.loc[turn][self.states]
                if "T" in list(states):
                    break
                c1 = self.q.Action==action
                c2 = self.q.Question==int(states.Question)
                c3 = self.q.Sentiment==int(states.Sentiment)
                c4 = self.q.Subjectivity==int(states.Subjectivity)
                c5 = self.q.Length==int(states.Length)
                                            
                q = float(self.q[conjunction(c1,c2,c3,c4,c5)]["Value"])
                reward = np.mean(list(dialogue.loc[turn][self.reward]))
                # get values of next state
                next_turn = dialogue.loc[turn+1]
                next_states = "".join(list(next_turn[self.states]))
                next_max_q = self.find_max_q(next_states, i, query=conjunction(c2,c3,c4,c5))
                if "T" in next_states:
                    reward = np.mean(list(next_turn[self.reward]))
                    
                # update q based on current reward, current q and next state q
                q = q + self.alpha * (reward + self.gamma * next_max_q - q)
                self.q.loc[conjunction(c1,c2,c3), "Value"] = q
                
                m1 = q_mid.Action==action
                m2 = q_mid.Question==int(states.Question)
                m3 = q_mid.Sentiment==int(states.Sentiment)
                m4 = q_mid.Subjectivity==int(states.Subjectivity)
                m5 = q_mid.Length==int(states.Length)
                q_mid.loc[conjunction(m1,m2,m3,m4,m5), "iter{0}".format(i+1)] = q
        return q_mid
    
    
    def find_max_q(self, next_states, iteration, query=None):
        if "T" in next_states:
            return 0
        else:
            q = self.q[query]
            return max(list(q["Value"]))

                                            
def conjunction(*conditions):
    return functools.reduce(np.logical_and, conditions)


def get_example(dialogs):
    for filename, df in dialogs.items():
        if df.iloc[-1]["overall"] > 3:
            return df

In [429]:
all_RL = RL(reward=["overall"])

In [430]:
example = get_example(dialogs)
q_mid = all_RL.update_q(example, n_iters=10)

In [431]:
example[all_RL.states + ["Action", "overall"]]

Unnamed: 0,Question,Sentiment,Subjectivity,Length,Action,overall
0,0,0,0,0,question,0
1,0,0,1,0,elaborate,0
2,0,0,1,1,affirmative,0
3,0,1,1,0,affirmative,0
4,0,0,0,0,opinion,0
5,0,0,0,1,neutral,0
6,0,0,0,0,question,0
7,T,T,T,T,,4


In [432]:
all_RL.q

Unnamed: 0,Question,Sentiment,Subjectivity,Length,Action,Value
0,0,0,0,0,question,3.807296
1,0,0,0,0,opinion,3.228435
2,0,0,0,0,elaborate,3.228435
3,0,0,0,0,affirmative,3.228435
4,0,0,0,0,negative,0.000000
5,0,0,0,0,neutral,3.228435
6,0,0,0,1,question,3.807296
7,0,0,0,1,opinion,3.228435
8,0,0,0,1,elaborate,3.228435
9,0,0,0,1,affirmative,3.228435


In [433]:
q_mid

Unnamed: 0,Question,Sentiment,Subjectivity,Length,Action,iter0,iter1,iter2,iter3,iter4,iter5,iter6,iter7,iter8,iter9,iter10
0,0,0,0,0,question,0,2.0,2.950,3.401250,3.615594,3.717407,3.765768,3.788740,3.799651,3.804834,3.807296
1,0,0,0,0,opinion,0,0.0,0.855,1.688625,2.298347,2.694840,2.936611,3.078172,3.158772,3.203737,3.228435
2,0,0,0,0,elaborate,0,0.0,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0,0,0,0,affirmative,0,0.0,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0,0,0,0,negative,0,0.0,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,0,0,0,0,neutral,0,0.0,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,0,0,0,1,question,0,0.0,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,0,0,0,1,opinion,0,0.0,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,0,0,0,1,elaborate,0,0.0,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,0,0,0,1,affirmative,0,0.0,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [434]:
all_RL = RL(reward=["overall"])
for key, value in dialogs.items():
    print(".", end="")
    all_RL.update_q(value, n_iters=10)

.......................................................................................................................................................................................................................................................................................................................................

In [437]:
all_RL.q.sort_values(by="Value", ascending=False)

Unnamed: 0,Question,Sentiment,Subjectivity,Length,Action,Value
50,1,0,0,0,elaborate,3.000976
68,1,0,1,1,elaborate,3.000976
62,1,0,1,0,elaborate,3.000976
56,1,0,0,1,elaborate,3.000976
58,1,0,0,1,negative,3.000584
64,1,0,1,0,negative,3.000584
70,1,0,1,1,negative,3.000584
52,1,0,0,0,negative,3.000584
3,0,0,0,0,affirmative,2.998347
21,0,0,1,1,affirmative,2.998347
