This notebook follows the Q-learning algorithm shown below:
![Q-learning](./q_learning.png)

### Learning beginning of sentence
state: {"Start": [0,1]}  
actions: question, opinion, elaborate, affirmative, negative, neutral  
reward: start score

In [1]:
import itertools
import pickle
from pprint import pprint as pp

import numpy as np
import pandas as pd

%autosave 40

Autosaving every 40 seconds


In [116]:
# Load the parsed dialogues; the context manager closes the file handle once
# unpickling is done.
# NOTE: pickle can run arbitrary code while loading - only open files you trust.
with open("../data/parsed_data.pkl", "rb") as parsed_file:
    dialogs = pickle.load(parsed_file)

In [271]:
class RL_BEGIN(object):
    """Tabular Q-learning for the opening move of a dialogue.

    Every dialogue is reduced to its first row (the opening turn) and its last
    row (where the terminal marker "T" and the final reward are looked for),
    so the table only holds values for the start state "1".

    Attributes:
        states: names of the dialogue columns joined into the state key.
        actions: action labels that get a row in the Q-table.
        reward: names of the dialogue columns averaged into the reward.
        alpha: learning rate of the update rule.
        gamma: discount applied to the best next-state value.
        q: DataFrame with columns "State", "Action", "Value".
    """

    def __init__(self, states=[], actions=[], reward=[], gamma=0.9, alpha=0.5):
        # Store copies: the list defaults are shared between calls, so keeping
        # the argument objects would alias every default-constructed instance.
        self.states = self._as_list(states)
        self.actions = self._as_list(actions)
        self.reward = self._as_list(reward)
        self.alpha = alpha
        self.gamma = gamma
        self.q = self.initialize(self.states, self.actions)

    @staticmethod
    def _as_list(names):
        """Return `names` as a new list; a bare string counts as one name."""
        if isinstance(names, str):
            return [names]
        return list(names)

    def initialize(self, states, actions):
        """Build the Q-table: one row per action for start state "1", all 0.0.

        `states` is accepted but not consulted; the state key is fixed to "1".
        Values start as floats so later float updates do not have to upcast
        an integer column.
        """
        rows = list(itertools.product(["1"], actions, [0.0]))
        return pd.DataFrame(columns=["State", "Action", "Value"], data=rows)

    def update_q(self, dialogue, n_iters=1):
        """Run `n_iters` Q-learning sweeps over one dialogue.

        Only the first and last rows of `dialogue` are used; everything in
        between is dropped before learning.

        Args:
            dialogue: DataFrame with an "Action" column plus the configured
                state and reward columns.
            n_iters: number of sweeps over the shortened dialogue.

        Returns:
            A snapshot of the Q-table taken before the update, with "Value"
            renamed to "iter0" and one additional column "iter<k>" holding
            the values after sweep k.

        Raises:
            KeyError: if a visited (state, action) pair has no row in the
                Q-table, or a non-terminal row has no following row.
        """
        dialogue_short = dialogue.drop(dialogue.index[1:-1]).reset_index(drop=True)
        q_mid = self.q.rename(columns={"Value": "iter0"})
        for i in range(n_iters):
            sweep_col = "iter{0}".format(i + 1)
            q_mid[sweep_col] = q_mid["iter{0}".format(i)].copy()
            for turn in dialogue_short.index:
                # get values of current state
                current = dialogue_short.loc[turn]
                action = current["Action"]
                states = "".join(list(current[self.states]))
                if "T" in states:
                    break
                mask = (self.q.Action == action) & (self.q.State == states)
                matches = self.q.loc[mask, "Value"]
                if matches.empty:
                    raise KeyError(
                        "no Q-table row for state {0!r} and action {1!r}".format(states, action)
                    )
                q = float(matches.iloc[0])
                reward = np.mean(list(current[self.reward]))

                # get values of next state
                next_turn = dialogue_short.loc[turn + 1]
                next_states = "".join(list(next_turn[self.states]))
                next_max_q = self.find_max_q(next_states, i)
                if "T" in next_states:
                    # the terminal row's reward replaces the current row's
                    reward = np.mean(list(next_turn[self.reward]))

                # update q based on current reward, current q and next state q
                q = q + self.alpha * (reward + self.gamma * next_max_q - q)
                self.q.loc[mask, "Value"] = q
                # q_mid shares self.q's row index, so the same mask applies
                q_mid.loc[mask, sweep_col] = q
        return q_mid

    def find_max_q(self, next_states, iteration):
        """Return the largest stored Q-value for `next_states`.

        A state key containing "T" is terminal and is worth 0. `iteration` is
        not used; it is kept so existing callers keep working.

        Raises:
            ValueError: if a non-terminal state has no rows in the Q-table.
        """
        if "T" in next_states:
            return 0
        values = self.q.loc[self.q.State == next_states, "Value"]
        return max(list(values))

In [273]:
# Train the start-of-dialogue learner on one dialogue for ten sweeps and keep
# the per-sweep history.
# NOTE(review): `example` is not assigned in any visible cell - presumably one
# entry of `dialogs`; assign it above this cell so Restart & Run All works.
starter_RL = RL_BEGIN(
    states=["Start"],
    actions=["question", "opinion", "elaborate", "affirmative", "negative", "neutral"],
    reward=["start"],
)
q_mid = starter_RL.update_q(example, n_iters=10)

In [272]:
# Show the columns the start-of-dialogue learner reads from the example:
# the state ("Start"), the "Action" and the reward ("start").
example[["Start", "Action", "start"]]

Unnamed: 0,Start,Action,start
0,1,question,0
1,0,affirmative,0
2,0,elaborate,0
3,0,question,0
4,0,elaborate,0
5,0,opinion,0
6,0,neutral,0
7,0,opinion,0
8,0,elaborate,0
9,T,,3


In [274]:
# Q-table held by starter_RL after the sweeps over the single example dialogue.
starter_RL.q

Unnamed: 0,State,Action,Value
0,1,question,2.99707
1,1,opinion,0.0
2,1,elaborate,0.0
3,1,affirmative,0.0
4,1,negative,0.0
5,1,neutral,0.0


In [275]:
# Per-sweep history returned by update_q: iter0 is the table before the update,
# iterN holds the values after sweep N.
q_mid

Unnamed: 0,State,Action,iter0,iter1,iter2,iter3,iter4,iter5,iter6,iter7,iter8,iter9,iter10
0,1,question,0,1.5,2.25,2.625,2.8125,2.90625,2.953125,2.976562,2.988281,2.994141,2.99707
1,1,opinion,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,elaborate,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,affirmative,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,negative,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1,neutral,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [278]:
# Start from a fresh learner and feed it every parsed dialogue, ten sweeps
# each; one dot is printed per dialogue as a progress marker.
starter_RL = RL_BEGIN(
    states=["Start"],
    actions=["question", "opinion", "elaborate", "affirmative", "negative", "neutral"],
    reward=["start"],
)
for _, dialogue in dialogs.items():
    print(".", end="")
    starter_RL.update_q(dialogue, n_iters=10)

.......................................................................................................................................................................................................................................................................................................................................

In [279]:
# Q-table held by starter_RL after the loop over every dialogue in `dialogs`.
starter_RL.q

Unnamed: 0,State,Action,Value
0,1,question,5.0
1,1,opinion,2.999024
2,1,elaborate,1.998048
3,1,affirmative,0.000978
4,1,negative,1.0
5,1,neutral,0.999999


### Learning question/answer
state: "Question"  
actions: "Q": question, "O": opinion, "E": elaborate, "Yes": affirm, "N": negate, "W": neutral  
reward: interrupt score

In [232]:
class RL_QA(object):
    """Tabular Q-learning over every turn of a dialogue.

    Unlike a start-only learner, each row of the dialogue is visited, so the
    Q-table has to cover every state key that shows up. The table is seeded
    with rows for state "1"; rows for any other observed state are added with
    value 0.0 the first time that state is seen.

    Attributes:
        states: names of the dialogue columns joined into the state key.
        actions: action labels that get a row per state in the Q-table.
        reward: names of the dialogue columns averaged into the reward.
        alpha: learning rate of the update rule.
        gamma: discount applied to the best next-state value.
        q: DataFrame with columns "State", "Action", "Value".
    """

    def __init__(self, states=[], actions=[], reward=[], gamma=0.9, alpha=0.5):
        # Store copies: the list defaults are shared between calls, so keeping
        # the argument objects would alias every default-constructed instance.
        self.states = self._as_list(states)
        self.actions = self._as_list(actions)
        self.reward = self._as_list(reward)
        self.alpha = alpha
        self.gamma = gamma
        self.q = self.initialize(self.states, self.actions)

    @staticmethod
    def _as_list(names):
        """Return `names` as a new list; a bare string counts as one name."""
        if isinstance(names, str):
            return [names]
        return list(names)

    @staticmethod
    def _state_rows(state, actions):
        """Return Q-table rows (value 0.0) pairing `state` with every action."""
        rows = list(itertools.product([state], actions, [0.0]))
        return pd.DataFrame(columns=["State", "Action", "Value"], data=rows)

    def _state_key(self, row):
        """Join the configured state columns of one dialogue row into a key."""
        return "".join(list(row[self.states]))

    def _ensure_state(self, state):
        """Append zero-valued rows for `state` if the table has none yet."""
        if not (self.q.State == state).any():
            self.q = pd.concat(
                [self.q, self._state_rows(state, self.actions)], ignore_index=True
            )

    def initialize(self, states, actions):
        """Build the initial Q-table: one row per action for state "1", all 0.0.

        `states` is accepted but not consulted here. Values start as floats so
        later float updates do not have to upcast an integer column.
        """
        return self._state_rows("1", actions)

    def update_q(self, dialogue, n_iters=1):
        """Run `n_iters` Q-learning sweeps over one dialogue.

        Rows are visited in index order until a row whose state key contains
        the terminal marker "T"; the successor of a row is looked up at index
        label `turn + 1`.

        Args:
            dialogue: DataFrame with an "Action" column plus the configured
                state and reward columns.
            n_iters: number of sweeps over the dialogue.

        Returns:
            A snapshot of the Q-table taken before the first sweep (after
            rows for newly observed states were added), with "Value" renamed
            to "iter0" and one additional column "iter<k>" holding the values
            after sweep k.

        Raises:
            KeyError: if a visited action has no row in the Q-table, or a
                non-terminal row has no following row.
        """
        # Register every state up to the first terminal row before taking the
        # snapshot, so q_mid and self.q share the same rows and index.
        for turn in dialogue.index:
            seen = self._state_key(dialogue.loc[turn])
            if "T" in seen:
                break
            self._ensure_state(seen)

        q_mid = self.q.rename(columns={"Value": "iter0"})
        for i in range(n_iters):
            sweep_col = "iter{0}".format(i + 1)
            q_mid[sweep_col] = q_mid["iter{0}".format(i)].copy()
            for turn in dialogue.index:
                # get values of current state
                current = dialogue.loc[turn]
                action = current["Action"]
                states = self._state_key(current)
                if "T" in states:
                    break
                mask = (self.q.Action == action) & (self.q.State == states)
                matches = self.q.loc[mask, "Value"]
                if matches.empty:
                    raise KeyError(
                        "no Q-table row for state {0!r} and action {1!r}".format(states, action)
                    )
                q = float(matches.iloc[0])
                reward = np.mean(list(current[self.reward]))

                # get values of next state
                next_turn = dialogue.loc[turn + 1]
                next_states = self._state_key(next_turn)
                next_max_q = self.find_max_q(next_states, i)
                if "T" in next_states:
                    # the terminal row's reward replaces the current row's
                    reward = np.mean(list(next_turn[self.reward]))

                # update q based on current reward, current q and next state q
                q = q + self.alpha * (reward + self.gamma * next_max_q - q)
                self.q.loc[mask, "Value"] = q
                q_mid.loc[mask, sweep_col] = q
        return q_mid

    def find_max_q(self, next_states, iteration):
        """Return the largest stored Q-value for `next_states`.

        A state key containing "T" is terminal and is worth 0. A state with no
        rows yet is worth 0.0, the value its rows would be created with.
        `iteration` is not used; it is kept so existing callers keep working.
        """
        if "T" in next_states:
            return 0
        values = list(self.q.loc[self.q.State == next_states, "Value"])
        return max(values) if values else 0.0

In [15]:
# Scratch check of itertools.product over a list of iterables.
# `s` was not assigned in any visible cell; this value reproduces the output
# recorded for this cell.
s = [[0, 1], ["A"], [2, 3]]
list(itertools.product(*s))

[(0, 'A', 2), (0, 'A', 3), (1, 'A', 2), (1, 'A', 3)]