## Train a bot to answer user questions and to respond to user answers with questions, using Reinforcement Learning and simulated interaction

In [14]:
import random
import itertools
from pprint import pprint as pp

In [15]:
# Utterances the simulated user can say. Anything ending in "?" is treated
# as a question by the helpers below; everything else is an answer.
user_utters = [
    # answers
    "Hey",
    "Hmmm",
    "Haha",
    "Yo",
    "red",
    "25",
    "Just chilling",
    # questions
    "Hi?",
    "Really?",
    "Why?",
    "What's up?",
    "Are you smart?",
]

# Utterances the bot can say, split the same way.
bot_utters = [
    # answers
    "Right",
    "Yes",
    "No",
    "I don't know",
    "Maybe",
    "Hi",
    "I am good",
    "Never mind",
    # questions
    "How are you?",
    "What?",
    "What's that?",
    "Why not?",
    "Excuse me?",
    "Can you elaborate?",
]

In [16]:
# some helper functions
def is_question(utter):
    """Return True if the utterance string ends with a question mark.

    utter: str, the utterance to classify.

    An empty string is classified as a non-question (returns False) rather
    than raising IndexError, which indexing the last character would do.
    """
    return utter.endswith("?")

def get_question(utters):
    """Return a randomly chosen question from `utters`.

    utters: iterable of str; an utterance is a question when it ends in "?".

    Empty strings are tolerated and skipped, since they are not questions.
    Raises IndexError (from random.choice) when `utters` has no question.
    """
    # Only the questions are needed here; the answers are not collected.
    questions = [utter for utter in utters if utter.endswith("?")]
    return random.choice(questions)

def get_answer(utters):
    """Return a randomly chosen answer (non-question) from `utters`.

    utters: iterable of str; an utterance is an answer when it does not end
    in "?".

    Empty strings are tolerated and count as answers, since they do not end
    in "?". Raises IndexError (from random.choice) when `utters` has no answer.
    """
    # Only the answers are needed here; the questions are not collected.
    answers = [utter for utter in utters if not utter.endswith("?")]
    return random.choice(answers)

def user_respond(bot_utter, mode="logically"):
    """Simulate the user's reply to a bot utterance.

    bot_utter: str, what the bot just said.
    mode: "logically" -> answer a bot question, ask a question otherwise;
          "randomly"  -> pick any user utterance uniformly at random.

    Returns one string drawn from `user_utters`.
    Raises ValueError for an unrecognised mode. Without this check the
    function would fall through and return None.
    """
    if mode == "logically":
        if is_question(bot_utter):
            return get_answer(user_utters)
        return get_question(user_utters)
    if mode == "randomly":
        return random.choice(user_utters)
    raise ValueError(
        "unknown mode {!r}; expected 'logically' or 'randomly'".format(mode)
    )

#### Random Sequence (where user acts rationally and bot acts randomly)

In [17]:
# Baseline: the bot picks any utterance at random, while the simulated user
# replies in user_respond's default ("logically") mode.
for turn in range(10):
    bot_utter = random.choice(bot_utters)
    user_reply = user_respond(bot_utter)
    print("Bot: ", bot_utter)
    print("User: ", user_reply)

Bot:  Yes
User:  Really?
Bot:  Maybe
User:  Why?
Bot:  What?
User:  Hey
Bot:  Can you elaborate?
User:  red
Bot:  Right
User:  Are you smart?
Bot:  How are you?
User:  25
Bot:  I am good
User:  Are you smart?
Bot:  What's that?
User:  Just chilling
Bot:  What?
User:  Hmmm
Bot:  What's that?
User:  Hey


#### Reinforcement learning (where the bot learns to pick the right action by maximizing rewards)

In [35]:
# define transition function
def transition(s, a, user_behavior="logically"):
    """
    given a state s and an action a, return reward and a new state s_new:
    s: True or False (is the user's last utterance a question)
    a: "Q" (bot asks a question) or "A" (bot gives an answer)
    user_behavior: mode forwarded to user_respond for the simulated reply
    s_new: True or False (is the simulated user reply a question)
    reward: 0 or 1

    Raises ValueError when a is neither "Q" nor "A".
    """
    # Validate up front so a bad action fails with a clear message instead of
    # a bare `raise` with no active exception.
    if a not in ("Q", "A"):
        raise ValueError("unknown action {!r}; expected 'Q' or 'A'".format(a))

    # calculate reward: 1 when the bot answers a question or asks a question
    # after a non-question, 0 otherwise
    reward = 1 if (a == "A") == bool(s) else 0

    # get new state: the bot speaks, the simulated user replies
    if a == "Q":
        bot_utter = get_question(bot_utters)
    else:
        bot_utter = get_answer(bot_utters)
    user_utter = user_respond(bot_utter, mode=user_behavior)
    s_new = is_question(user_utter)

    return reward, s_new


def carry_out_best_policy(user_say, best_actions):
    """Print a simulated dialogue that follows the given action sequence.

    user_say: str, the user's opening utterance (printed first).
    best_actions: iterable of "Q"/"A" actions; for each one the bot says a
        random question or answer and the simulated user replies.

    The user reply always uses user_respond's default mode.
    Raises ValueError on an action other than "Q" or "A"; turns before the
    bad action have already been printed by then.
    """
    print("User: ", user_say)
    for action in best_actions:
        if action == "Q":
            bot_say = get_question(bot_utters)
        elif action == "A":
            bot_say = get_answer(bot_utters)
        else:
            # A bare `raise` here has no active exception to re-raise, so
            # fail with an explicit, descriptive error instead.
            raise ValueError(
                "unknown action {!r}; expected 'Q' or 'A'".format(action)
            )
        print("Bot: ", bot_say)
        print("User: ", user_respond(bot_say))

In [38]:
# reinforcement learning function
def RL(user_say, n_turn = 5, user_behavior="logically"):
    """Score every possible bot action sequence and replay the best one.

    user_say: str, the user's opening utterance; sets the initial state.
    n_turn: number of bot turns, so 2 ** n_turn sequences are enumerated.
    user_behavior: mode forwarded to transition for the simulated user
        while scoring.

    Each sequence is scored from a single simulated rollout (the sum of the
    per-step rewards). All scores are printed, then the best sequence, then
    a dialogue following it. Ties go to the sequence enumerated first.
    Returns None.
    """
    actions = ["Q", "A"]
    rewards = []
    all_action_combinations = itertools.product(actions, repeat=n_turn)
    for each_action_sequence in all_action_combinations:
        state = is_question(user_say)
        reward = 0
        for each_action in each_action_sequence:
            r, state = transition(state, each_action, user_behavior=user_behavior)
            reward += r
        rewards.append(("".join(each_action_sequence), reward))

    print("all possible action rewards: ")
    pp(rewards)
    # max() returns the first maximal item, which matches taking element 0 of
    # a stable descending sort, without sorting the whole list.
    best_policy = max(rewards, key=lambda x: x[1])[0]
    print("best action sequence: ", best_policy)
    carry_out_best_policy(user_say, best_policy)

In [39]:
# The opening user utterance is a question; the simulated user replies logically.
RL("How are you?", user_behavior="logically")

all possible action rewards: 
[('QQQQQ', 4),
 ('QQQQA', 3),
 ('QQQAQ', 2),
 ('QQQAA', 3),
 ('QQAQQ', 2),
 ('QQAQA', 1),
 ('QQAAQ', 2),
 ('QQAAA', 3),
 ('QAQQQ', 2),
 ('QAQQA', 1),
 ('QAQAQ', 0),
 ('QAQAA', 1),
 ('QAAQQ', 2),
 ('QAAQA', 1),
 ('QAAAQ', 2),
 ('QAAAA', 3),
 ('AQQQQ', 4),
 ('AQQQA', 3),
 ('AQQAQ', 2),
 ('AQQAA', 3),
 ('AQAQQ', 2),
 ('AQAQA', 1),
 ('AQAAQ', 2),
 ('AQAAA', 3),
 ('AAQQQ', 4),
 ('AAQQA', 3),
 ('AAQAQ', 2),
 ('AAQAA', 3),
 ('AAAQQ', 4),
 ('AAAQA', 3),
 ('AAAAQ', 4),
 ('AAAAA', 5)]
best action sequence:  AAAAA
User:  How are you?
Bot:  Yes
User:  Really?
Bot:  I am good
User:  Hi?
Bot:  Yes
User:  Hi?
Bot:  I am good
User:  Hi?
Bot:  Never mind
User:  Really?


In [40]:
# Same opening question, but sequences are scored against a randomly replying user.
# Note: the dialogue printed afterwards by carry_out_best_policy uses
# user_respond's default mode, not user_behavior.
RL("How are you?", user_behavior="randomly")

all possible action rewards: 
[('QQQQQ', 2),
 ('QQQQA', 0),
 ('QQQAQ', 1),
 ('QQQAA', 1),
 ('QQAQQ', 4),
 ('QQAQA', 3),
 ('QQAAQ', 2),
 ('QQAAA', 2),
 ('QAQQQ', 3),
 ('QAQQA', 2),
 ('QAQAQ', 3),
 ('QAQAA', 0),
 ('QAAQQ', 3),
 ('QAAQA', 3),
 ('QAAAQ', 1),
 ('QAAAA', 2),
 ('AQQQQ', 3),
 ('AQQQA', 3),
 ('AQQAQ', 3),
 ('AQQAA', 3),
 ('AQAQQ', 2),
 ('AQAQA', 5),
 ('AQAAQ', 3),
 ('AQAAA', 4),
 ('AAQQQ', 4),
 ('AAQQA', 5),
 ('AAQAQ', 3),
 ('AAQAA', 3),
 ('AAAQQ', 3),
 ('AAAQA', 2),
 ('AAAAQ', 4),
 ('AAAAA', 3)]
best action sequence:  AQAQA
User:  How are you?
Bot:  I don't know
User:  Are you smart?
Bot:  What?
User:  25
Bot:  Maybe
User:  Really?
Bot:  Excuse me?
User:  Hey
Bot:  I am good
User:  Really?


In [43]:
# The opening user utterance is not a question; the simulated user replies logically.
RL("Not a question", user_behavior="logically")

all possible action rewards: 
[('QQQQQ', 5),
 ('QQQQA', 4),
 ('QQQAQ', 3),
 ('QQQAA', 4),
 ('QQAQQ', 3),
 ('QQAQA', 2),
 ('QQAAQ', 3),
 ('QQAAA', 4),
 ('QAQQQ', 3),
 ('QAQQA', 2),
 ('QAQAQ', 1),
 ('QAQAA', 2),
 ('QAAQQ', 3),
 ('QAAQA', 2),
 ('QAAAQ', 3),
 ('QAAAA', 4),
 ('AQQQQ', 3),
 ('AQQQA', 2),
 ('AQQAQ', 1),
 ('AQQAA', 2),
 ('AQAQQ', 1),
 ('AQAQA', 0),
 ('AQAAQ', 1),
 ('AQAAA', 2),
 ('AAQQQ', 3),
 ('AAQQA', 2),
 ('AAQAQ', 1),
 ('AAQAA', 2),
 ('AAAQQ', 3),
 ('AAAQA', 2),
 ('AAAAQ', 3),
 ('AAAAA', 4)]
best action sequence:  QQQQQ
User:  Not a question
Bot:  How are you?
User:  Hey
Bot:  Why not?
User:  25
Bot:  Can you elaborate?
User:  Hmmm
Bot:  What's that?
User:  Yo
Bot:  Can you elaborate?
User:  Hmmm


In [44]:
# Non-question opening, with sequences scored against a randomly replying user.
# Note: the dialogue printed afterwards by carry_out_best_policy uses
# user_respond's default mode, not user_behavior.
RL("Not a question", user_behavior="randomly")

all possible action rewards: 
[('QQQQQ', 2),
 ('QQQQA', 4),
 ('QQQAQ', 2),
 ('QQQAA', 3),
 ('QQAQQ', 3),
 ('QQAQA', 3),
 ('QQAAQ', 3),
 ('QQAAA', 1),
 ('QAQQQ', 2),
 ('QAQQA', 2),
 ('QAQAQ', 2),
 ('QAQAA', 2),
 ('QAAQQ', 5),
 ('QAAQA', 3),
 ('QAAAQ', 3),
 ('QAAAA', 4),
 ('AQQQQ', 2),
 ('AQQQA', 2),
 ('AQQAQ', 4),
 ('AQQAA', 1),
 ('AQAQQ', 2),
 ('AQAQA', 3),
 ('AQAAQ', 1),
 ('AQAAA', 2),
 ('AAQQQ', 2),
 ('AAQQA', 2),
 ('AAQAQ', 2),
 ('AAQAA', 4),
 ('AAAQQ', 1),
 ('AAAQA', 3),
 ('AAAAQ', 2),
 ('AAAAA', 2)]
best action sequence:  QAAQQ
User:  Not a question
Bot:  Can you elaborate?
User:  Hmmm
Bot:  I am good
User:  Hi?
Bot:  Maybe
User:  Hi?
Bot:  Why not?
User:  Hmmm
Bot:  Excuse me?
User:  25
