## Train a bot to respond to user questions with answers and respond to user answers with questions with Reinforcement Learning and simulated interaction

In [1]:
import random
import itertools
from pprint import pprint as pp

In [5]:
user_utters = ["Hey", "Hmmm", "Haha", "Yo", "red", "25", "Just chilling", # answers
               "Hi?", "Really?", "Why?", "What's up?", "Are you smart?"] # questions
bot_utters = ["Right", "Yes", "No", "I don't know", "Maybe", "Hi", "I am good", "Never mind", # answers
              "How are you?", "What?", "What's that?", "Why not?", "Excuse me?", "Can you elaborate?"] # questions

In [6]:
# some helper functions
def is_question(utter):
    return utter[-1] == "?"

def get_question(utters):
    answers = [i for i in utters if i[-1] != "?"]
    questions = [i for i in utters if i[-1] == "?"]
    return random.choice(questions)

def get_answer(utters):
    answers = [i for i in utters if i[-1] != "?"]
    questions = [i for i in utters if i[-1] == "?"]
    return random.choice(answers)

def user_respond(bot_utter, mode="logically"):
    if mode=="logically":
        if is_question(bot_utter):
            return get_answer(user_utters)
        else:
            return get_question(user_utters)
    elif mode=="randomly":
        return random.choice(user_utters)

#### Random Sequence (where user acts rationally and bot acts randomly)

In [7]:
for turn in range(10):
    bot_utter = random.choice(bot_utters)
    print("Bot: ", bot_utter)
    print("User: ", user_respond(bot_utter))

Bot:  Can you elaborate?
User:  25
Bot:  I don't know
User:  Why?
Bot:  No
User:  What's up?
Bot:  I am good
User:  What's up?
Bot:  Yes
User:  Really?
Bot:  What's that?
User:  25
Bot:  Hi
User:  What's up?
Bot:  No
User:  Hi?
Bot:  Hi
User:  Why?
Bot:  What's that?
User:  Just chilling


#### Reinforcement learning (where bot learned to pick the right action by maximizing rewards)

In [8]:
# define transition function
def transition(s, a, user_behavior="logically"):
    """
    given a state s and an action a, return reward and a new state s_new:
    s: True or False (is question)
    a: "Q" or "A"
    s_new: True or False
    reward: 0 or 1
    """
    # calculate reward
    if (s and a == "Q"):
        reward = 0
    elif (s and a == "A"):
        reward = 1
    elif (not s and a == "Q"):
        reward = 1
    elif (not s and a == "A"):
        reward = 0
    else:
        raise
    
    # get new state
    if a == "Q":
        bot_utter = get_question(bot_utters)
    elif a == "A":
        bot_utter = get_answer(bot_utters)
    user_utter = user_respond(bot_utter, mode=user_behavior)
    s_new = is_question(user_utter)
        
    return reward, s_new


def find_max(rewards):
    best_actions, max_reward = rewards[0]
    for actions, reward in rewards:
        if reward > max_reward:
            best_actions = actions
    return best_actions


def carry_out_best_policy(user_say, best_actions):
    print("User: ", user_say)
    for action in best_actions:
        if action == "Q":
            bot_say = get_question(bot_utters)
        elif action == "A":
            bot_say = get_answer(bot_utters)
        print("Bot: ", bot_say)
        print("User: ", user_respond(bot_say))

In [10]:
# intialize
def RL(user_say, n_turn = 5, user_behavior="logically"):
    actions = ["Q", "A"]
    rewards = []
    all_action_combinations = itertools.product(["Q", "A"], repeat=n_turn)
    for each_action_sequence in all_action_combinations:
        state = is_question(user_say)
        reward = 0
        for each_action in each_action_sequence:
            r, state = transition(state, each_action, user_behavior=user_behavior)
            reward += r
        rewards.append(("".join(each_action_sequence), reward))
        
    best_policy = find_max(rewards)
    pp("all possible action rewards: ")
    pp(rewards)
    print("best action sequence: ", best_policy)
    carry_out_best_policy(user_say, best_policy)

In [11]:
RL("How are you?", user_behavior="logically")

'all possible action rewards: '
[('QQQQQ', 4),
 ('QQQQA', 3),
 ('QQQAQ', 2),
 ('QQQAA', 3),
 ('QQAQQ', 2),
 ('QQAQA', 1),
 ('QQAAQ', 2),
 ('QQAAA', 3),
 ('QAQQQ', 2),
 ('QAQQA', 1),
 ('QAQAQ', 0),
 ('QAQAA', 1),
 ('QAAQQ', 2),
 ('QAAQA', 1),
 ('QAAAQ', 2),
 ('QAAAA', 3),
 ('AQQQQ', 4),
 ('AQQQA', 3),
 ('AQQAQ', 2),
 ('AQQAA', 3),
 ('AQAQQ', 2),
 ('AQAQA', 1),
 ('AQAAQ', 2),
 ('AQAAA', 3),
 ('AAQQQ', 4),
 ('AAQQA', 3),
 ('AAQAQ', 2),
 ('AAQAA', 3),
 ('AAAQQ', 4),
 ('AAAQA', 3),
 ('AAAAQ', 4),
 ('AAAAA', 5)]
best action sequence:  AAAAA
User:  How are you?
Bot:  Right
User:  Are you smart?
Bot:  Maybe
User:  Hi?
Bot:  Maybe
User:  What's up?
Bot:  Yes
User:  What's up?
Bot:  Hi
User:  Hi?


In [12]:
RL("How are you?", user_behavior="randomly")

'all possible action rewards: '
[('QQQQQ', 3),
 ('QQQQA', 1),
 ('QQQAQ', 2),
 ('QQQAA', 2),
 ('QQAQQ', 2),
 ('QQAQA', 3),
 ('QQAAQ', 2),
 ('QQAAA', 3),
 ('QAQQQ', 1),
 ('QAQQA', 2),
 ('QAQAQ', 2),
 ('QAQAA', 3),
 ('QAAQQ', 2),
 ('QAAQA', 3),
 ('QAAAQ', 2),
 ('QAAAA', 2),
 ('AQQQQ', 4),
 ('AQQQA', 5),
 ('AQQAQ', 5),
 ('AQQAA', 3),
 ('AQAQQ', 2),
 ('AQAQA', 4),
 ('AQAAQ', 2),
 ('AQAAA', 4),
 ('AAQQQ', 2),
 ('AAQQA', 2),
 ('AAQAQ', 5),
 ('AAQAA', 3),
 ('AAAQQ', 4),
 ('AAAQA', 2),
 ('AAAAQ', 4),
 ('AAAAA', 4)]
best action sequence:  AAAAA
User:  How are you?
Bot:  Right
User:  Hi?
Bot:  I don't know
User:  Really?
Bot:  Hi
User:  Are you smart?
Bot:  Never mind
User:  What's up?
Bot:  Hi
User:  Really?
