<a href="https://colab.research.google.com/github/Lenguist/insight-game-ai/blob/main/simple_sim_united.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Description moved to readme

# q learn

In [1]:
class Episode(object):
  def __init__(self, buyer, seller, verbose=True):
    self.buyer = buyer
    self.seller = seller
    self.verbose = verbose # whether to print info

  def negotiation_round(self):
    if self.verbose:
      print(f"Curr state: {self.seller.state}")

    offer = self.seller.make_offer()
    decision = self.buyer.check_offer(offer)
    new_state = {"last-offer":offer, "offers-made":self.seller.state["offers-made"]+1}
    self.seller.update_table(offer, decision, new_state)
    self.seller.state = new_state

    if self.verbose:
      print(f"Seller made offer of {offer}. The buyer decided to {decision}")
    return decision

  def run_episode(self, episode_num):
    decision = ""
    while decision != "walk away" and decision != "accept offer":
      decision = self.negotiation_round()

    if decision == "accept offer":
      final_offer = self.seller.state["last-offer"]
      profit = final_offer - self.seller.value
      if self.verbose:
        print(f"Deal made at {final_offer}")
        print(f"Buyer's max_price was {self.buyer.maxprice}")

    else:
      if self.verbose:
        print(f"No deal made - no profit.")
        print(f"Buyer's max_price was {self.buyer.maxprice}")
      profit = 0
    # reset seller state
    self.seller.reset_state(episode_num)
    return profit

In [2]:
"Seller classes"
import random
import numpy as np
import math

class Seller(object):
  def __init__(self, value, init_offer):
    self.value = value
    self.init_offer = init_offer
    self.state = {"last-offer": self.init_offer, "offers-made":0} # initial state
  def make_offer(self):
    raise NotImplementedError("Subclasses should implement this method.")
  def reset_state(self):
    self.state = {"last-offer":self.init_offer, "offers-made":0}
  def update_table(self):
    pass
  
class QLearningSeller(Seller):
  def __init__(self, value, init_offer):
    super().__init__(value, init_offer)
    action_space_size = self.init_offer - self.value
    state_space_size = (self.init_offer - self.value) ** 2
    self.q_table = np.zeros((state_space_size, action_space_size))

    # Q-Learning parameters
    self.learning_rate = 0.1
    self.discount_rate = 0.99
    self.exploration_rate = 1
    self.max_exploration_rate = 1
    self.min_exploration_rate = 0.01
    self.exploration_decay_rate = 0.001

  def reset_state(self,episode_num):
    super().reset_state()
    #self.exploration_rate = self.min_exploration_rate + (self.max_exploration_rate - self.min_exploration_rate) * np.exp(-self.exploration_decay_rate*episode_num)
    self.exploration_rate = 0.1                                                                 
    # exploration_rate = self.min_exploration_rate + \
    # (self.max_exploration_rate - self.min_exploration_rate) * np.exp(-self.exploration_decay_rate*self.episode)

  
  def make_offer(self):
    random_number = random.uniform(0, 1)
    if random_number > self.exploration_rate:
      offer = np.argmax(self.q_table[self.get_state_array_number(self.state), :]) + (self.value+1)
    else:
      offer = random.randint(self.value+1,self.state['last-offer']-1)
    return offer
  
  # helper methods
  def get_state_array_number(self, state):
    return state['last-offer']-(self.value+1)+(self.init_offer-self.value)*state['offers-made']

  def from_array_num_get_state(self, array_num):
    offers_made = math.floor(array_num/(self.init_offer-self.value))
    last_offer =  array_num % (self.init_offer-self.value) + (self.value + 1)
    return {"last-offer":last_offer, "offers-made":offers_made}


  def update_table(self, offer, decision, new_state):
    if decision == 'accept offer':
      reward = offer - self.value
    else:
      reward = 0
    
    state_array_number = self.get_state_array_number(self.state)
    new_state_array_number = self.get_state_array_number(new_state)
    self.q_table[state_array_number, offer-(self.value+1)] = self.q_table[state_array_number, offer-(self.value+1)] * (1 - self.learning_rate) + \
                                  self.learning_rate * (reward + self.discount_rate * np.max(self.q_table[new_state_array_number, :]))



In [4]:
# imp_incr constant, max_price random
def basic_simulation(value, range_min, range_max, imp_incr, rounds, seller, verbose=False):
  total_profit = 0
  profit_tracker = []
  round_cnt = 0
  for i in range(rounds):
    maxprice = int(random.uniform(range_min, range_max+1))
    buyer = Buyer(maxprice, 0, imp_incr)
    episode = Episode(buyer, seller, verbose=False)
    profit = episode.run_episode(i)
    total_profit += profit
  average_return = total_profit/rounds

  return average_return

In [6]:
from buyer import Buyer
# params that are needed to initialize seller
value = 10
range_min = 11
range_max = 30
rounds = 500

init_offer=range_max

seller = QLearningSeller(value=value, init_offer=init_offer)
rewards_all_episodes = []
for i in range(10):
    result = basic_simulation(value,
                            range_min,
                            range_max,
                            imp_incr=  0.1,
                            rounds = rounds,
                            seller=seller,
                            verbose=False)
    rewards_all_episodes.append(result)
    print(seller.q_table)
    print(rewards_all_episodes)



[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[6.08]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[6.08, 5.82]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[6.08, 5.82, 5.902]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[6.08, 5.82, 5.902, 5.936]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[6.08, 5.82, 5.902, 5.936, 6.072]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ...

In [7]:
import pandas as pd
import numpy as np
q = pd.DataFrame(seller.q_table)



# Filtering dataframe to get only non-zero rows
non_zero_df = q.loc[(q!=0).any(axis=1)]

print(non_zero_df)


          0         1         2         3         4         5         6   \
19  0.961848  1.742428  2.353745  3.311961  3.789522  4.391881  5.228517   
21  0.271000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
22  0.181000  0.023612  0.000000  0.000000  0.000000  0.000000  0.000000   
23  0.100000  0.377290  0.000000  0.000000  0.000000  0.000000  0.000000   
24  0.468559  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
25  0.181000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
26  0.569533  0.015236  0.000000  0.000000  0.000000  0.000000  0.000000   
27  0.814698  1.967174  1.487167  2.145687  2.203230  2.253767  1.609757   
28  0.271000  0.223612  0.804608  0.000000  0.000000  0.000000  0.000000   
29  0.642989  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
30  0.839921  0.000000  0.616160  0.000000  0.000000  0.000000  0.000000   
31  0.669241  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
32  0.521703