In [43]:
import numpy as np
from itertools import product
from scipy.optimize import fsolve


class betrand_model(object):

    def __init__(self, **kwargs):
        """Initialize game with default values"""
        # Default properties
        self.numPlayers = 2
        self.alpha = kwargs.get('alpha', 0.15)
        self.beta = kwargs.get('beta', 4e-6)
        self.delta = kwargs.get('delta', 0.95)
        
        self.c = kwargs.get('c', 1)
        self.a = kwargs.get('a', 2)
        self.a0 = kwargs.get('a0', 0)
        self.mu = kwargs.get('mu', 0.25)
        self.numiActions = kwargs.get('numiActions', 15)
        self.numActions = self.numiActions ** self.numPlayers
        
        self.true_cost_index = kwargs.get('true_cost_index', 1)


        # Derived properties
        self.indexActions = self.init_indexActions()
        self.sdim, self.s0 = self.init_state()
        self.costs = np.array([i/10 for i in range(10,30)])
        self.p_minmax = self.compute_p_competitive_monopoly()
        
        self.init_actions()
        self.trueCost = self.cost[self.true_cost_index]
        
        #self.Profits = self.init_Profits()
        #self.Q = self.init_Q()
        #self.cActions = self.init_cActions()

        
    def demand(self, p):
        e = np.exp((self.a - p) / self.mu)
        d = e / (np.sum(e) + np.exp(self.a0 / self.mu))
        return d

    
    def foc(self, p,c):
        d = self.demand(p)
        zero = 1 - (p - c) * (1 - d) / self.mu
        return np.squeeze(zero)

    
    def foc_monopoly(self, p, c):
        d = self.demand(p)
        d1 = np.flip(d)
        p1 = np.flip(p)
        zero = 1 - (p - c) * (1 - d) / self.mu + (p1 - c) * d1 / self.mu
        return np.squeeze(zero)

    
    def compute_p_competitive_monopoly(self):
        p0 = np.ones((1, self.numPlayers)) * 3 * self.c
        p_competitive = np.zeros((40,2))
        p_monopoly = np.zeros((40,2))
        for c in range(len(self.costs)):
            p_competitive[c] = fsolve(self.foc, p0,self.costs[c])
            p_monopoly[c] = fsolve(self.foc_monopoly, p0,self.costs[c])
        print(p_competitive[:,0],p_monopoly[:,0])
        return p_competitive, p_monopoly

    
    def init_actions(self):
        a = np.linspace(min(self.p_minmax[0]), max(self.p_minmax[1]), self.numiActions - 2)
        delta = a[1] - a[0]
        A = np.linspace(min(a) - delta, max(a) + delta, self.numiActions)
        self.player_actions = A
        self.player_cost = self.player_actions - self.mu*(1+1/(self.numPlayers-1 + \
                                      np.exp((self.a0-self.a+self.player_actions)/self.mu)))
        self.buyer_action_list = np.array(list(product(A, repeat=2)))
        self.buyer_cost_list = np.array(list(product(self.player_cost, repeat=2)))
        self.seller_action_list = np.array(list(product(A, repeat=2)))
        self.seller_cost_list = np.array(list(product(self.player_cost, repeat=2)))
    
    
    def init_indexActions(self):
        
        indexActions = []
        for i in range(self.numActions):
            
            indexActions.append(self.convertNumberBase(i, self.numiActions, self.numPlayers))
            
        indexActions = np.array(indexActions)
        
        return indexActions

    
    def init_state(self):
        sdim = (self.numiActions, self.numiActions)
        s0 = np.zeros(len(sdim)).astype(int)
        return sdim, s0
    
    
    def init_cActions(self):
        """Initialize cActions (used for q-learning)"""
        
        x = np.arange(self.numPlayers - 1, -1, -1)
        cActions = self.numiActions ** x
        
        return cActions

    
    def compute_profits(self, p):
        d = self.demand(p)
        pi = (p - self.c) * d
        return pi

    
    def init_Profits(self):
        Profits = np.zeros((self.numActions, self.numPlayers))
        for s in range(len(self.action_list)):
            
            Profits[s] = self.compute_profits(self.action_list[s])
        return Profits

    
    def init_Q(self):
        Q = np.zeros((self.numActions, self.numiActions, self.numPlayers))
        for iReport in range(self.numiActions):
            
            den = np.count_nonzero(self.indexActions == iReport,axis = 0,keepdims = True)*(1-self.delta)
            
            Q[:, iReport,:] = np.ma.array(self.Profits,
                                mask=(self.indexActions != iReport)).sum(axis = 0) / den
        return Q
    
    
    @staticmethod
    def convertNumberBase(n, b, l):
        '''
        Converts an integer n from base 10 to base b,
        generating a vector of integers of length l
        '''
        tmp = n
        ans = np.zeros(l)
        for i in range(1, l+1):
            ans[l-i] = int(tmp % b)
            tmp = np.floor(tmp/b)
        return ans



In [44]:
b_model = betrand_model()
b_model.index

[1.47292666 1.56293349 1.65047116 1.73556467 1.81848501 1.89973547
 1.97999787 2.06005743 2.14072027 2.22273    2.30668766 2.392985
 2.48176868 2.57295101 2.66626661 2.76135395 2.85783263 2.95535684
 3.05364088 3.15246379 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.        ] [1.92498092 1.95312211 1.98369561 2.0170454  2.05356663 2.09370563
 2.13795284 2.18682337 2.24081973 2.30037274 2.36576388 2.43704519
 2.51398599 2.59607646 2.682597   2.77273005 2.86567266 2.96071606
 3.05728253 3.15492782 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.        ]


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [99]:
import numpy as np

# from SRModel import SRModel

    
class betrand_Qlearning(object):
    '''
        Qlearning based on SR model
    '''
    def __init__(self, **kwargs):

        self.delta = kwargs.get('delta', 0.95)
        
        self.reward = kwargs.get('reward',50)
        self.penalty = kwargs.get('penalty',-50)

        self.convergedtime = kwargs.get('convergedtime',500)
        self.numSessions = kwargs.get('numSessions',1)
        self.maxIters = kwargs.get('maxIters',100000)
        self.true_value_index = kwargs.get('true_value',1)

        self.game = betrand_model(delta = self.delta,true_cost = self.true_cost_index,\
                                   reward = self.reward,penalty = self.penalty)
        self.alpha = kwargs.get('alpha', 0.25) * np.ones(self.game.numPlayers)
        self.beta = kwargs.get('beta', 0.0005) * np.ones(self.game.numPlayers)
        self.initial_state = kwargs.get('initial_state',10)

        
        
        self.delta = kwargs.get('delta',0.95)
        
               
    def computePPrime(self, strategyPrime, iters):
        
        pPrime = np.zeros(self.game.numPlayers)
        #  Greedy with probability 1-epsilon, with exponentially decreasing epsilon
        
        for iPlayer in range(self.game.numPlayers):
            
            if np.random.uniform(0,1) < np.exp(-self.beta[iPlayer]*iters):

                pPrime[iPlayer] = np.floor(self.game.numiActions*np.random.uniform(0,1))
            else:
                pPrime[iPlayer] = strategyPrime[iPlayer]
                
        return pPrime


    def q_learning(self):

        # Initializing various quantities

        indexStrategies = np.zeros((self.game.numPlayers,self.numSessions))

        indexConverge = np.zeros((self.game.numPlayers,self.numSessions))
        
        self.profit_list = np.full((self.numSessions,self.maxIters,self.game.numPlayers),np.nan)
        self.penalty_list = np.full((self.numSessions,self.maxIters,self.game.numPlayers),np.nan)

        # Loop over numSessions
        for iSession in range(self.numSessions):
            
            
            # Learning Phase
            
            # Initialization
            self.Q = self.game.Q.copy()
            self.strategyPrime = np.zeros(self.game.numPlayers)
            maxVal = np.zeros(self.game.numPlayers)
            
            strategyPrime = np.floor(np.random.uniform(0,1,(2))*self.game.numiActions)   

            state = int(np.sum(self.game.cActions*strategyPrime)) 

            
            # Loop
            itersInStrategy = 0
            convergedSession = -1
            
            strategyFix = np.zeros(self.game.numPlayers)
            
            for iters in range(self.maxIters):
                
                # Iterations counter
                
                for iPlayer in range(self.game.numPlayers):
                    
                    temp_q = self.Q[state,:,iPlayer]
                    #print(iters,temp_q)
                    maxVal[iPlayer] = np.max(temp_q)
                    #print(iters,maxVal)

                    strategyPrimeList = np.where(temp_q == maxVal[iPlayer])[0]
                    #print(iters,strategyPrimeList)
                    
                    u = np.random.uniform(0, 1)
                    strategyPrime[iPlayer] = strategyPrimeList[int(len(strategyPrimeList)*u)]
     
                state1 = int(np.sum(self.game.cActions*strategyPrime))     
                pPrime = self.computePPrime(strategyPrime, iters)
                state2 = int(np.sum(self.game.cActions*pPrime)) 

                
                maxVal = np.max(self.Q[state1,:,:],axis = 0)
                
                for iPlayer in range(self.game.numPlayers):
                    
                    # Q matrices and strategies update
                    oldq = self.Q[state, int(pPrime[iPlayer]), iPlayer]
                    newq = oldq + self.alpha[iPlayer] * (self.game.Profits[state2, iPlayer] + \
                                                self.delta * maxVal[iPlayer] - oldq)
                    #print('pppppp',state1)
                    self.Q[state, int(pPrime[iPlayer]), iPlayer] = newq
                    
                state = state1
                #print(strategyPrime,pPrime)
                # Assessing convergence
                if np.array_equiv(strategyPrime, pPrime):
                    itersInStrategy = itersInStrategy + 1
                else:
                    itersInStrategy = 1

                # Check for convergence in strategy
                if convergedSession == -1:
                    # Maximum number of iterations exceeded
                    if iters >= self.maxIters - 1:
                        convergedSession = 0


                    if itersInStrategy == self.convergedtime:
                        convergedSession = 1

                    strategyFix = strategyPrime.copy()               #testing

                # Check for loop exit criteria
                if convergedSession != -1:
                    break
                # if no converge yet, update and iterate
    
            indexConverge[:,iSession] = (convergedSession, iters)
            indexStrategies[:,iSession] = strategyFix
            self.indexConverge = indexConverge
            self.indexStrategies = indexStrategies
            #print(iSession,convergedSession)

        return indexConverge, indexStrategies



    

In [101]:
K = betrand_Qlearning(numSessions = 1,delta = 0.95,alpha = 0.25,beta = 0.0001,
                 true_cost = 1,penalty = 0,reward = 0,
                 convergedtime = 500,maxIters = 1000000)
R = K.q_learning()



print(R)
#print(K.game.buyerActions[int(K.indexStrategies[0,:])],K.game.buyerActions[int(K.indexStrategies[1,:])])

(array([[1.0000e+00],
       [5.5623e+04]]), array([[6.],
       [4.]]))


In [28]:
b_model.cActions

array([1.43525547, 1.47292666, 1.51059785, 1.54826904, 1.58594022,
       1.62361141, 1.6612826 , 1.69895379, 1.73662498, 1.77429617,
       1.81196735, 1.84963854, 1.88730973, 1.92498092, 1.96265211])

In [29]:
np.array(list(product(b_model.cActions, repeat=2)))

array([[1.43525547, 1.43525547],
       [1.43525547, 1.47292666],
       [1.43525547, 1.51059785],
       [1.43525547, 1.54826904],
       [1.43525547, 1.58594022],
       [1.43525547, 1.62361141],
       [1.43525547, 1.6612826 ],
       [1.43525547, 1.69895379],
       [1.43525547, 1.73662498],
       [1.43525547, 1.77429617],
       [1.43525547, 1.81196735],
       [1.43525547, 1.84963854],
       [1.43525547, 1.88730973],
       [1.43525547, 1.92498092],
       [1.43525547, 1.96265211],
       [1.47292666, 1.43525547],
       [1.47292666, 1.47292666],
       [1.47292666, 1.51059785],
       [1.47292666, 1.54826904],
       [1.47292666, 1.58594022],
       [1.47292666, 1.62361141],
       [1.47292666, 1.6612826 ],
       [1.47292666, 1.69895379],
       [1.47292666, 1.73662498],
       [1.47292666, 1.77429617],
       [1.47292666, 1.81196735],
       [1.47292666, 1.84963854],
       [1.47292666, 1.88730973],
       [1.47292666, 1.92498092],
       [1.47292666, 1.96265211],
       [1.

In [55]:
int(np.sum(K.game.cActions*[1,2])) 

1146

In [57]:
K.game.cActions*[1,2]

array([[1.43525547, 2.87051094],
       [1.43525547, 2.94585332],
       [1.43525547, 3.0211957 ],
       [1.43525547, 3.09653807],
       [1.43525547, 3.17188045],
       [1.43525547, 3.24722283],
       [1.43525547, 3.3225652 ],
       [1.43525547, 3.39790758],
       [1.43525547, 3.47324996],
       [1.43525547, 3.54859233],
       [1.43525547, 3.62393471],
       [1.43525547, 3.69927709],
       [1.43525547, 3.77461946],
       [1.43525547, 3.84996184],
       [1.43525547, 3.92530421],
       [1.47292666, 2.87051094],
       [1.47292666, 2.94585332],
       [1.47292666, 3.0211957 ],
       [1.47292666, 3.09653807],
       [1.47292666, 3.17188045],
       [1.47292666, 3.24722283],
       [1.47292666, 3.3225652 ],
       [1.47292666, 3.39790758],
       [1.47292666, 3.47324996],
       [1.47292666, 3.54859233],
       [1.47292666, 3.62393471],
       [1.47292666, 3.69927709],
       [1.47292666, 3.77461946],
       [1.47292666, 3.84996184],
       [1.47292666, 3.92530421],
       [1.

In [83]:
        for s in self.action_list:
            print(game.compute_profits(p))

array([[0.00226462, 0.00226462],
       [0.00226462, 0.00211642],
       [0.00226462, 0.00196538],
       [0.00226462, 0.00181518],
       [0.00226462, 0.00166854],
       [0.00226462, 0.00152741],
       [0.00226462, 0.00139312],
       [0.00226462, 0.00126651],
       [0.00226462, 0.00114806],
       [0.00226462, 0.00103797],
       [0.00226462, 0.00093621],
       [0.00226462, 0.00084261],
       [0.00226462, 0.00075688],
       [0.00226462, 0.00067865],
       [0.00226462, 0.00060749],
       [0.00211642, 0.00226462],
       [0.00211642, 0.00211642],
       [0.00211642, 0.00196538],
       [0.00211642, 0.00181518],
       [0.00211642, 0.00166854],
       [0.00211642, 0.00152741],
       [0.00211642, 0.00139312],
       [0.00211642, 0.00126651],
       [0.00211642, 0.00114806],
       [0.00211642, 0.00103797],
       [0.00211642, 0.00093621],
       [0.00211642, 0.00084261],
       [0.00211642, 0.00075688],
       [0.00211642, 0.00067865],
       [0.00211642, 0.00060749],
       [0.

In [89]:
        for s in b_model.action_list:
            print(b_model.compute_profits(s))

[0.20682553 0.20682553]
[0.22155186 0.2070539 ]
[0.23600529 0.20482047]
[0.2500352  0.20041303]
[0.26350889 0.19415033]
[0.27631598 0.18636658]
[0.28837091 0.17739646]
[0.29961382 0.16756186]
[0.31000971 0.15716109]
[0.31954627 0.14646121]
[0.32823096 0.13569327]
[0.3360875  0.12505043]
[0.34315225 0.11468832]
[0.34947076 0.10472711]
[0.35509455 0.0952549 ]
[0.2070539  0.22155186]
[0.22292666 0.22292666]
[0.23866335 0.22163041]
[0.25409097 0.21792449]
[0.26905004 0.2121133 ]
[0.2834008 0.2045291]
[0.29702771 0.19551608]
[0.30984202 0.18541533]
[0.32178241 0.17455169]
[0.33281401 0.16322342]
[0.34292597 0.15169495]
[0.35212819 0.14019278]
[0.36044761 0.12890402]
[0.36792429 0.11797738]
[0.37460776 0.10752566]
[0.20482047 0.23600529]
[0.22163041 0.23866335]
[0.23846389 0.23846389]
[0.25513124 0.23563339]
[0.27145021 0.23045249]
[0.2872537  0.22324207]
[0.3023962 0.2143474]
[0.31675832 0.20412196]
[0.33024927 0.19291233]
[0.34280734 0.18104534]
[0.35439862 0.16881816]
[0.36501435 0.156491

In [90]:
b_model.compute_profits(b_model.action_list)

array([[0.00226462, 0.00226462],
       [0.00226462, 0.00211642],
       [0.00226462, 0.00196538],
       [0.00226462, 0.00181518],
       [0.00226462, 0.00166854],
       [0.00226462, 0.00152741],
       [0.00226462, 0.00139312],
       [0.00226462, 0.00126651],
       [0.00226462, 0.00114806],
       [0.00226462, 0.00103797],
       [0.00226462, 0.00093621],
       [0.00226462, 0.00084261],
       [0.00226462, 0.00075688],
       [0.00226462, 0.00067865],
       [0.00226462, 0.00060749],
       [0.00211642, 0.00226462],
       [0.00211642, 0.00211642],
       [0.00211642, 0.00196538],
       [0.00211642, 0.00181518],
       [0.00211642, 0.00166854],
       [0.00211642, 0.00152741],
       [0.00211642, 0.00139312],
       [0.00211642, 0.00126651],
       [0.00211642, 0.00114806],
       [0.00211642, 0.00103797],
       [0.00211642, 0.00093621],
       [0.00211642, 0.00084261],
       [0.00211642, 0.00075688],
       [0.00211642, 0.00067865],
       [0.00211642, 0.00060749],
       [0.