In [9]:
import numpy as np
import pandas as pd
import scipy.stats as stats

# Local module under test (the archived logit-demand implementation).
import logit_demand_archive as ld
import importlib
# Reload the module so that edits made to its source file are picked up
# without restarting the kernel.
importlib.reload(ld)

<module 'logit_demand' from '/Users/jingyuanwang/GitHub/NU450_HW/coding_tutorial/Logit_demand/logit_demand.py'>

In [None]:
# ------------------------------------------------------------------------
# NOTE
# ------------------------------------------------------------------------
# Purpose: test the function loglikelihood in logit_demand_archive.py.
# Archived because the notation differs from the requirements.
#
# Definition of several variables in this file:
# n: number of consumers
# m: number of products
#    outside option normalized to 0 and not included among the choices
# k: length of beta, number of variables
# ------------------------------------------------------------------------

In [2]:
# Problem dimensions for the simulated sample
n, m, k = (
    100,  # n: number of consumers
    8,    # m: number of choices
    3,    # k: number of variables (income, age, price)
)

In [18]:
# I. initialize sample ---------------------------------------------------
# Target layout (long format, one row per consumer-choice pair):
#     Y: binary vector of choices, n*m-by-1
#     I: vector of consumer index, n*m-by-1
#     X: matrix of consumer choice attributes, n*m-by-k
#     beta: coefficients, k-by-1

# Seed the global NumPy RNG so that Restart & Run All reproduces the same sample.
np.random.seed(0)

# 1. variables: income, age, price
# (1) set up variables
# income: normal(60, 30) in $000, floored at 5 (i.e. a minimum income of $5000)
income = np.random.normal(60, 30, size=n)
income[income <= 5] = 5
# age: uniform draw on [1, 5), scaled by 10 and floored -> integer ages 10..49
age = np.floor(np.random.uniform(low=1, high=5, size=n) * 10).astype(int)
# price of each good: uniform on [0, 100)
price = np.random.uniform(low=0, high=1, size=m) * 100

# (2) stack to n*m-by-k
# Consumer attributes: repeat the n-row frame once per choice, then move the
# consumer ID to the outer index level so it lines up with df_choices below.
df_consumers = pd.DataFrame(np.vstack([income, age]).T, columns=['income', 'age'])
df_consumers = pd.concat([df_consumers] * m,
                         keys=list(range(0, m)),
                         names=['choices', 'consumer ID']).swaplevel(i=0, j=1, axis=0)
# Choice attributes: repeat the m-row price frame once per consumer.
df_choices = pd.DataFrame(price, columns=['price'])
df_choices = pd.concat([df_choices] * n,
                       keys=list(range(0, n)),
                       names=['consumer ID', 'choices'])

# Join on the shared (consumer ID, choices) index and sort the rows.
# `axis` is passed by keyword: pandas >= 2.0 no longer accepts positional
# arguments to sort_index, so `sort_index(0)` raises a TypeError there.
df = pd.merge(df_consumers, df_choices,
              left_index=True, right_index=True).sort_index(axis=0)

# Sanity check: one row per consumer-choice pair, one column per variable.
assert df.shape == (n * m, k)

In [4]:
# 2. Choice, dependent variable
# data generating process: Y = I{X*beta + epsilon > 0}, epsilon ~ normal(0, 30)
# (the threshold used here is 0; the DGP may still need to be re-defined)
beta_true = np.array([[5, -1, -4]]).T
eps = np.random.normal(0, 30, size=len(df))
df['eps'] = eps
# X.dot(beta_true) is n*m-by-1; flatten it so the column is assigned from a
# plain 1-D vector instead of relying on implicit handling of a 2-D array.
df['utility'] = df[['income', 'age', 'price']].values.dot(beta_true).ravel()
df['utility'] = df['utility'] + df['eps']
df['Y'] = (df['utility'] > 0).astype(int)

# 3. index
# Move (consumer ID, choices) out of the index into ordinary columns and name
# the resulting row-number index I.
# Guarded so that re-running this cell does not raise once the index has
# already been flattened by a previous run.
if isinstance(df.index, pd.MultiIndex):
    df = df.reset_index(level=[0, 1])
df.index.name = 'I'

In [5]:
# Preview the first five rows of the long-format sample
df.head(n=5)

Unnamed: 0_level_0,consumer ID,choices,income,age,price,eps,utility,Y
I,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,0,71.471846,29.0,98.92303,63.572338,-3.760551,0
1,0,1,71.471846,29.0,38.611498,-0.092675,173.820562,1
2,0,2,71.471846,29.0,88.593111,9.857482,-16.155734,0
3,0,3,71.471846,29.0,76.727049,11.863026,33.31406,1
4,0,4,71.471846,29.0,72.20278,14.688416,54.236525,1


In [12]:
# II. try functions ---------------------------------------------------
# Recompute the objective by hand so it can be compared with the value
# returned by ld.loglikelihood for the same data and beta.
beta = np.array([[5, -1, -4]]).T

# Replacement value for probabilities that evaluate to exactly 0.
precision = 10**-100
# Standard deviation of the normal error term (the eps draw uses normal(0, 30)).
sigma = 30

# Linear index X*beta as a flat vector, computed once and shared by both
# probabilities (vectorised instead of a Python-level map over rows).
xb = df[['income', 'age', 'price']].values.dot(beta).ravel()

# P(Y = 1) = 1 - Phi(-X*beta; 0, sigma)
likelihood_y_1 = pd.Series(1 - stats.norm.cdf(-xb, loc=0, scale=sigma))
likelihood_y_1[likelihood_y_1 == 0] = min(precision, likelihood_y_1[likelihood_y_1 != 0].min())

# P(Y = 0) = Phi(-X*beta; 0, sigma)
likelihood_y_0 = pd.Series(stats.norm.cdf(-xb, loc=0, scale=sigma))
likelihood_y_0[likelihood_y_0 == 0] = min(precision, likelihood_y_0[likelihood_y_0 != 0].min())

# Why the replacement above: if even one observation has probability exactly 0,
# log(0) = -inf and the resulting values can no longer be compared.

# Combine by position (not by index label), one term per observation.
# Note the leading minus sign: the stored value is the negated sum of the
# per-observation log-likelihood terms.
y = df['Y'].values
loglikelihood = -np.sum(y * np.log(likelihood_y_1.values)
                        + (1 - y) * np.log(likelihood_y_0.values))
loglikelihood

88.68366658455813

In [13]:
# Value from the module under test, for comparison with the hand-computed one.
# Arguments are passed in the order (Y, I, X, beta).
ld.loglikelihood(
    df['Y'],
    df.index,
    df[['income', 'age', 'price']],
    beta,
)

88.68366658455813