# Если в DMF использовать вероятности из бустинга

In [137]:
import numpy as np
from copy import copy
import pandas as pd
from tqdm.auto import tqdm

from scipy.integrate import quad
from scipy.stats import genextreme

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_validate

from statsmodels.discrete.discrete_model import MNLogit

## Симуляции

In [122]:
beta_spend = np.array([1, 0.1, 0.0001, 1])
betas = np.array([[0.1, 0.000025, 0.3, 0.01],
                  [0.2, 0.000015, 0.2, 0.015],
                  [3, -0.00002, 0.5, -0.02]])

In [128]:
def generate_sample(n, betas=betas, beta_spend=beta_spend, verbose=False):
    
    ### Регрессоры
    x0 = np.ones(shape=n)
    x1 = np.exp(np.random.normal(loc=10, scale=0.7, size=n))
    x2 = np.random.poisson(lam=3, size=n)
    x2[x2 > 5] = 5
    x3 = np.round(np.random.uniform(low=20, high=100, size=n))
    x4 = np.random.poisson(lam=3, size=n)
    x4[x4 > 5] = 5
    df = pd.DataFrame(zip(x0, x1, x2, x3, x4),
                      columns=['const', 'income', 'health', 'age', 'drive'])
    
    ### Линейные индексы
    eps = genextreme.rvs(c=0, size=(n, 3))
    y_li = df[['const', 'income', 'health', 'age']] @ betas.T
    df[['y_star_Car', 'y_star_Taxi', 'y_star_Public']] = y_li + eps
    df['transport'] = np.argmax(np.array(df[['y_star_Car', 'y_star_Taxi', 'y_star_Public']]), axis=1)

    if verbose:
        print(df.transport.value_counts())
    
    ### Расходы
    rho = np.array([0.64, -0.25, 0.14])
    mevd, __ = quad(lambda x: genextreme.pdf(x, c=0) * x, -100, 100)
    m2evd, __ = quad(lambda x: genextreme.pdf(x, c=0) * x ** 2, -100, 100)
    vevd = m2evd - mevd ** 2
    vevd_rho = np.sum(vevd * rho ** 2)
    adj = np.sqrt(6) / np.pi
    sigma = 4
    
    eps_spend = sigma * adj * (eps - mevd) @ rho + \
                np.random.normal(size=n, loc=0, scale=np.sqrt(sigma ** 2 - (sigma * adj * np.sqrt(vevd_rho) ** 2)))

    spend_li = df[['const', 'age', 'income', 'drive']] @ beta_spend.T
    df['spend'] = spend_li + eps_spend
    
    df.loc[df['transport'] != 0, 'spend'] = np.nan
    
    return df

In [133]:
np.random.seed(999)

df = generate_sample(10000, verbose=True)
df.head()

2    4624
0    2928
1    2448
Name: transport, dtype: int64


Unnamed: 0,const,income,health,age,drive,y_star_Car,y_star_Taxi,y_star_Public,transport,spend
0,1.0,24076.956273,3,56.0,4,1.274725,1.702839,3.739565,2,
1,1.0,58766.286919,5,51.0,1,9.176211,3.430662,3.694641,0,27.537197
2,1.0,27456.837103,0,38.0,4,1.475336,2.004115,2.355695,2,
3,1.0,12077.331985,4,31.0,3,1.995143,3.621725,4.04678,2,
4,1.0,18282.615127,2,88.0,4,0.608748,2.733178,0.197868,1,


In [134]:
X, y = df[['income', 'health', 'age']], df['transport']
all_coefs = []
        
## Обычная линейная регрессия
df_no_nans = df.dropna()
X_spend = df_no_nans[['age', 'income', 'drive']]
y_spend = df_no_nans['spend']
lm = LinearRegression().fit(X_spend, y_spend)
all_coefs.append(['МНК', lm.intercept_] + list(lm.coef_))

In [135]:
## MNLogit statsmodels
lr = MNLogit(y, X).fit()
prob = lr.predict()

## Лямбды
df['lambda1'] = -np.log(prob[:, 0])
df['lambda2'] = prob[:, 1] * np.log(prob[:, 1]) / (1 - prob[:, 1])
df['lambda3'] = prob[:, 2] * np.log(prob[:, 2]) / (1 - prob[:, 2])

df_no_nans = df.dropna()
X_spend = df_no_nans[['age', 'income', 'drive']]
y_spend = df_no_nans['spend']

## Линейная модель
dmf = LinearRegression().fit(df_no_nans[['age', 'income', 'drive', 
                                         'lambda1', 'lambda2', 'lambda3']], y_spend)
all_coefs.append(['Дурбин-МакФадден', dmf.intercept_] + list(dmf.coef_[:3]))

Optimization terminated successfully.
         Current function value: 0.998481
         Iterations 5


In [136]:
all_coefs = []
all_coefs.append(['Истина', 1, 0.1, 0.0001, 1])
all_coefs.append(['МНК', lm.intercept_] + list(lm.coef_))
all_coefs.append(['Дурбин-МакФадден', dmf.intercept_] + list(dmf.coef_[:3]))

pd.DataFrame(all_coefs, columns=['Метод', 'const', 'b1', 'b2', 'b3'])

Unnamed: 0,Метод,const,b1,b2,b3
0,Истина,1.0,0.1,0.0001,1.0
1,МНК,5.231897,0.081672,8.1e-05,1.002064
2,Дурбин-МакФадден,-0.691686,0.070402,0.000108,1.000659
