# Если в DMF использовать вероятности из бустинга

In [80]:
import numpy as np
from copy import copy
import pandas as pd
from tqdm.auto import tqdm

from scipy.integrate import quad
from scipy.stats import genextreme

from statsmodels.discrete.discrete_model import MNLogit
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor

from catboost import CatBoostClassifier

from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score, accuracy_score


## Подготовка

In [2]:
beta_spend = np.array([1, 0.1, 0.0001, 1])
betas = np.array([[0.1, 0.000025, 0.3, 0.01],
                  [0.2, 0.000015, 0.2, 0.015],
                  [3, -0.00002, 0.5, -0.02]])

In [3]:
def generate_sample(n, betas=betas, beta_spend=beta_spend, verbose=False):
    
    ### Регрессоры
    x0 = np.ones(shape=n)
    x1 = np.exp(np.random.normal(loc=10, scale=0.7, size=n))
    x2 = np.random.poisson(lam=3, size=n)
    x2[x2 > 5] = 5
    x3 = np.round(np.random.uniform(low=20, high=100, size=n))
    x4 = np.random.poisson(lam=3, size=n)
    x4[x4 > 5] = 5
    df = pd.DataFrame(zip(x0, x1, x2, x3, x4),
                      columns=['const', 'income', 'health', 'age', 'drive'])
    
    ### Линейные индексы
    eps = genextreme.rvs(c=0, size=(n, 3))
    y_li = df[['const', 'income', 'health', 'age']] @ betas.T
    df[['y_star_Car', 'y_star_Taxi', 'y_star_Public']] = y_li + eps
    df['transport'] = np.argmax(np.array(df[['y_star_Car', 'y_star_Taxi', 'y_star_Public']]), axis=1)

    if verbose:
        print(df.transport.value_counts())
    
    ### Расходы
    rho = np.array([0.64, -0.25, 0.14])
    mevd, __ = quad(lambda x: genextreme.pdf(x, c=0) * x, -100, 100)
    m2evd, __ = quad(lambda x: genextreme.pdf(x, c=0) * x ** 2, -100, 100)
    vevd = m2evd - mevd ** 2
    vevd_rho = np.sum(vevd * rho ** 2)
    adj = np.sqrt(6) / np.pi
    sigma = 4
    
    eps_spend = sigma * adj * (eps - mevd) @ rho + \
                np.random.normal(size=n, loc=0, scale=np.sqrt(sigma ** 2 - (sigma * adj * np.sqrt(vevd_rho) ** 2)))

    spend_li = df[['const', 'age', 'income', 'drive']] @ beta_spend.T
    df['spend'] = spend_li + eps_spend
    
    df.loc[df['transport'] != 0, 'spend'] = np.nan
    
    return df

In [19]:
np.random.seed(4)

df = generate_sample(1000, verbose=True)
df.head()

2    481
0    289
1    230
Name: transport, dtype: int64


Unnamed: 0,const,income,health,age,drive,y_star_Car,y_star_Taxi,y_star_Public,transport,spend
0,1.0,22820.013044,3,32.0,4,4.662237,2.942691,6.678108,2,
1,1.0,31255.978012,3,45.0,0,2.116802,5.361839,2.351739,1,
2,1.0,10969.387836,5,68.0,3,2.718885,2.887521,3.999501,2,
3,1.0,35793.405654,2,21.0,5,2.158243,2.879407,4.12244,2,
4,1.0,16435.336097,3,51.0,3,2.464228,1.464279,3.759046,2,


In [20]:
X, y = df[['income', 'health', 'age']], df['transport']
all_coefs = []
        
## Обычная линейная регрессия
df_no_nans = df.dropna()
X_spend = df_no_nans[['age', 'income', 'drive']]
y_spend = df_no_nans['spend']
lm = LinearRegression().fit(X_spend, y_spend)
all_coefs.append(['МНК', lm.intercept_] + list(lm.coef_))

In [21]:
# # LogReg sklearn
# lr = LogisticRegression().fit(X, y)
# prob = lr.predict_proba(X)

# MNLogit statsmodels
lr = MNLogit(y, X).fit()
prob = lr.predict()

## Лямбды
df['lambda1'] = -np.log(prob[:, 0])
df['lambda2'] = prob[:, 1] * np.log(prob[:, 1]) / (1 - prob[:, 1])
df['lambda3'] = prob[:, 2] * np.log(prob[:, 2]) / (1 - prob[:, 2])

df_no_nans = df.dropna()
X_spend = df_no_nans[['age', 'income', 'drive']]
y_spend = df_no_nans['spend']

## Линейная модель
dmf = LinearRegression().fit(df_no_nans[['age', 'income', 'drive', 
                                         'lambda1', 'lambda2', 'lambda3']], y_spend)
all_coefs.append(['Дурбин-МакФадден', dmf.intercept_] + list(dmf.coef_[:3]))

In [22]:
all_coefs = []
all_coefs.append(['Истина', 1, 0.1, 0.0001, 1])
all_coefs.append(['МНК', lm.intercept_] + list(lm.coef_))
all_coefs.append(['Дурбин-МакФадден', dmf.intercept_] + list(dmf.coef_[:3]))

pd.DataFrame(all_coefs, columns=['method', 'const', 'b1', 'b2', 'b3'])

Unnamed: 0,method,const,b1,b2,b3
0,Истина,1.0,0.1,0.0001,1.0
1,МНК,5.954993,0.076271,8e-05,0.705369
2,Дурбин-МакФадден,137.78069,0.127142,-0.000279,0.685245


## Основная часть

In [108]:
def simulate(n_simulations, n_size):
    
    ols_coefs = []
    dmf_coefs = []
    lr_metrics = []
    boost_metrics = []
    ml_coefs = []

    for i in tqdm(range(n_simulations)):
        
        ### Генерируем данные
        df = generate_sample(n_size)
        X, y = df[['income', 'health', 'age']], df['transport']
        df_no_nans = df.dropna()
        X_spend = df_no_nans[['age', 'income', 'drive']]
        y_spend = df_no_nans['spend']
        
        ### МНК
        lm = LinearRegression().fit(X_spend, y_spend)
        ols_coefs.append([lm.intercept_] + list(lm.coef_))
        
        ### DMF
        ## Вероятности
        lr = MNLogit(y, X).fit(disp=0)
        prob_lr = lr.predict()
        pred_lr = np.argmax(prob_lr, axis=1)
        scores_lr = [roc_auc_score(y, prob_lr, multi_class='ovo'),
                  roc_auc_score(y, prob_lr, multi_class='ovr'),
                  accuracy_score(y, pred_lr)]
        lr_metrics.append(scores_lr)
        
        ## Лямбды
        df['lambda1'] = -np.log(prob_lr[:, 0])
        df['lambda2'] = prob_lr[:, 1] * np.log(prob_lr[:, 1]) / (1 - prob_lr[:, 1])
        df['lambda3'] = prob_lr[:, 2] * np.log(prob_lr[:, 2]) / (1 - prob_lr[:, 2])
        df_no_nans = df.dropna()
        X_spend = df_no_nans[['age', 'income', 'drive']]
        y_spend = df_no_nans['spend']
    
        ## Линейная модель
        dmf = LinearRegression().fit(df_no_nans[['age', 'income', 'drive', 
                                                 'lambda1', 'lambda2', 'lambda3']], y_spend)
        dmf_coefs.append([dmf.intercept_] + list(dmf.coef_[:3]))
        
        ### Бустинг
        ## Вероятности
        boosting = CatBoostClassifier(iterations=10, max_depth=3, verbose=0)
        boosting.fit(X, y)
        prob_boost = boosting.predict_proba(X)
        pred_boost = boosting.predict(X)
        scores_boost = [roc_auc_score(y, prob_boost, multi_class='ovo'),
                  roc_auc_score(y, prob_boost, multi_class='ovr'),
                  accuracy_score(y, pred_boost)]
        boost_metrics.append(scores_boost)
        
        ## Лямбды
        df['lambda1'] = -np.log(prob_boost[:, 0])
        df['lambda2'] = prob_boost[:, 1] * np.log(prob_boost[:, 1]) / (1 - prob_boost[:, 1])
        df['lambda3'] = prob_boost[:, 2] * np.log(prob_boost[:, 2]) / (1 - prob_boost[:, 2])
        df_no_nans = df.dropna()
        X_spend = df_no_nans[['age', 'income', 'drive']]
        y_spend = df_no_nans['spend']
    
        ## Линейная модель
        dmf_ml = LinearRegression().fit(df_no_nans[['age', 'income', 'drive', 
                                                 'lambda1', 'lambda2', 'lambda3']], y_spend)
        ml_coefs.append([dmf_ml.intercept_] + list(dmf_ml.coef_[:3]))
        
    return ols_coefs, dmf_coefs, ml_coefs, lr_metrics, boost_metrics
        

In [109]:
def calc_coef_metrics(ests, true, model_name):
    return [model_name] + list(np.sqrt(np.mean((np.array(ests) - true) ** 2, axis=0))) + \
           list(100 * np.mean(np.abs(np.array(ests) - true) / true, axis=0))

def calc_class_metrics(scores, model_name):
    return [model_name] + list(np.mean(scores, axis=0))


## Симуляции

### Выборка размером 1000

In [110]:
np.random.seed(3)

n_simulations = 100
n_size = 1000

ols_coefs, dmf_coefs, ml_coefs, lr_metrics, boost_metrics = simulate(n_simulations, n_size)

  0%|          | 0/100 [00:00<?, ?it/s]

In [111]:
cols_names = ['METHOD', 'RMSE0', 'RMSE1', 'RMSE2', 'RMSE3', 'MAPE0', 'MAPE1', 'MAPE2', 'MAPE3']

results = [calc_coef_metrics(ols_coefs, beta_spend, model_name='OLS'),
           calc_coef_metrics(dmf_coefs, beta_spend, model_name='DMF'),
           calc_coef_metrics(ml_coefs, beta_spend, model_name='ML+DMF')]

pd.DataFrame(results, columns=cols_names)

Unnamed: 0,METHOD,RMSE0,RMSE1,RMSE2,RMSE3,MAPE0,MAPE1,MAPE2,MAPE3
0,OLS,4.294696,0.018879,2.4e-05,0.184355,417.748336,15.470117,22.34378,15.32278
1,DMF,37.898301,0.046763,0.000115,0.185335,2750.187369,36.039133,88.297132,15.212676
2,ML+DMF,6.819891,0.018401,2.4e-05,0.183625,575.202014,15.283977,19.584588,15.255694


In [112]:
cols_names2 = ['METHOD', 'ROC-AUC-OVO', 'ROC-AUC-OVR', 'ACCURACY']

class_metrics = [calc_class_metrics(lr_metrics, 'LogReg'),
                 calc_class_metrics(boost_metrics, 'Boosting')]


pd.DataFrame(class_metrics, columns=cols_names2)

Unnamed: 0,METHOD,ROC-AUC-OVO,ROC-AUC-OVR,ACCURACY
0,LogReg,0.639269,0.652239,0.51069
1,Boosting,0.709452,0.726554,0.57174


### Выборка размером 10 000

In [114]:
np.random.seed(666)

n_simulations = 100
n_size = 10000

ols_coefs, dmf_coefs, ml_coefs, lr_metrics, boost_metrics = simulate(n_simulations, n_size)

  0%|          | 0/100 [00:00<?, ?it/s]

In [116]:
cols_names1 = ['METHOD', 'RMSE0', 'RMSE1', 'RMSE2', 'RMSE3', 'MAPE0', 'MAPE1', 'MAPE2', 'MAPE3']

results = [calc_coef_metrics(ols_coefs, beta_spend, model_name='OLS'),
           calc_coef_metrics(dmf_coefs, beta_spend, model_name='DMF'),
           calc_coef_metrics(ml_coefs, beta_spend, model_name='ML+DMF')]

pd.DataFrame(results, columns=cols_names1)

Unnamed: 0,METHOD,RMSE0,RMSE1,RMSE2,RMSE3,MAPE0,MAPE1,MAPE2,MAPE3
0,OLS,4.145204,0.014282,2.2e-05,0.049932,413.092419,13.779424,22.083714,3.975306
1,DMF,9.557446,0.024918,3.2e-05,0.04972,747.726809,22.761584,26.121042,3.969773
2,ML+DMF,3.377971,0.006836,1.4e-05,0.04986,292.211334,5.484818,11.863996,3.974982


In [117]:
cols_names2 = ['METHOD', 'ROC-AUC-OVO', 'ROC-AUC-OVR', 'ACCURACY']

class_metrics = [calc_class_metrics(lr_metrics, 'LogReg'),
                 calc_class_metrics(boost_metrics, 'Boosting')]


pd.DataFrame(class_metrics, columns=cols_names2)

Unnamed: 0,METHOD,ROC-AUC-OVO,ROC-AUC-OVR,ACCURACY
0,LogReg,0.638456,0.651659,0.509835
1,Boosting,0.686947,0.706533,0.552705
