In [82]:
import numpy as np
from copy import copy
import pandas as pd
from tqdm.auto import tqdm

from scipy.integrate import quad
from scipy.stats import genextreme

from statsmodels.discrete.discrete_model import MNLogit
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVR
from statsmodels.regression.linear_model import OLS
from sklearn.neighbors import KNeighborsRegressor

from catboost import CatBoostClassifier

from sklearn.model_selection import cross_validate, LeaveOneOut
from sklearn.metrics import roc_auc_score, accuracy_score, mean_squared_error

from IPython.display import clear_output
import matplotlib.pyplot as plt 

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.rcParams['figure.figsize'] = 12, 8

## Генерация данных

In [2]:
beta_spend = np.array([1, 0.1, 0.0001, 1])
betas = np.array([[0.1, 0.000025, 0.3, 0.01],
                  [0.2, 0.000015, 0.2, 0.015],
                  [3, -0.00002, 0.5, -0.02]])

In [3]:
def generate_sample(n, betas=betas, beta_spend=beta_spend, verbose=False):
    
    ### Регрессоры
    x0 = np.ones(shape=n)
    x1 = np.exp(np.random.normal(loc=10, scale=0.7, size=n))
    x2 = np.random.poisson(lam=3, size=n)
    x2[x2 > 5] = 5
    x3 = np.round(np.random.uniform(low=20, high=100, size=n))
    x4 = np.random.poisson(lam=3, size=n)
    x4[x4 > 5] = 5
    df = pd.DataFrame(zip(x0, x1, x2, x3, x4),
                      columns=['const', 'income', 'health', 'age', 'drive'])
    
    ### Линейные индексы
    eps = genextreme.rvs(c=0, size=(n, 3))
    y_li = df[['const', 'income', 'health', 'age']] @ betas.T
    df[['y_star_Car', 'y_star_Taxi', 'y_star_Public']] = y_li + eps
    df['transport'] = np.argmax(np.array(df[['y_star_Car', 'y_star_Taxi', 'y_star_Public']]), axis=1)

    if verbose:
        print(df.transport.value_counts())
    
    ### Расходы
    rho = np.array([0.64, -0.25, 0.14])
    mevd, __ = quad(lambda x: genextreme.pdf(x, c=0) * x, -100, 100)
    m2evd, __ = quad(lambda x: genextreme.pdf(x, c=0) * x ** 2, -100, 100)
    vevd = m2evd - mevd ** 2
    vevd_rho = np.sum(vevd * rho ** 2)
    adj = np.sqrt(6) / np.pi
    sigma = 4
    
    eps_spend = sigma * adj * (eps - mevd) @ rho + \
                np.random.normal(size=n, loc=0, scale=np.sqrt(sigma ** 2 - (sigma * adj * np.sqrt(vevd_rho) ** 2)))

    spend_li = df[['const', 'age', 'income', 'drive']] @ beta_spend.T
    df['spend'] = spend_li + eps_spend
    
    df.loc[df['transport'] != 0, 'spend'] = np.nan
    
    return df

## BASS

In [194]:
class BassBoost():
    
    def __init__(self, max_depth, eta=1, max_iter=5, verbose=1):
        self.trees = []
        self.max_depth = max_depth
        self.max_iter = max_iter
        self.eta = eta
        self.verbose = verbose
        self.loss_history = []
        
    def bias_pred(self, X, y):
        pred = np.zeros(X.shape[0])
        
        for tree in range(len(self.trees)):
            pred += tree.predict(X)
        
        return pred
    
    def __calc_grad__(self, y, pred):
        return 2 * (pred - y)
         
    def fit(self, X_index, X_bias, y, init_betas=None):
        target = y.copy()
        
        
        for i in tqdm(range(self.max_iter)):
            if not i:
                if init_betas == None:
                    linear = LinearRegression().fit(X_index, target)
                    self.coef_ = linear.coef_
                    self.intercept_ = linear.intercept_
                
                else:
                    self.intercept_ = init_betas[0]
                    self.coef_ = init_betas[1:]
                    
            if i % 3 == 0:
                
                linear = LinearRegression().fit(X_index, target)
                self.coef_ = linear.coef_
                self.intercept_ = linear.intercept_
                
            error = mean_squared_error(target, (X_index @ self.coef_ + self.intercept_))

            self.loss_history.append(error)
            if not i % self.verbose:
                print(error)
            
                
            grads = self.__calc_grad__(y, X_index @ self.coef_ + self.intercept_)
#             #grads = y - X_index @ self.betas - self.intercept

            tree = DecisionTreeRegressor(max_depth=self.max_depth).fit(X_bias, grads)
            self.trees.append(copy(tree))
            target -= self.eta * tree.predict(X_bias)
                
#             tree = DecisionTreeRegressor(max_depth=self.max_depth)
#             tree.fit(X_bias, target - (X_index @ self.coef_ + self.intercept_))
#             self.trees.append(copy(tree))
#             target -= tree.predict(X_bias)
            
        return 
        
        
#         for i in range(self.max_iter):
#             if not i:
#                 if init_betas == None:
#                     linear = LinearRegression().fit(X_index, target)
#                     self.coef_ = linear.coef_
#                     self.intercept_ = linear.intercept_
                
#                 else:
#                     self.intercept_ = init_betas[0]
#                     self.coef_ = init_betas[1:]
                    
#             if i:
#                 linear = LinearRegression().fit(X_index, target)
#                 self.coef_ = linear.coef_
#                 self.intercept_ = linear.intercept_
#             if not i % self.verbose:
#                 print(mean_squared_error(target, (X_index @ self.coef_ + self.intercept_)))
            
#             tree = DecisionTreeRegressor(max_depth=self.max_depth)
#             tree.fit(X_bias, target - (X_index @ self.coef_ + self.intercept_))
#             self.trees.append(copy(tree))
#             target -= tree.predict(X_bias)
            

        
            
        return 
        
    def predict(self, X, y, decompose=False):
        
        pass
        

## Основные функции

In [195]:
def simulate(n_simulations, n_size):
    
    ols_coefs = []
    dmf_coefs = []
    lr_metrics = []
    boost_metrics = []
    ml_coefs = []

    for i in tqdm(range(n_simulations)):
        
        ### Генерируем данные
        df = generate_sample(n_size)
        X, y = df[['const', 'income', 'health', 'age']], df['transport']
        df_no_nans = df.dropna()
        X_spend = df_no_nans[['age', 'income', 'drive']]
        y_spend = df_no_nans['spend']
        
        ### МНК
        lm = LinearRegression().fit(X_spend, y_spend)
        ols_coefs.append([lm.intercept_] + list(lm.coef_))
        
        ### DMF
        ## Вероятности
        lr = MNLogit(y, X).fit(disp=0)
        prob_lr = lr.predict()
        pred_lr = np.argmax(prob_lr, axis=1)
        scores_lr = [roc_auc_score(y, prob_lr, multi_class='ovo'),
                  roc_auc_score(y, prob_lr, multi_class='ovr'),
                  accuracy_score(y, pred_lr)]
        lr_metrics.append(scores_lr)
        
        ## Лямбды
        df['lambda1'] = -np.log(prob_lr[:, 0])
        df['lambda2'] = prob_lr[:, 1] * np.log(prob_lr[:, 1]) / (1 - prob_lr[:, 1])
        df['lambda3'] = prob_lr[:, 2] * np.log(prob_lr[:, 2]) / (1 - prob_lr[:, 2])
        df_no_nans = df.dropna()
        X_spend = df_no_nans[['age', 'income', 'drive']]
        y_spend = df_no_nans['spend']
    
        ## Линейная модель
        dmf = LinearRegression().fit(df_no_nans[['age', 'income', 'drive', 
                                                 'lambda1', 'lambda2', 'lambda3']], y_spend)
        dmf_coefs.append([dmf.intercept_] + list(dmf.coef_[:3]))
        
        ### BASS
        ## Вероятности
        preds_spend = dmf.predict(df[['age', 'income', 'drive', 
                                      'lambda1', 'lambda2', 'lambda3']])
        df['spend_pred'] = preds_spend
        X_index = df.dropna()[['income', 'health', 'age']]
        X_spend = df.dropna()[['age', 'income', 'drive', 'spend_pred']]
        
        bass = BassBoost(max_depth=3, max_iter=100, eta=1)
        bass.fit(X_index, X_spend, y_spend, [dmf.intercept_] + list(dmf.coef_[:3]))
    
        ml_coefs.append([bass.intercept_] + list(bass.coef_[:3]))
        
    return ols_coefs, dmf_coefs, ml_coefs
        

In [196]:
def calc_coef_metrics(ests, true, model_name):
    return [model_name] + list(np.sqrt(np.mean((np.array(ests) - true) ** 2, axis=0))) + \
           list(100 * np.mean(np.abs(np.array(ests) - true) / true, axis=0))


## Симуляции 1.

### Выборка размером 1 000

In [197]:
np.random.seed(3)

n_simulations = 1
n_size = 1000

ols_coefs, dmf_coefs, ml_coefs = simulate(n_simulations, n_size)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

23.23416050807847
58.07887079127623
127.76829135767173
231.9972444891345
362.7156683354049
529.5261650072313
725.9651690882814
848.5956351590341
1029.7170772749798
1161.5112687850362
1538.5311763905352
2461.7658677711306
1776.7985448026232
7436.008217337597
23887.604227248543
6251.405717759442
119515.40248937331
461948.02845615975
36247.038372647745
2729919.1711615976
10797865.565559616
472593.1176166971
63631839.49214006
255681970.87547302
6480147.7728392035
1526597394.4886124
6138781040.991557
165418366.0951123
36608531524.184265
147229699275.6887
3905727019.4981337
878287228612.9738
3532185429321.855
93990492526.34181
21069647470626.2
84735554280623.28
2253364717399.675
505458024843572.4
2032793243351182.8
54064849336324.53
1.2125830115426756e+16
4.876628272208867e+16
1296970233608457.2
2.908962736528618e+17
1.1698934743409329e+18
3.111422650923309e+16
6.978543281743145e+18
2.8065510053802566e+19
7.464232410540224e+17
1.6741385890883132e+20
6.732859774775165e+20
1.790654839620383e+1

In [193]:
cols_names = ['METHOD', 'RMSE0', 'RMSE1', 'RMSE2', 'RMSE3', 'MAPE0', 'MAPE1', 'MAPE2', 'MAPE3']

results = [calc_coef_metrics(ols_coefs, beta_spend, model_name='OLS'),
           calc_coef_metrics(dmf_coefs, beta_spend, model_name='DMF'),
           calc_coef_metrics(ml_coefs, beta_spend, model_name='BASS')]

pd.DataFrame(results, columns=cols_names)

Unnamed: 0,METHOD,RMSE0,RMSE1,RMSE2,RMSE3,MAPE0,MAPE1,MAPE2,MAPE3
0,OLS,3.492896,0.006219565,1.188846e-05,0.03462181,349.2896,6.219565,11.88846,3.462181
1,DMF,9.228293,0.01873972,4.80098e-05,0.0162606,922.8293,18.73972,48.0098,1.62606
2,BASS,2.531274e+17,25607790000.0,571558700000000.0,4064190000000000.0,2.531274e+19,25607790000000.0,5.715587e+20,4.06419e+17


### Выборка размером 10 000

In [132]:
np.random.seed(3)

n_simulations = 1
n_size = 10000

ols_coefs, dmf_coefs, ml_coefs = simulate(n_simulations, n_size)

  0%|          | 0/1 [00:00<?, ?it/s]

23952959.055271786
361590.1787034595
212509.4050375413
165642.89879088578
98096.67060454527
63753.79733031647
37129.326440278885
25612.23768897432
18533.78132703617
16123.78235263978
12757.988368312737
10543.694658306757
9630.057484906669
8568.961085850542
8005.138106157541
6057.154875652471
5340.14660489373
4746.136389711148
3759.905254523693
3213.146811033765
3064.397958217416
3005.4352595894097
2762.5088805966816
2294.3603557741712
2110.426254543493
2035.661333600095
1834.7529596726351
1639.6349914097864
1564.0633164629974
1498.160410840132
1407.0256252682077
1251.0268956372802
1175.2143869396102
1140.6822784344754
1094.064514543274
1070.0172451774117
1016.8414004031572
929.7667892231432
878.8763472497992
844.3619800344889
835.8916993942908
800.2925109486589
765.5320211860064
758.661273309409
694.1655896964676
682.9813488747782
654.4549237188451
648.4617397528784
625.4324071665123
605.3311567823849
566.3480810446276
556.8373324667301
542.5005578745845
481.38354347230666
427.22924527

5.0106698281967175
4.9859681226890284
4.971550984546776
4.946154139658973
4.926137825445372
4.876416481294153
4.85556456940035
4.828374712270045
4.801568922372334
4.772805617037382
4.752352333838986
4.730013291235135
4.711724504182394
4.6906195102717145
4.676826329383169
4.6442737201338895
4.595267367901204
4.586481890345459
4.562906146464639
4.548559464916731
4.542701000959307
4.510057582295091
4.504170982691798
4.482871577847879
4.45655471247177
4.445321233209644
4.429847892929417
4.388042569889507
4.376274993662487
4.342098493197789
4.314510920252288
4.290619314976464
4.231696283833291
4.201139885625077
4.198754058838119
4.146023441315081
4.12156376015856
4.098913822774884
4.078236574344805
4.03761687573663
4.028729015103053
4.01990036871231
4.010679423001423
3.982085858170517
3.9603730280946903
3.953055932699268
3.9340026869444804
3.9262822625576894
3.919009242203367
3.9113663640353753
3.8974811285891837
3.8838608510613724
3.86972606274663
3.8430133737702414
3.8400281347044447
3.83

0.30818206468818116
0.3073857865526061
0.30676436437229276
0.3052477654132282
0.30446501569965195
0.3039345286166997
0.30298111111384946
0.3026425538042263
0.3013729718215762
0.301013050961043
0.30051439263202895
0.30021792130364416
0.2989519204087496
0.29802305599636214
0.29705958650050296
0.2945698656764357
0.2938293894604714
0.2920703140616632
0.2912951088061495
0.2896750046380806
0.28839351028796784
0.2863620092409762
0.28452103464834133
0.28316079412824646
0.2808465082221576
0.2776414658484292
0.2735352412455725
0.27110460081187987
0.2701768388328059
0.2686191353745628
0.26622897760237746
0.2652210751002585
0.26461707025557535
0.2625270842567233
0.2622401980228001
0.2596062158733721
0.2592840410949749
0.2585221858664909
0.2583897036511304
0.25746232938199987
0.2546227171143436
0.25330310841036396
0.252192101446381
0.2514560052040412
0.25069291370955293
0.2495079987924237
0.2480526949876382
0.24759488446227723
0.24733455532098805
0.24588747864701693
0.24400956173145058
0.2427017654

In [133]:
cols_names = ['METHOD', 'RMSE0', 'RMSE1', 'RMSE2', 'RMSE3', 'MAPE0', 'MAPE1', 'MAPE2', 'MAPE3']

results = [calc_coef_metrics(ols_coefs, beta_spend, model_name='OLS'),
           calc_coef_metrics(dmf_coefs, beta_spend, model_name='DMF'),
           calc_coef_metrics(ml_coefs, beta_spend, model_name='BASS')]

pd.DataFrame(results, columns=cols_names)

Unnamed: 0,METHOD,RMSE0,RMSE1,RMSE2,RMSE3,MAPE0,MAPE1,MAPE2,MAPE3
0,OLS,4.000621,0.0139,1.6e-05,0.01904,400.062109,13.900242,16.252758,1.904032
1,DMF,2.055969,0.001703,2e-06,0.019823,205.59691,1.702765,1.949519,1.982277
2,BASS,246.457607,0.003246,0.380868,0.948519,24645.760665,3.246392,380867.665621,94.851889
