# Bootstrap solutions 

_Author: Christoph Rahmede_

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Scale up seaborn's fonts for every plot drawn in this notebook.
sns.set(font_scale=1.5)

# Render figures inline, using the 'retina' figure format.
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# NOTE(review): the 'ignore' filter silences *all* warnings for the rest of
# the session, including deprecation and convergence warnings raised by the
# model fits further down. Remove it while debugging.
import warnings
warnings.simplefilter('ignore')

In [2]:
# Load the whitespace-delimited auto-mpg file; it has no header row, so the
# column names are assigned by hand below.
# The separator is written as a raw string so that `\s` reaches the regex
# engine untouched instead of being parsed as an (invalid) string escape.
df = pd.read_csv(
    '../../../dataset-collection/auto-mpg/auto-mpg.csv', sep=r'\s+', header=None)
df.columns = ['mpg', 'cylinders', 'displacement', 'horsepower',
              'weight', 'acceleration', 'year', 'origin', 'name']

# Rows whose horsepower is recorded as '?' are discarded so that the column
# can be cast to float.
df = df.drop(df[df.horsepower == '?'].index)
df['horsepower'] = df.horsepower.astype(float)

# The brand is the first word of the car name; known misspellings and
# aliases are mapped onto one canonical spelling.
df['brand'] = (
    df.name
    .map(lambda x: x.split(' ')[0])
    .replace(
        {'toyouta': 'toyota',
         'maxda': 'mazda',
         'chevroelt': 'chevrolet',
         'vw': 'volkswagen',
         'vokswagen': 'volkswagen',
         'mercedes-benz': 'mercedes'})
)

# this time we reset the index - this is important
# (dropping rows left gaps in the labels; the old labels are kept in a new
# 'index' column because `drop=True` is not passed).
df = df.reset_index()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 11 columns):
index           392 non-null int64
mpg             392 non-null float64
cylinders       392 non-null int64
displacement    392 non-null float64
horsepower      392 non-null float64
weight          392 non-null float64
acceleration    392 non-null float64
year            392 non-null int64
origin          392 non-null int64
name            392 non-null object
brand           392 non-null object
dtypes: float64(5), int64(4), object(2)
memory usage: 33.8+ KB


In [3]:
import statsmodels.api as sm
from arch.bootstrap import IIDBootstrap

#### Write a new wrapper that makes it possible to determine confidence intervals when using the Lasso

In [4]:
from sklearn.linear_model import Lasso
def lasso_wrap(endog, exog, alpha=1):
    """Fit a Lasso model and return its parameters as one flat array.

    The signature (keyword arguments ``endog`` and ``exog`` plus optional
    extras) is what the bootstrap calls below pass to the statistic function.

    Parameters
    ----------
    endog : array-like
        Target values passed to ``Lasso.fit`` as ``y``.
    exog : array-like
        Predictor matrix passed to ``Lasso.fit`` as ``X``.
    alpha : float, default 1
        Value forwarded to ``Lasso(alpha=...)``.

    Returns
    -------
    numpy.ndarray
        1-D array laid out as ``[intercept(s)..., coefficient(s)..., score]``,
        where ``score`` is ``model.score(exog, endog)`` on the data used for
        the fit.
    """
    model = Lasso(alpha=alpha)
    model.fit(exog, endog)

    # `intercept_` is a scalar for a 1-D target but an array for a 2-D one.
    # Flattening every piece before joining keeps the result a flat numeric
    # array in both cases (wrapping the intercept in a list would not).
    return np.concatenate([
        np.ravel(model.intercept_),
        np.ravel(model.coef_),
        [model.score(exog, endog)],
    ])

# Quick check on the full sample with the default alpha; the result is laid
# out as [intercept, horsepower coefficient, model.score].
lasso_wrap(df['mpg'], df[['horsepower']])

array([39.86516803, -0.15716805,  0.60593712])

In [5]:
# Percentile bootstrap intervals for the Lasso wrapper's output
# (1000 replications, seed 10, alpha=30 forwarded to the wrapper).
bs = IIDBootstrap(endog=df['mpg'], exog=df[['horsepower']])
bs.seed(10)
ci = pd.DataFrame(
    bs.conf_int(
        lasso_wrap,
        reps=1000,
        method='percentile',
        extra_kwargs={'alpha': 30},
    ),
    index=['Lower', 'Upper'],
)
ci

Unnamed: 0,0,1,2
Lower,36.310036,-0.150327,0.550093
Upper,39.353966,-0.12572,0.644238


#### Write a new wrapper that makes it possible to use Logistic Regression

- Predict `origin` as the outcome variable. 
- To compare with statsmodels' `sm.Logit` you will have to use binary logistic regression (e.g. by using `df['origin']==1` as the outcome).
- If you want to make your wrapper work for multiple classes, you will have to flatten the array of coefficients.

In [6]:
# Reference fit with statsmodels: binary outcome "origin equals 1" regressed
# on mpg and horsepower, with an explicit constant column added.
results_1 = sm.Logit(
    endog=df.origin == 1,
    exog=sm.add_constant(df[['mpg', 'horsepower']]),
).fit()
results_1.summary()

Optimization terminated successfully.
         Current function value: 0.467744
         Iterations 7


0,1,2,3
Dep. Variable:,origin,No. Observations:,392.0
Model:,Logit,Df Residuals:,389.0
Method:,MLE,Df Model:,2.0
Date:,"Sun, 10 Feb 2019",Pseudo R-squ.:,0.293
Time:,20:44:54,Log-Likelihood:,-183.36
converged:,True,LL-Null:,-259.33
,,LLR p-value:,1.0079999999999999e-33

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.2542,1.284,0.977,0.329,-1.262,3.770
mpg,-0.1250,0.028,-4.459,0.000,-0.180,-0.070
horsepower,0.0244,0.008,3.175,0.001,0.009,0.040


In [7]:
from sklearn.linear_model import LogisticRegression

In [8]:
def logreg_wrap(endog, exog, C=10**10):
    """Fit a logistic regression and return its parameters as one flat array.

    Parameters
    ----------
    endog : array-like
        Class labels passed to ``LogisticRegression.fit`` as ``y``.
    exog : array-like
        Predictor matrix passed to ``LogisticRegression.fit`` as ``X``.
    C : float, default 10**10
        Value forwarded to ``LogisticRegression(C=...)``. The default is the
        very large value this notebook uses when comparing against
        ``sm.Logit``; it can be overridden through the bootstrap's
        ``extra_kwargs``, in the same way ``alpha`` is for ``lasso_wrap``.

    Returns
    -------
    numpy.ndarray
        1-D array laid out as ``[intercept(s)..., coefficients..., score]``.
        The coefficient matrix is flattened row by row, and ``score`` is
        ``model.score(exog, endog)`` on the data used for the fit.
    """
    model = LogisticRegression(C=C)
    model.fit(exog, endog)
    # Flatten every piece so binary and multiclass fits both give a 1-D array.
    return np.concatenate([
        np.ravel(model.intercept_),
        np.ravel(model.coef_),
        [model.score(exog, endog)],
    ])

Binary classification:

In [9]:
# Percentile bootstrap intervals for the logistic wrapper with the binary
# outcome "origin equals 1" (1000 replications, seed 10).
bs = IIDBootstrap(endog=df['origin'].eq(1), exog=df[['mpg', 'horsepower']])
bs.seed(10)
ci = pd.DataFrame(
    bs.conf_int(logreg_wrap, reps=1000, method='percentile'),
    index=['Lower', 'Upper'],
)
ci

Unnamed: 0,0,1,2,3
Lower,-1.0935,-0.186748,0.012917,0.716837
Upper,3.657869,-0.076508,0.038427,0.806122


Multiclass classification:

In [10]:
# Same percentile bootstrap, but with the raw `origin` labels as the outcome,
# so the wrapper returns the flattened intercepts and coefficients of every
# class followed by the score (1000 replications, seed 10).
bs = IIDBootstrap(endog=df['origin'], exog=df[['mpg', 'horsepower']])
bs.seed(10)
ci = pd.DataFrame(
    bs.conf_int(logreg_wrap, reps=1000, method='percentile'),
    index=['Lower', 'Upper'],
)
ci

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Lower,-1.0935,0.000221,-6.804149,-0.186748,0.012917,-0.065109,-0.056782,0.090394,-0.020255,0.647895
Upper,3.657869,4.884507,-2.188779,-0.076508,0.038427,0.032687,-0.022628,0.202965,0.003759,0.742411
