# Problem Set #2
- MACS 30250, Dr. Evans 
- Name: Kento Yoshizawa (CNET: kyoshizawa) 
- Date: May 11, 2020

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the Auto data set, treating '?' as a missing value, and keep only the
# rows that have no missing entries.
df = pd.read_csv('data/Auto.csv', na_values='?').dropna()
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino


In [3]:
# Flag a car as high-mpg (1) when its mpg is at or above the sample median, else 0.
# The median is computed once here instead of once per row inside a comprehension.
mpg_median = np.median(df['mpg'])
df_2 = df.assign(mpg_high=(df['mpg'] >= mpg_median).astype(int))
# One-hot encode `origin` into orgn1/orgn2/orgn3, then drop orgn3 so that
# origin 3 is the reference category (both remaining dummies equal to zero).
df_2 = pd.get_dummies(df_2, prefix='orgn', prefix_sep='', columns=['origin']).drop(columns='orgn3')

### (a)

In [4]:
# Target vector: the binary high-mpg indicator.
y = df_2.loc[:, 'mpg_high'].values
# Feature matrix: every column except the raw mpg, the car name and the target.
X = df_2.drop(['mpg', 'name', 'mpg_high'], axis='columns').values

In [5]:
## Create bootstrap resampled indices
from sklearn.utils import resample

nx = X.shape[0]
ind = np.arange(nx)
n_draw = 100
# Draw every resample from one seeded generator so that the bootstrap samples
# (and the error figures computed from them) are the same on every fresh run.
# Passing the same RandomState instance to each call advances its state, so the
# n_draw samples still differ from one another.
bs_random_state = np.random.RandomState(417)
bs = [resample(ind, replace=True, random_state=bs_random_state) for _ in range(n_draw)]
len(bs)

100

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import time

# Seed reused for the train/test split of every bootstrap sample.
rng = 417
lr = LogisticRegression(solver='lbfgs', multi_class='multinomial', n_jobs=1, max_iter=10000)

# Serial baseline: for each bootstrap sample, hold out 35% of the rows, fit on
# the remainder and store the mean squared error of the 0/1 test predictions.
MSE_serial = np.zeros(n_draw)
start = time.time()
for draw_no, draw_idx in enumerate(bs):
    X_train, X_test, y_train, y_test = train_test_split(
        X[draw_idx], y[draw_idx], test_size=0.35, random_state=rng
    )
    y_pred = lr.fit(X_train, y_train).predict(X_test)
    MSE_serial[draw_no] = np.mean(np.square(y_test - y_pred))
end = time.time() - start

print('Elapsed time for serial computation: {0:.4f}'.format(end), 'sec')
print('Mean squared Error                 : {0:.6f}'.format(MSE_serial.mean()))

Elapsed time for serial computation: 5.0385 sec
Mean squared Error                 : 0.093841


### (b)

In [7]:
import multiprocessing

# CPU count as reported by multiprocessing; used later as the number of dask workers.
num_cores = multiprocessing.cpu_count()
print('Number of available cores is', num_cores)

Number of available cores is 4


In [8]:
def bootstrap_CV(b, X, y, lr, rng=None, test_size=0.35):
    """
    Fit and score a classifier on one bootstrap sample.

    The rows selected by `b` are split into a training and a test part; a
    fresh copy of `lr` is fitted on the training part and its predictions are
    scored on the test part.

    input
    ------
    b: 1d array
        bootstrap resampled row indices into X and y
    X: ndarray
        independent variables (features), indexed by row with `b`
    y: 1d array
        target variable
    lr: object
        regression (classifier) object providing fit() and predict();
        it is copied before fitting, so the caller's object is not modified
    rng: int or None
        random state passed to train_test_split
    test_size: float
        share of the bootstrap sample held out for testing (default 0.35)

    return
    -----
    MSE_i: float
        mean squared error of the predictions on the held-out part
    """
    # Local import so this cell does not depend on an import made elsewhere.
    from sklearn.base import clone

    X_bs = X[b]
    y_bs = y[b]
    X_train, X_test, y_train, y_test = \
                train_test_split(X_bs, y_bs, test_size = test_size, random_state = rng)
    # Fit an unfitted copy rather than the shared estimator: tasks that receive
    # the same `lr` object then cannot overwrite each other's fitted state.
    # safe=False falls back to a deep copy for objects that are not sklearn estimators.
    model = clone(lr, safe=False)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    MSE_i = ((y_test - y_pred)**2).mean()

    return MSE_i

In [9]:
from dask import compute, delayed
import dask.multiprocessing

lr = LogisticRegression(solver = 'lbfgs', multi_class = 'multinomial', n_jobs=1, max_iter = 10000)

start = time.time()
# Build one lazy task per bootstrap sample; nothing is executed until compute().
lazy_values = [delayed(bootstrap_CV)(b, X, y, lr, rng) for b in bs]
# Execute the tasks with the multiprocessing scheduler, using num_cores workers.
MSE_para = compute(*lazy_values, scheduler=dask.multiprocessing.get, num_workers=num_cores)
end = time.time()-start

# This section times the parallel run, so the label says so (it used to read "serial").
print('Elapsed time for parallel computation: {0:.4f}'.format(end), 'sec')
print('Mean squared Error                   : {0:.6f}'.format(np.array(MSE_para).mean()))

Elapsed time for serial computation: 2.4686 sec
Mean squared Error                 : 0.093841
