In [1]:
from lib.project_5 import load_data_from_database, make_data_dict, general_model, general_transformer

# Step 3 - Build Model

**NOTE: EACH OF THESE SHOULD BE WRITTEN SOLELY WITH REGARD TO STEP 3 - Build Model**

### Domain and Data

MADELON is an artificial dataset that was created for a feature selection challenge. The difficulty is that the dataset has 500 features and the relationship between those features and the target is highly non-linear.

Instances: 2000
Features: 500

### Problem Statement

We would like to build the model with the greatest accuracy.

### Solution Statement

We build the model as a pipeline of steps and use `GridSearchCV` to find the optimal hyperparameters for each classifier.

### Metric

We will use the accuracy score as our metric to identify optimal parameters for our model.

### Benchmark

Our benchmark accuracy is 0.85 using SelectKBest and KNeighborsClassifier in a pipeline. 

## Implementation

Implement the following code pipeline using the functions you write in `lib/project_5.py`.

<img src="assets/build_model.png" width="600px">

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import pandas as pd

In [3]:
# Load the project dataset via the helper imported from lib/project_5.py.
# NOTE(review): presumably returns the MADELON table as a DataFrame — confirm in the helper.
df = load_data_from_database()

In [4]:
# Build the shared state dict that every later transformer/model cell receives and rebinds.
# NOTE(review): presumably performs the train/test split, seeded by random_state — confirm.
data_dict = make_data_dict(df, random_state=32)

In [5]:
# Pipeline step 1: pass a StandardScaler through the project's transformer helper.
# NOTE(review): data_dict is rebound to the helper's result, so this cell is presumably
# not idempotent — run it once per fresh kernel.
data_dict = general_transformer(StandardScaler(), data_dict)

In [6]:
# Pipeline step 2: feature selection with SelectKBest at its defaults
# (the recorded models listing shows k=10 with f_classif).
# NOTE(review): the third positional argument (32) is presumably a random_state — confirm in lib/project_5.py.
data_dict = general_transformer(SelectKBest(), data_dict, 32)

In [7]:
# Baseline classifier 1: k-nearest neighbours with default hyperparameters.
# NOTE(review): the third positional argument (32) is presumably a random_state — confirm in lib/project_5.py.
data_dict = general_model(KNeighborsClassifier(), data_dict, 32)

In [8]:
# Baseline classifier 2: logistic regression.
# The solver is pinned to liblinear because this estimator is later grid-searched over
# penalty=['l1', 'l2']; liblinear supports both, whereas the default solver in newer
# scikit-learn releases (lbfgs) rejects 'l1'. liblinear is also the solver shown in the
# recorded output, so results are unchanged on the original environment.
# NOTE(review): the third positional argument (32) is presumably a random_state — confirm in lib/project_5.py.
data_dict = general_model(LogisticRegression(solver='liblinear'), data_dict, 32)

In [9]:
# Inspect the estimators recorded so far; later cells pick them out of this list by position.
data_dict['models']

[StandardScaler(copy=True, with_mean=True, with_std=True),
 SelectKBest(k=10, score_func=<function f_classif at 0x1153b4938>),
 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
            metric_params=None, n_jobs=1, n_neighbors=5, p=2,
            weights='uniform'),
 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False)]

In [9]:
# Position 2 is the KNeighborsClassifier in the models listing displayed above
# (scaler, selector, KNN, logistic regression).
knn = data_dict['models'][2]

In [10]:
# Hyperparameter grid for KNN: neighbourhood sizes 1 through 20, each tried with
# uniform and distance-based vote weighting (40 combinations in total).
params = {
    'n_neighbors': range(1, 21),
    'weights': ['uniform', 'distance'],
}

In [11]:
# Exhaustive search over the KNN grid, scored by accuracy (the notebook's stated metric;
# also what a classifier's default scorer computes, so this is explicit rather than new).
# cv=3 and return_train_score=True are pinned because the recorded cv_results_ table has
# three splits and train-score columns; newer scikit-learn releases default to 5 folds
# and omit train scores, which would silently change the outputs shown below.
knn_clf = GridSearchCV(knn, param_grid=params, scoring='accuracy', cv=3,
                       return_train_score=True)

In [12]:
# Run the KNN grid search through the same helper used for the plain models.
# The returned dict's 'models' list also holds the search object, which the next cell reads back.
grid = general_model(knn_clf, data_dict, 32)

In [13]:
# The 'models' list grows by one entry per general_model call (see the models listing
# earlier in the notebook), so the search fitted in the previous cell is the last entry.
# Taking [-1] rather than a hard-coded position keeps this correct when earlier model
# cells are re-run or new ones are added before it.
knn_model = grid['models'][-1]

In [14]:
# The KNN estimator configured with the best parameter combination found by the search.
knn_model.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=4, p=2,
           weights='distance')

In [15]:
# Winning hyperparameter combination from the KNN grid.
knn_model.best_params_

{'n_neighbors': 4, 'weights': 'distance'}

In [16]:
# Mean cross-validated accuracy of the best KNN setting — compare with the 0.85 benchmark stated above.
knn_model.best_score_

0.85199999999999998

In [17]:
# Position 3 is the LogisticRegression in the models listing displayed earlier
# (scaler, selector, KNN, logistic regression).
lr = data_dict['models'][3]

In [18]:
# Hyperparameter grid for logistic regression: five small values of C (i.e. strong
# regularisation), each tried with L1 and L2 penalties. Rebinds `params`, replacing the KNN grid.
params = {
    'C': [0.01, 0.02, 0.03, 0.04, 0.05],
    'penalty': ['l1', 'l2'],
}

In [19]:
# Exhaustive search over the logistic-regression grid, scored by accuracy (the notebook's
# stated metric; also what a classifier's default scorer computes).
# cv=3 and return_train_score=True are pinned to the behaviour of the scikit-learn release
# that produced the recorded outputs; newer releases default to 5 folds and omit train
# scores, which would silently change the reported best_score_.
lr_clf = GridSearchCV(lr, param_grid=params, scoring='accuracy', cv=3,
                      return_train_score=True)

In [20]:
# Run the logistic-regression grid search through the shared model helper.
# This rebinds `grid`; the KNN search stays reachable through `knn_model`.
grid = general_model(lr_clf, data_dict, 32)

In [21]:
# The 'models' list grows by one entry per general_model call, so the search fitted in the
# previous cell is the last entry. A hard-coded position would point at a different object
# (e.g. a second KNN search) if any earlier model cell had been re-run in this kernel.
lr_model = grid['models'][-1]

In [22]:
# The logistic-regression estimator configured with the best parameter combination found.
lr_model.best_estimator_

LogisticRegression(C=0.02, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [23]:
# Winning hyperparameter combination from the logistic-regression grid.
lr_model.best_params_

{'C': 0.02, 'penalty': 'l1'}

In [24]:
# Mean cross-validated accuracy of the best logistic-regression setting — compare with the KNN result.
lr_model.best_score_

0.61066666666666669

In [26]:
# First five rows of the per-candidate KNN search results
# (fit/score timings, per-split and mean train/test scores, and test-score rank).
pd.DataFrame(knn_model.cv_results_).head()

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_n_neighbors,param_weights,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.003375,0.006592,0.836667,1.0,1,uniform,"{u'n_neighbors': 1, u'weights': u'uniform'}",12,0.838,1.0,0.84,1.0,0.832,1.0,0.001337,0.000977,0.003399,0.0
1,0.002475,0.005405,0.836667,1.0,1,distance,"{u'n_neighbors': 1, u'weights': u'distance'}",12,0.838,1.0,0.84,1.0,0.832,1.0,0.000518,0.000486,0.003399,0.0
2,0.00211,0.005376,0.814,0.912667,2,uniform,"{u'n_neighbors': 2, u'weights': u'uniform'}",33,0.828,0.907,0.81,0.914,0.804,0.917,2.6e-05,6.2e-05,0.010198,0.00419
3,0.002779,0.006057,0.836667,1.0,2,distance,"{u'n_neighbors': 2, u'weights': u'distance'}",12,0.838,1.0,0.84,1.0,0.832,1.0,0.00087,0.00086,0.003399,0.0
4,0.002277,0.007496,0.846667,0.925333,3,uniform,"{u'n_neighbors': 3, u'weights': u'uniform'}",5,0.842,0.923,0.866,0.919,0.832,0.934,0.000202,0.001942,0.014267,0.006342
