# Boosting Model for Predicting Car Risk
Got the code from [Gradient Boosting Regressor](http://docs.opendatagroup.com/docs/example-gradient-boosting-regressor)

In [1]:
import cPickle
import json
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from score_auto_gbm.FeatureTransformer import FeatureTransformer

In [2]:
in_data = pd.read_json("../data/train_data.json", orient = "records")
X = in_data.drop("risk", 1)
y = np.array(in_data["risk"])

In [3]:
print in_data.columns
print in_data.head()

Index([u'aspiration', u'bodyStyle', u'bore', u'cityMPG', u'compressionRatio',
       u'curbWeight', u'driveWheels', u'engineLocation', u'engineSize',
       u'engineType', u'fuelSystem', u'fuelType', u'height', u'highwayMPG',
       u'horsepower', u'length', u'make', u'numCylinders', u'numDoors',
       u'peakRPM', u'price', u'risk', u'stroke', u'wheelBase', u'width'],
      dtype='object')
  aspiration    bodyStyle  bore  cityMPG  compressionRatio  curbWeight  \
0        std  convertible  3.47       21               9.0        2548   
1        std  convertible  3.47       21               9.0        2548   
2        std    hatchback  2.68       19               9.0        2823   
3        std        sedan  3.19       18               8.0        2824   
4        std        sedan  3.19       19               8.5        2507   

  driveWheels engineLocation  engineSize engineType  ...   length  \
0         rwd          front         130       dohc  ...    168.8   
1         rwd          

In [5]:
in_data.select("fuelSystem", "fuelType").head()

ValueError: No axis named fuelType for object type <class 'pandas.core.frame.DataFrame'>

### Train model

In [4]:
preprocess = FeatureTransformer()

gbm = GradientBoostingRegressor(learning_rate = 0.1, 
                               random_state = 1234)

pipe = Pipeline([("preprocess", preprocess), ("gbm", gbm)])

gbm_cv = GridSearchCV(pipe,
                     dict(gbm__n_estimators = [50, 100, 150, 200],
                         gbm__max_depth = [5, 6, 7, 8, 9, 10]),
                     cv = 5,
                     scoring = make_scorer(mean_squared_error),
                     verbose = 100)
gbm_cv.fit(X, y)

gbmFit = gbm_cv.best_estimator_

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] gbm__max_depth=5, gbm__n_estimators=50 ..........................
[CV]  gbm__max_depth=5, gbm__n_estimators=50, score=1.390660, total=   0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[CV] gbm__max_depth=5, gbm__n_estimators=50 ..........................
[CV]  gbm__max_depth=5, gbm__n_estimators=50, score=0.884076, total=   0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[CV] gbm__max_depth=5, gbm__n_estimators=50 ..........................
[CV]  gbm__max_depth=5, gbm__n_estimators=50, score=0.498074, total=   0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s remaining:    0.0s
[CV] gbm__max_depth=5, gbm__n_estimators=50 ..........................
[CV]  gbm__max_depth=5, gbm__n_estimators=50, score=0.800251, total=   0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.4s remaining:    0.0s
[CV] gbm__max_depth=5, gbm__n_

## Predictions

In [5]:
def score(record):
    datum = json.loads(record)
    score = list(gbmFit.predict(pd.DataFrame([datum])))[0]
    return json.dumps(score)

In [8]:
with open("../data/gbm_input_data_multiline.json", "rb") as f:
    for line in f:
        print score(line)

0.78488308928032324
0.99978623622995488
0.14520131926120264
0.00047602695754688152
0.99991458040281866
0.99991458040281866
1.9995876660931677
1.3224157902700489
1.0001180249460448
1.1177808900433042
0.0001946819176505984
3.0932028320970658e-05
0.00084216500537457012
1.9995378874905989
1.9995378874905989
1.1850050689999387
1.763893343895927
0.99983306050542853
1.0001046276155432
0.99973584636241164
0.00024021999504945344
7.3299588672116753e-05
2.9996514167995838
-0.52638308075100781
0.016724337306485742
0.016724337306485742
-2.8204923308308159e-05
-0.00010477766387946928
1.0002100842451624
1.1850050689999387
2.99962270583951
2.9996243078606204
1.9996073670387611
0.00029127440377646317
0.0001946819176505984
0.99999070863700634
0.00025611299138624243
0.00028333394568963352
0.00022805533266072081
1.9998561788680871
-0.99962152697975848
-0.99962152697975848
-0.99962152697975848
1.9996012026719876
0.00017328946984374494
-0.99980159382466594
-0.99980159382466594
