# Overview
Try to analyze and manipulate the data from _imports-85.data_ (CSV) and *train_data.json* to improve my data-munging skills in __*python*__ and __*Spark*__.

## Goal 1 - Find out how the risk breaks down by engine

Engine type is the fifth column and has values; `std` and `turbo`

In [16]:
rdd = spark.sparkContext.textFile('../data/imports-85.data').map(lambda line: line.split(','))
print rdd.count()
print rdd.take(2)

205
[[u'3', u'?', u'alfa-romero', u'gas', u'std', u'two', u'convertible', u'rwd', u'front', u'88.60', u'168.80', u'64.10', u'48.80', u'2548', u'dohc', u'four', u'130', u'mpfi', u'3.47', u'2.68', u'9.00', u'111', u'5000', u'21', u'27', u'13495'], [u'3', u'?', u'alfa-romero', u'gas', u'std', u'two', u'convertible', u'rwd', u'front', u'88.60', u'168.80', u'64.10', u'48.80', u'2548', u'dohc', u'four', u'130', u'mpfi', u'3.47', u'2.68', u'9.00', u'111', u'5000', u'21', u'27', u'16500']]


In [2]:
groupData = rdd.map(lambda line: ((line[0], line[4]), 1))
groupData.take(2)

[((u'3', u'std'), 1), ((u'3', u'std'), 1)]

In [7]:
groupByRiskAndEngine = groupData.reduceByKey(lambda x, y: x+y)
groupByRiskAndEngine.cache()
groupByRiskAndEngine.take(2)

[((u'3', u'std'), 19), ((u'0', u'turbo'), 11)]

In [15]:
print groupByRiskAndEngine.sortBy(lambda pair: int(pair[1]), False).collect()

[((u'0', u'std'), 56), ((u'1', u'std'), 47), ((u'2', u'std'), 30), ((u'3', u'std'), 19), ((u'-1', u'std'), 14), ((u'0', u'turbo'), 11), ((u'-1', u'turbo'), 8), ((u'3', u'turbo'), 8), ((u'1', u'turbo'), 7), ((u'2', u'turbo'), 2), ((u'-2', u'std'), 2), ((u'-2', u'turbo'), 1)]


In [21]:
stdRiskAndEngine = groupByRiskAndEngine.filter(lambda pair: pair[0][1]=='std')
turboRiskAndEngine = groupByRiskAndEngine.filter(lambda pair: pair[0][1]=='turbo')

stdRiskAndEngine.cache()
turboRiskAndEngine.cache()

print('Sorted Risk for Standard Engines: \n%s\n' % (stdRiskAndEngine.sortBy(lambda pair: int(pair[1]), False).collect()))

print('Sorted Risk for Turbo Engines: \n%s' % (turboRiskAndEngine.sortBy(lambda pair: int(pair[1]), False).collect()))


Sorted Risk for Standard Engines: 
[((u'0', u'std'), 56), ((u'1', u'std'), 47), ((u'2', u'std'), 30), ((u'3', u'std'), 19), ((u'-1', u'std'), 14), ((u'-2', u'std'), 2)]

Sorted Risk for Standard Engines: 
[((u'0', u'turbo'), 11), ((u'-1', u'turbo'), 8), ((u'3', u'turbo'), 8), ((u'1', u'turbo'), 7), ((u'2', u'turbo'), 2), ((u'-2', u'turbo'), 1)]


In [26]:
stdRisk = stdRiskAndEngine.map(lambda pair: int(pair[0][0]) * pair[1]).reduce(lambda x,y:x+y)
stdCount = stdRiskAndEngine.map(lambda pair: int(pair[1])).reduce(lambda x,y:x+y)
stdRiskWeighted = float(stdRisk)/float(stdCount)

turboRisk = turboRiskAndEngine.map(lambda pair: int(pair[0][0]) * pair[1]).reduce(lambda x,y:x+y)
turboCount = turboRiskAndEngine.map(lambda pair: int(pair[1])).reduce(lambda x,y:x+y)
turboRiskWeighted = float(turboRisk)/float(turboCount)

print('The std cummulative risk is: %d and the turbo cummulative risk is: %d' % (stdRisk, turboRisk))
print('The std weighted risk is (%d): %f and the turbo weighted risk is(%d): %f' % (stdCount, stdRiskWeighted, turboCount, turboRiskWeighted))

The std cummulative risk is: 146 and the turbo cummulative risk is: 25
The std weighted risk is (168): 0.869048 and the turbo weighted risk is(37): 0.675676


## Goal 2 - Evaluate Cross Val Model

In [28]:
import cPickle
import json
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from score_auto_gbm.FeatureTransformer import FeatureTransformer

In [30]:
with open("./score_auto_gbm/gbmFit.pkl", "rb") as pickle_file:
    gbmFit = cPickle.load(pickle_file)

In [37]:
data = pd.read_csv('../data/imports-85.data', header=None)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [40]:
trainData = data.ix[:,1:]
trainData.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,16,17,18,19,20,21,22,23,24,25
0,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,164,audi,gas,std,four,sedan,fwd,front,99.8,176.6,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,164,audi,gas,std,four,sedan,4wd,front,99.4,176.6,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [41]:
for idx, line in enumerate(trainData):
    print('Prediction: %s, Actual: %s.\n' % (list(gbmFit.predict(pd.DataFrame([line])))[0], data.ix[idx,0]))
    if idx > 5:
        break


ValueError: labels [u'bore' u'cityMPG' u'compressionRatio' u'curbWeight' u'engineSize'
 u'height' u'highwayMPG' u'horsepower' u'length' u'peakRPM' u'price'
 u'stroke' u'wheelBase' u'width'] not contained in axis