# Overview
Analyze and manipulate the data from _imports-85.data_ (CSV) and *train_data.json* to improve my data-munging skills in __*Python*__ and __*Spark*__.

## Goal 1 - Find out how the risk breaks down by engine

The "engine" used here is the fifth column of the file (named `aspiration` in the column list further down); it takes two values: `std` and `turbo`.

In [1]:
# Read the raw data file as text and split every record on commas, giving
# one list of string fields per record.
rdd = (
    spark.sparkContext
    .textFile('../data/imports-85.data')
    .map(lambda record: record.split(','))
)
print(rdd.count())
print(rdd.take(2))

205
[[u'3', u'?', u'alfa-romero', u'gas', u'std', u'two', u'convertible', u'rwd', u'front', u'88.60', u'168.80', u'64.10', u'48.80', u'2548', u'dohc', u'four', u'130', u'mpfi', u'3.47', u'2.68', u'9.00', u'111', u'5000', u'21', u'27', u'13495'], [u'3', u'?', u'alfa-romero', u'gas', u'std', u'two', u'convertible', u'rwd', u'front', u'88.60', u'168.80', u'64.10', u'48.80', u'2548', u'dohc', u'four', u'130', u'mpfi', u'3.47', u'2.68', u'9.00', u'111', u'5000', u'21', u'27', u'16500']]


In [2]:
# Key each record by (first field, fifth field) -- the risk rating and the
# std/turbo value -- and attach a count of 1 so the keys can be tallied.
groupData = rdd.map(lambda fields: ((fields[0], fields[4]), 1))
groupData.take(2)

[((u'3', u'std'), 1), ((u'3', u'std'), 1)]

In [3]:
# Sum the per-record 1s for each (risk, engine) key; cache the result because
# the following cells run several actions against it.
groupByRiskAndEngine = groupData.reduceByKey(lambda total, count: total + count).cache()
groupByRiskAndEngine.take(2)

[((u'3', u'std'), 19), ((u'0', u'turbo'), 11)]

In [4]:
# List every (risk, engine) group, largest count first.
print(groupByRiskAndEngine.sortBy(lambda entry: int(entry[1]), ascending=False).collect())

[((u'0', u'std'), 56), ((u'1', u'std'), 47), ((u'2', u'std'), 30), ((u'3', u'std'), 19), ((u'-1', u'std'), 14), ((u'0', u'turbo'), 11), ((u'-1', u'turbo'), 8), ((u'3', u'turbo'), 8), ((u'1', u'turbo'), 7), ((u'2', u'turbo'), 2), ((u'-2', u'std'), 2), ((u'-2', u'turbo'), 1)]


In [5]:
# Split the grouped counts on the engine half of the key and cache both
# subsets; they are reused by the next cell.
stdRiskAndEngine = groupByRiskAndEngine.filter(lambda entry: entry[0][1] == 'std').cache()
turboRiskAndEngine = groupByRiskAndEngine.filter(lambda entry: entry[0][1] == 'turbo').cache()

# Report each subset ordered by descending count.
print('Sorted Risk for Standard Engines: \n%s\n'
      % (stdRiskAndEngine.sortBy(lambda entry: int(entry[1]), ascending=False).collect(),))

print('Sorted Risk for Turbo Engines: \n%s'
      % (turboRiskAndEngine.sortBy(lambda entry: int(entry[1]), ascending=False).collect(),))


Sorted Risk for Standard Engines: 
[((u'0', u'std'), 56), ((u'1', u'std'), 47), ((u'2', u'std'), 30), ((u'3', u'std'), 19), ((u'-1', u'std'), 14), ((u'-2', u'std'), 2)]

Sorted Risk for Turbo Engines: 
[((u'0', u'turbo'), 11), ((u'-1', u'turbo'), 8), ((u'3', u'turbo'), 8), ((u'1', u'turbo'), 7), ((u'2', u'turbo'), 2), ((u'-2', u'turbo'), 1)]


In [6]:
# For each engine subset:
#   count           = sum of the group counts
#   cumulative risk = sum of (risk rating * group count)
#   weighted risk   = cumulative risk / count
stdCount = stdRiskAndEngine.map(lambda entry: int(entry[1])).reduce(lambda total, n: total + n)
stdRisk = stdRiskAndEngine.map(lambda entry: int(entry[0][0]) * entry[1]).reduce(lambda total, n: total + n)
stdRiskWeighted = float(stdRisk) / float(stdCount)

turboCount = turboRiskAndEngine.map(lambda entry: int(entry[1])).reduce(lambda total, n: total + n)
turboRisk = turboRiskAndEngine.map(lambda entry: int(entry[0][0]) * entry[1]).reduce(lambda total, n: total + n)
turboRiskWeighted = float(turboRisk) / float(turboCount)

print('The std cummulative risk is: %d and the turbo cummulative risk is: %d'
      % (stdRisk, turboRisk))
print('The std weighted risk is (%d): %f and the turbo weighted risk is(%d): %f'
      % (stdCount, stdRiskWeighted, turboCount, turboRiskWeighted))

The std cummulative risk is: 146 and the turbo cummulative risk is: 25
The std weighted risk is (168): 0.869048 and the turbo weighted risk is(37): 0.675676


## Goal 2 - Evaluate Cross Val Model

In [12]:
import cPickle
import json
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from score_auto_gbm.FeatureTransformer import FeatureTransformer

In [13]:
def is_number(s):
    """Return True when ``s`` can be converted with ``float()``, else False.

    Parameters
    ----------
    s : object
        Candidate value, e.g. a string such as ``'3.5'`` or a number.

    Returns
    -------
    bool
        True if ``float(s)`` succeeds; False if it raises ``ValueError``
        (unparseable text such as ``'?'``) or ``TypeError`` (a value float()
        cannot accept at all, such as ``None`` or a list).
    """
    try:
        float(s)
    except (TypeError, ValueError):
        # TypeError covers non-string, non-numeric input like None, which
        # would otherwise escape this predicate as an exception.
        return False
    return True

In [14]:
# Restore the previously saved model object from disk; later cells call
# .predict() and .steps on it.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# when it comes from a trusted source.
with open("./score_auto_gbm/gbmFit.pkl", "rb") as modelFile:
    gbmFit = cPickle.load(modelFile)

In [15]:
# The data file has no header row, so the column names are supplied here
# while reading (one name per column, in file order).
data = pd.read_csv(
    '../data/imports-85.data',
    header=None,
    names=[
        u'risk', u'normalizedLosses', u'make', u'fuelType', u'aspiration', u'numDoors',
        u'bodyStyle', u'driveWheels', u'engineLocation', u'wheelBase',
        u'length', u'width', u'height', u'curbWeight', u'engineType',
        u'numCylinders', u'engineSize', u'fuelSystem', u'bore', u'stroke',
        u'compressionRatio', u'horsepower', u'peakRPM', u'cityMPG', u'highwayMPG', u'price',
    ],
)
print(data.index.size)
data.head()

205


Unnamed: 0,risk,normalizedLosses,make,fuelType,aspiration,numDoors,bodyStyle,driveWheels,engineLocation,wheelBase,...,engineSize,fuelSystem,bore,stroke,compressionRatio,horsepower,peakRPM,cityMPG,highwayMPG,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [16]:
# Model inputs: every column from the third onward, i.e. everything except
# the leading `risk` and `normalizedLosses` columns.
# .iloc is the explicit positional indexer; the older .ix indexer is
# deprecated and no longer exists in current pandas.
trainData = data.iloc[:, 2:]

trainData.head()

Unnamed: 0,make,fuelType,aspiration,numDoors,bodyStyle,driveWheels,engineLocation,wheelBase,length,width,...,engineSize,fuelSystem,bore,stroke,compressionRatio,horsepower,peakRPM,cityMPG,highwayMPG,price
0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [16]:
# Score every row on its own so that a single record the model rejects does
# not abort the whole run, then compare each prediction with the recorded
# risk rating (first column of `data`).
predictedActualTuple = []
for idx, row in enumerate(trainData.to_dict(orient='records')):
    try:
        prediction = list(gbmFit.predict(pd.DataFrame([row])))[0]
    except ValueError:
        print ('There was an issue predicting row: %d. The # of elements is: %d. Max size is: %d' % 
               (idx + 1, len(predictedActualTuple), trainData.index.size))
        continue

    # idx is a 0-based position from enumerate, so read the actual value
    # positionally (.iloc) rather than through the removed .ix indexer.
    actualRisk = data.iloc[idx, 0]
    if is_number(actualRisk) and is_number(prediction):
        predictedActualTuple.append((float(prediction), float(actualRisk)))

numberPredictions = len(predictedActualTuple)

# Baseline predictor: the mean of the actual values.
stdPrediction = sum(observed for _, observed in predictedActualTuple) / numberPredictions

# Materialise as lists: both are read more than once below (sum and max),
# which silently yields wrong totals if map() returns a one-shot iterator.
squaredError = [(predicted - observed) ** 2 for predicted, observed in predictedActualTuple]
stdSquaredError = [(stdPrediction - observed) ** 2 for _, observed in predictedActualTuple]

rmse = (sum(squaredError) / numberPredictions) ** 0.5
std_rmse = (sum(stdSquaredError) / numberPredictions) ** 0.5

# Pair with the largest absolute error. Scanning in reverse makes ties resolve
# to the later pair, matching the previous reduce()-based selection.
maxPred, maxAct = max(reversed(predictedActualTuple), key=lambda pair: abs(pair[0] - pair[1]))

print ('\nThe population size was: %d.\nThe squaured error was: %f.\nRMSE: %f.\nMaxed squared error: %f.\nPredicted & Acutal for max diff: %f & %f.\nSTD RMSE: %f' 
       % (numberPredictions,sum(squaredError),rmse, max(squaredError), maxPred, maxAct, std_rmse))


There was an issue predicting row: 10. The # of elements is: 9. Max size is: 205
There was an issue predicting row: 45. The # of elements is: 43. Max size is: 205
There was an issue predicting row: 46. The # of elements is: 43. Max size is: 205
There was an issue predicting row: 56. The # of elements is: 52. Max size is: 205
There was an issue predicting row: 57. The # of elements is: 52. Max size is: 205
There was an issue predicting row: 58. The # of elements is: 52. Max size is: 205
There was an issue predicting row: 59. The # of elements is: 52. Max size is: 205
There was an issue predicting row: 130. The # of elements is: 122. Max size is: 205
There was an issue predicting row: 131. The # of elements is: 122. Max size is: 205
There was an issue predicting row: 132. The # of elements is: 122. Max size is: 205

The population size was: 195.
The squaured error was: 10.487928.
RMSE: 0.231914.
Maxed squared error: 3.999332.
Predicted & Acutal for max diff: 0.999833 & -1.000000.
STD RMS

In [2]:
# Scratch check of the squared-error arithmetic on two hand-made pairs.
test = [(0.99, 1.00), (1.01, 0.98)]
testSquareError = map(lambda pair: (pair[1] - pair[0])**2, test)
print(test)
print(testSquareError)
print(sum(testSquareError))
# Same total via an explicit fold, rounded to hide floating-point noise.
print(round(float(reduce(lambda running, err: float(running + err), testSquareError)), 10))

[(0.99, 1.0), (1.01, 0.98)]
[0.00010000000000000018, 0.0009000000000000016]
0.001
0.001


In [3]:
# Show the rows that could not be predicted. The positions are the reported
# row numbers minus one (the log prints idx + 1); position 57 (reported as
# row 58) was missing from the list and is included here.
# The author's note says these are the rows with ? in them.
# TODO(review): the original note trailed off with an unfinished question
# ("Then why does ..."); restate it once the intent is known.
trainData.iloc[[9, 44, 45, 55, 56, 57, 58, 129, 130, 131], :]

NameError: name 'trainData' is not defined

In [14]:
# Scratch loop: label each of 0..9 as even or not. Each message is printed
# as a (text, number) tuple.
for i in range(10):
    if i % 2 != 0:
        print(('This is not even', i))
    else:
        print(('This is even.', i))

('This is even.', 0)
('This is not even', 1)
('This is even.', 2)
('This is not even', 3)
('This is even.', 4)
('This is not even', 5)
('This is even.', 6)
('This is not even', 7)
('This is even.', 8)
('This is not even', 9)


Get the model from the pipeline

## Goal 3 - Use munging to explain the effectiveness 

#### Get model from the pipeline. Can use index or named_steps per [this article](http://stackoverflow.com/questions/28822756/getting-model-attributes-from-scikit-learn-pipeline)

In [24]:
# Second pipeline step -> (name, estimator); keep only the estimator.
# The previous `trainData.columns, ...` prefix bound `gbm` to a tuple, so
# attribute access such as gbm.feature_importances_ could not work.
gbm = gbmFit.steps[1][1]

### Use __*feature importance*__ to analyze effectiveness.

In [23]:
# Pair every input column name with its feature importance and list the
# pairs from most to least important.
# The estimator is read straight from the pipeline so this cell does not
# depend on how `gbm` was bound earlier.
# NOTE(review): assumes the importances are in the same order as
# trainData.columns -- confirm against the pipeline's feature transformer.
importantFeatures = sorted(
    zip(trainData.columns, gbmFit.steps[1][1].feature_importances_),
    key=lambda pair: -pair[1],
)
importantFeatures

[(u'engineType', 0.15571109118551155),
 (u'fuelType', 0.060118267903031167),
 (u'driveWheels', 0.058627068812926453),
 (u'curbWeight', 0.025824447064762068),
 (u'numDoors', 0.017810109459157425),
 (u'wheelBase', 0.013704405797728712),
 (u'make', 0.012053462066536333),
 (u'length', 0.006637071354084788),
 (u'width', 0.0056641482321591295),
 (u'aspiration', 0.00549890366837696),
 (u'height', 0.0048188934320857199),
 (u'numCylinders', 0.0043726511534183977),
 (u'stroke', 0.002740747864547402),
 (u'engineLocation', 0.0020267456757935942),
 (u'horsepower', 0.0013810985295654773),
 (u'bodyStyle', 0.0011872997888196213),
 (u'peakRPM', 0.00031835359842260824),
 (u'price', 0.00028688828332065297),
 (u'compressionRatio', 0.00011359836786870279),
 (u'engineSize', 0.0),
 (u'fuelSystem', 0.0),
 (u'bore', 0.0),
 (u'cityMPG', 0.0),
 (u'highwayMPG', 0.0)]

### Use  __*partial dependence*__ to analyze effectiveness.

### Use a leave-out flag to evaluate the model on the training data