# Overview
This notebook analyzes and manipulates the data from _imports-85.data_ (CSV) and *train_data.json* to improve my data-munging skills in __*Python*__ and __*Spark*__.

## Goal 1 - Find out how the risk breaks down by engine

Engine type is the fifth column (`aspiration`) and takes two values: `std` and `turbo`.

In [1]:
# Read the raw file as an RDD of text lines, then split every line on commas
# so each record becomes a list of string fields.
rdd = (
    spark.sparkContext
    .textFile('../data/imports-85.data')
    .map(lambda row: row.split(','))
)
print(rdd.count())
print(rdd.take(2))

205
[[u'3', u'?', u'alfa-romero', u'gas', u'std', u'two', u'convertible', u'rwd', u'front', u'88.60', u'168.80', u'64.10', u'48.80', u'2548', u'dohc', u'four', u'130', u'mpfi', u'3.47', u'2.68', u'9.00', u'111', u'5000', u'21', u'27', u'13495'], [u'3', u'?', u'alfa-romero', u'gas', u'std', u'two', u'convertible', u'rwd', u'front', u'88.60', u'168.80', u'64.10', u'48.80', u'2548', u'dohc', u'four', u'130', u'mpfi', u'3.47', u'2.68', u'9.00', u'111', u'5000', u'21', u'27', u'16500']]


In [2]:
def _riskAndEngineKey(fields):
    """Turn one split record into ((field 0, field 4), 1).

    Field 0 is the risk rating and field 4 is the engine type (std/turbo);
    the trailing 1 lets occurrences be summed per key afterwards.
    """
    return ((fields[0], fields[4]), 1)


groupData = rdd.map(_riskAndEngineKey)
groupData.take(2)

[((u'3', u'std'), 1), ((u'3', u'std'), 1)]

In [3]:
# Add up the per-record ones for every (risk, engine) key. cache() marks the
# RDD for reuse and returns the same RDD, so it can be chained onto the reduce.
groupByRiskAndEngine = groupData.reduceByKey(lambda left, right: left + right).cache()
groupByRiskAndEngine.take(2)

[((u'3', u'std'), 19), ((u'0', u'turbo'), 11)]

In [4]:
# List every (risk, engine) group ordered by its count, largest first.
byCountDescending = groupByRiskAndEngine.sortBy(lambda kv: int(kv[1]), ascending=False)
print(byCountDescending.collect())

[((u'0', u'std'), 56), ((u'1', u'std'), 47), ((u'2', u'std'), 30), ((u'3', u'std'), 19), ((u'-1', u'std'), 14), ((u'0', u'turbo'), 11), ((u'-1', u'turbo'), 8), ((u'3', u'turbo'), 8), ((u'1', u'turbo'), 7), ((u'2', u'turbo'), 2), ((u'-2', u'std'), 2), ((u'-2', u'turbo'), 1)]


In [5]:
# Split the aggregated counts by engine type (second element of the key).
# cache() returns the same RDD, so it is chained directly onto each filter.
stdRiskAndEngine = groupByRiskAndEngine.filter(lambda kv: kv[0][1] == 'std').cache()
turboRiskAndEngine = groupByRiskAndEngine.filter(lambda kv: kv[0][1] == 'turbo').cache()

# Each subset is shown ordered by count, largest first.
stdByCountDescending = stdRiskAndEngine.sortBy(lambda kv: int(kv[1]), ascending=False).collect()
print('Sorted Risk for Standard Engines: \n%s\n' % (stdByCountDescending,))

turboByCountDescending = turboRiskAndEngine.sortBy(lambda kv: int(kv[1]), ascending=False).collect()
print('Sorted Risk for Turbo Engines: \n%s' % (turboByCountDescending,))


Sorted Risk for Standard Engines: 
[((u'0', u'std'), 56), ((u'1', u'std'), 47), ((u'2', u'std'), 30), ((u'3', u'std'), 19), ((u'-1', u'std'), 14), ((u'-2', u'std'), 2)]

Sorted Risk for Turbo Engines: 
[((u'0', u'turbo'), 11), ((u'-1', u'turbo'), 8), ((u'3', u'turbo'), 8), ((u'1', u'turbo'), 7), ((u'2', u'turbo'), 2), ((u'-2', u'turbo'), 1)]


In [6]:
def _riskSumAndCount(riskAndEngine):
    """Return (risk total, record total) for an RDD of ((risk, engine), count) pairs.

    The risk total is the sum of int(risk) * count over all pairs; the record
    total is the sum of the counts.
    """
    riskTotal = riskAndEngine.map(lambda kv: int(kv[0][0]) * kv[1]).reduce(lambda a, b: a + b)
    recordTotal = riskAndEngine.map(lambda kv: int(kv[1])).reduce(lambda a, b: a + b)
    return riskTotal, recordTotal


# Cumulative risk and count-weighted average risk for each engine type.
stdRisk, stdCount = _riskSumAndCount(stdRiskAndEngine)
stdRiskWeighted = float(stdRisk) / float(stdCount)

turboRisk, turboCount = _riskSumAndCount(turboRiskAndEngine)
turboRiskWeighted = float(turboRisk) / float(turboCount)

print('The std cummulative risk is: %d and the turbo cummulative risk is: %d' % (stdRisk, turboRisk))
print('The std weighted risk is (%d): %f and the turbo weighted risk is(%d): %f' % (stdCount, stdRiskWeighted, turboCount, turboRiskWeighted))

The std cummulative risk is: 146 and the turbo cummulative risk is: 25
The std weighted risk is (168): 0.869048 and the turbo weighted risk is(37): 0.675676


## Goal 2 - Evaluate Cross Val Model

In [7]:
import cPickle
import json
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from score_auto_gbm.FeatureTransformer import FeatureTransformer

In [14]:
def is_number(s):
    """Report whether ``s`` can be converted by ``float()``.

    Args:
        s: Any object, typically a string or numeric value read from the data.

    Returns:
        True when ``float(s)`` succeeds, False otherwise.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        # ValueError: unparseable text such as '?'.
        # TypeError: objects float() cannot accept at all, such as None or a list.
        return False
    return True

In [8]:
# Restore the previously fitted model object from its pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
with open("./score_auto_gbm/gbmFit.pkl", "rb") as modelFile:
    gbmFit = cPickle.load(modelFile)

In [20]:
# Labels for the 26 columns, in file order. The file is read with
# header=None, so pandas numbers the columns and they are renamed here.
columnNames = [
    u'risk', u'normalizedLosses', u'make', u'fuelType', u'aspiration',
    u'numDoors', u'bodyStyle', u'driveWheels', u'engineLocation',
    u'wheelBase', u'length', u'width', u'height', u'curbWeight',
    u'engineType', u'numCylinders', u'engineSize', u'fuelSystem', u'bore',
    u'stroke', u'compressionRatio', u'horsepower', u'peakRPM', u'cityMPG',
    u'highwayMPG', u'price',
]

data = pd.read_csv('../data/imports-85.data', header=None)
data.columns = columnNames
data.head()

Unnamed: 0,risk,normalizedLosses,make,fuelType,aspiration,numDoors,bodyStyle,driveWheels,engineLocation,wheelBase,...,engineSize,fuelSystem,bore,stroke,compressionRatio,horsepower,peakRPM,cityMPG,highwayMPG,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [21]:
# Keep every row but drop the first two columns (risk, normalizedLosses) by
# position, leaving `make` onward.
# .iloc is used instead of .ix, which is deprecated and was removed in pandas 1.0.
trainData = data.iloc[:, 2:]

trainData.head()

Unnamed: 0,make,fuelType,aspiration,numDoors,bodyStyle,driveWheels,engineLocation,wheelBase,length,width,...,engineSize,fuelSystem,bore,stroke,compressionRatio,horsepower,peakRPM,cityMPG,highwayMPG,price
0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [37]:
# Score the loaded model one row at a time against the known risk rating
# (column 0 of `data`) and accumulate the squared error.
squareError = 0.0
scoredCount = 0  # rows that actually contributed to squareError

for idx, row in enumerate(trainData.to_dict(orient='records')):
    try:
        prediction = list(gbmFit.predict(pd.DataFrame([row])))[0]
    except ValueError:
        # The model could not score this row; leave it out of the evaluation.
        continue

    # Positional lookup; .iloc replaces the deprecated .ix indexer.
    actual = data.iloc[idx, 0]
    if is_number(actual) and is_number(prediction):
        squareError += (float(prediction) - float(actual)) ** 2
        scoredCount += 1

# Divide by the number of rows that were scored, not by the total row count:
# skipped rows add nothing to squareError, so counting them understates RMSE.
number = scoredCount
rmse = (squareError / number) ** 0.5 if number else float('nan')

print ('The number of instances was: %d.\n The squaured error was: %f.\n RMSE: %f.' % (number,squareError,rmse))


The number of instances was: 205.
 The squaured error was: 10.487928.
 RMSE: 0.226187.


In [35]:
# Number of rows in trainData.
len(trainData.index)

205

In [7]:
# Show the first five rows as a list of {column: value} dicts.
trainData.iloc[:5].to_dict(orient='records')

[{u'aspiration': '?',
  u'bodyStyle': 'alfa-romero',
  u'bore': 'gas',
  u'cityMPG': 'std',
  u'compressionRatio': 'two',
  u'curbWeight': 'convertible',
  u'driveWheels': 'rwd',
  u'engineLocation': 'front',
  u'engineSize': 88.6,
  u'engineType': 168.8,
  u'fuelSystem': 64.1,
  u'fuelType': 48.8,
  u'height': 2548L,
  u'highwayMPG': 'dohc',
  u'horsepower': 'four',
  u'length': 130L,
  u'make': 'mpfi',
  u'numCylinders': '3.47',
  u'numDoors': '2.68',
  u'peakRPM': 9.0,
  u'price': '111',
  u'risk': '5000',
  u'stroke': 21L,
  u'wheelBase': 27L,
  u'width': '13495'},
 {u'aspiration': '?',
  u'bodyStyle': 'alfa-romero',
  u'bore': 'gas',
  u'cityMPG': 'std',
  u'compressionRatio': 'two',
  u'curbWeight': 'convertible',
  u'driveWheels': 'rwd',
  u'engineLocation': 'front',
  u'engineSize': 88.6,
  u'engineType': 168.8,
  u'fuelSystem': 64.1,
  u'fuelType': 48.8,
  u'height': 2548L,
  u'highwayMPG': 'dohc',
  u'horsepower': 'four',
  u'length': 130L,
  u'make': 'mpfi',
  u'numCylinders

## Goal 3 - Use munging & math to evaluate how good RMSE was