In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import scale
import pandas as pd
from helper import *
import tensorflow as tf

pd.options.display.max_columns = None
import findspark
import pyspark
from pyspark.sql import SQLContext
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql.functions import col


## Import DataSet

In [3]:
sumdata_url = "https://www.dropbox.com/sh/euppz607r6gsen2/AABABUTdx7YqCeBquA1Ky7z8a/The%20SUM%20dataset?dl=1#"
housing_price_url = "https://www.dropbox.com/sh/euppz607r6gsen2/AAAVLZzU4E7ro0BiRzPG3pP8a/House%20Prices?dl=1"
all_urls = [sumdata_url, housing_price_url]

In [4]:
get_data(all_urls) # retrieves the data if there is no data folder

In [5]:
sumdata_noise_path = "data/with noise/The SUM dataset, with noise.csv"
sumdata_path = "data/without noise/The SUM dataset, without noise.csv"
housing_price_path = "data/housing dataset.csv" # has more than 30 features
# need one more
# what a brilliant idea to name files with space

In [6]:
data_chunks = [100, 500, 1000, 5000, 10000, 50000, 100000, 500000,
1000000, 5000000, 10000000, 50000000, 100000000]

## Load datasets sum_noise

In [7]:
sumdata_noise = pd.read_csv(sumdata_noise_path, delimiter=";")
sumdata_noise.head(n=1)

Unnamed: 0,Instance,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5 (meaningless),Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Noisy Target,Noisy Target Class
0,1,66957,74432,96087,103120,64272,150633,181787,180349,216912,304071,1434819,Very Large Number


## Preprocess sum_noise dataset

- Remove 'Instance' as it simply represents the row number
- Extract 'Nosiy Target' as regression target
- Extract 'Nosiy Class' as classification target
- Extract rest columns as explananatory variables
- Apply Feature Scaling to the dataset 

- Ensure all dataframe has been converted to numpy array


In [9]:
# Remove 'Instance' as it simply represents the row number
sumdata_noise.drop('Instance', axis = 1)

# Extract 'Nosiy Target' as regression target
sumdata_noise_reg_Y = sumdata_noise['Noisy Target']

sumdata_noise_reg_Y = sumdata_noise_reg_Y.values.reshape(sumdata_noise_reg_Y.shape[0],1)
# Extract 'Nosiy Target Class' as regression target
sumdata_noise_classif_Y = sumdata_noise['Noisy Target Class']

# Extract rest columns as explananatory variables
sumdata_noise_X = sumdata_noise.iloc[:, 1:-2].as_matrix()

smalldata = sumdata_noise.drop('Noisy Target Class', axis=1)[:100]
smalldata.head()

Unnamed: 0,Instance,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5 (meaningless),Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Noisy Target
0,1,66957,74432,96087,103120,64272,150633,181787,180349,216912,304071,1434819
1,2,96030,86875,108299,148025,16965,253819,258672,268851,404599,543092,2148748
2,3,26212,23398,27668,39678,23062,65873,65660,68508,82617,115418,476405
3,4,28363,33381,42447,35270,8980,52885,79144,85741,86806,147368,635169
4,5,38960,50255,79879,91885,64037,127193,115760,174069,184805,250659,1221471


In [10]:
sc = pyspark.SparkContext()
sqlContext = SQLContext(sc)


In [11]:
# converting pandas -> spark dataframe
 

#data_df = sqlContext.createDataFrame(sumdata_noise.drop('Noisy Target Class', axis=1))
data_df = sqlContext.createDataFrame(smalldata)

In [12]:
input_features = list(sumdata_noise.columns)[:-2]
ouput_label = list(sumdata_noise.columns)[-2]

##### http://www.techpoweredmath.com/spark-dataframes-mllib-tutorial/

In [14]:
data = data_df.select(input_features)\
  .rdd\
  .map(lambda line:LabeledPoint(line[-1],line[1:-1]))\
  .toDF()

In [15]:
from pyspark.ml.linalg import VectorUDT
from pyspark.sql.functions import udf

as_ml = udf(lambda v: v.asML() if v is not None else None, VectorUDT())
result = data.withColumn("features", as_ml("features"))
result

DataFrame[features: vector, label: double]

In [19]:
lr = LinearRegression()

# Fit 2 models, using different regularization parameters
modelA = lr.fit(result, {lr.regParam:0.0})
modelB = lr.fit(result, {lr.regParam:100.0})

In [20]:
predictionsA = modelA.transform(result)

In [21]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(metricName="rmse")
RMSE = evaluator.evaluate(predictionsA)
print("ModelA: Root Mean Squared Error = " + str(RMSE))

ModelA: Root Mean Squared Error = 32485.121362949645


In [23]:
predictionsB = modelB.transform(result)
RMSE = evaluator.evaluate(predictionsB)
print("ModelB: Root Mean Squared Error = " + str(RMSE))

ModelB: Root Mean Squared Error = 32486.176304824425
