In [3]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn  as sk  # namespace conflict with pyspark
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import scale
from sklearn.model_selection import KFold , cross_val_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
from helper import *
import tensorflow as tf

pd.options.display.max_columns = None
import findspark
import pyspark as pk
from pyspark.sql import SQLContext
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD
from pyspark.sql.functions import col

from pyspark.ml.linalg import VectorUDT
from pyspark.sql.functions import udf
from pyspark.ml.evaluation import RegressionEvaluator

from keras.layers import Dense
from keras.models import Model, Sequential
from keras import initializers, optimizers
from keras import backend as K

from math import sqrt

Using TensorFlow backend.


### Frameworks used

- Keras
- Spark
- R
- Sklearn
 
 
### Find another dataset  
- Maybe housing data set
    

## Import DataSet

In [4]:
# Dropbox share links for the two datasets used in this notebook.
sumdata_url = "https://www.dropbox.com/sh/euppz607r6gsen2/AABABUTdx7YqCeBquA1Ky7z8a/The%20SUM%20dataset?dl=1#"
housing_url = "https://www.dropbox.com/sh/euppz607r6gsen2/AAD6JGlvG5XADIjg9SCojvpya/House%20Sales%20in%20King%20County%2C%20USA?dl=1&preview=kc_house_data.csv"
# Both links are handed to get_data() as a single list.
all_urls = [sumdata_url,housing_url]

In [5]:
# NOTE(review): get_data is not defined in this notebook; presumably it is
# provided by `from helper import *` -- confirm where the files are written.
get_data(all_urls) # retrieves the data if there is no data folder

In [7]:
# Relative paths of the CSV files read by the loading cells below.
sumdata_noise_path = "data/with noise/The SUM dataset, with noise.csv" 
housing_price_path ="data/kc_house_data.csv" # NOTE(review): this comment used to say "more than 30 features", but the preview of this file shows 21 columns


## Sum Data restricted to 100,000


In [8]:
# Number of leading rows of the noisy SUM dataset kept for the experiments.
SUM_DATA_SIZE = 100000

## Housing restricted to 10,000

In [13]:
# Number of leading rows of the housing dataset kept for the experiments.
HOUSE_DATA_SIZE =10000

## Load datasets sum_noise

In [14]:
# Read only the first SUM_DATA_SIZE rows instead of parsing the whole file and
# truncating afterwards; the resulting frame is the same as read + head(n).
sumdata_noise = pd.read_csv(sumdata_noise_path, delimiter=";", nrows=SUM_DATA_SIZE)
sumdata_noise.head(n=1)

Unnamed: 0,Instance,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5 (meaningless but please still use it),Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Noisy Target,Noisy Target Class
0,1,62485,58472,84200,86181,75529,136939,150633,230058,246491,257336,1352179,Very Large Number


## Preprocess sum_noise dataset

- Remove 'Instance' as it simply represents the row number
- Extract 'Noisy Target' as the regression target
- Extract 'Noisy Target Class' as the classification target
- Extract the remaining columns as explanatory variables
- Apply feature scaling to the dataset

- Ensure all dataframes have been converted to numpy arrays


### Create the same baseline for the Spark, Keras and Sklearn implementations
- Use the same feature-scaled data for all implementations
- Same number of iterations
- Same cost function to minimise (for gradient descent)

In [15]:
# Split the noisy SUM dataset into a feature frame, a regression target and a
# label-encoded classification target.

from sklearn import preprocessing
le = preprocessing.LabelEncoder()

# 'Instance' simply repeats the row number. It is removed together with the two
# target columns when the feature frame is built below; the former bare
# `sumdata_noise.drop('Instance', axis = 1)` discarded its result and did nothing.

# Extract 'Noisy Target' as regression target (.copy() so that later edits can
# never write through to sumdata_noise)
sumdata_noise_reg_Y = sumdata_noise.loc[:, ['Noisy Target']].copy()
# Extract 'Noisy Target Class' as classification target
sumdata_noise_classif_Y = sumdata_noise.loc[:, ['Noisy Target Class']].copy()

# Encode the class names as integer codes
sumdata_noise_classif_Y['Noisy Target Class'] = le.fit_transform(
    sumdata_noise_classif_Y['Noisy Target Class'].astype('str'))

# Extract the remaining columns as explanatory variables
sumdata_noise_X = sumdata_noise.drop(['Instance', 'Noisy Target Class', 'Noisy Target'], axis=1)

# The Spark estimator and evaluator used later rely on their default label
# column, so the target column must be named "label"
sumdata_noise_reg_Y = sumdata_noise_reg_Y.rename(columns={'Noisy Target': 'label'})
sumdata_noise_classif_Y = sumdata_noise_classif_Y.rename(columns={'Noisy Target Class': 'label'})



### Feature scaling
- Keeping them as dataframes (transform returns ndarray)

In [16]:
# Standardise features and regression target with SEPARATE scalers, so that scX
# keeps the feature statistics and scY keeps the target statistics (needed to
# map predictions back to the original scale with scY.inverse_transform).
scX = StandardScaler()
scY = StandardScaler()

# Assigning back through the column list keeps the results as DataFrames
# (fit_transform itself returns a plain ndarray).
columns = sumdata_noise_X.columns
sumdata_noise_X[columns] = scX.fit_transform(sumdata_noise_X)

columns = sumdata_noise_reg_Y.columns
sumdata_noise_reg_Y[columns] = scY.fit_transform(sumdata_noise_reg_Y)

# axis=1 puts the target next to the features; the default axis=0 would stack
# the target rows underneath the feature rows and fill the gaps with NaN.
sumdata_noise_scaled = pd.concat([sumdata_noise_X, sumdata_noise_reg_Y], axis=1)

## Load House Price dataset

In [17]:
# Read only the first HOUSE_DATA_SIZE rows instead of parsing the whole file and
# truncating afterwards; the resulting frame is the same as read + head(n).
housing_price = pd.read_csv(housing_price_path, delimiter=",", nrows=HOUSE_DATA_SIZE)
housing_price.head(n=1)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650


## Preprocess housing price dataset

- Remove 'id' as it is only a record identifier
- Use 'price' as the regression target
- Use 'SaleCondition' as the classification target (note: no such column appears in the kc_house_data preview above — TODO: pick a classification target)
- For explanatory variables, use the following numerical and categorical variables
    - Numerical
        - bedrooms
        - bathrooms
        - sqft_living
        - sqft_lot
        - condition
        - sqft_above
        - yr_built
        - sqft_living15
        - sqft_lot15
- Apply Feature Scaling to the dataset 

- Ensure all dataframes have been converted to numpy arrays

In [20]:
# Use 'price' as regression target (.copy() so scaling never writes into housing_price)
housing_price_reg_Y = housing_price.loc[:, ['price']].copy()

# Use following features to predict price
features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'condition', 'sqft_above', 'yr_built',
                      'sqft_living15', 'sqft_lot15']

# Independent copy of the selected columns: assigning scaled values into a
# slice of housing_price is what triggers pandas' SettingWithCopy problems.
filtered_table = housing_price[features].copy()

# Apply feature scaling to the explanatory variables and to the regression
# target, each with its own scaler so both sets of statistics stay available.
scX = StandardScaler()
scY = StandardScaler()

housing_price_reg_X = filtered_table
columns = filtered_table.columns
housing_price_reg_X[columns] = scX.fit_transform(housing_price_reg_X)
columns = housing_price_reg_Y.columns
housing_price_reg_Y[columns] = scY.fit_transform(housing_price_reg_Y)

KeyError: "None of [['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'condition', 'sqft_above', 'yr_built', 'sqft_living15', 'sqft_lot15']] are in the [index]"

### Linear Regression using pyspark

In [14]:
# pyspark is imported as `pk` in this notebook, so the bare name `pyspark` is
# not bound here. getOrCreate() reuses a running context, which keeps this cell
# re-runnable (a second SparkContext() in the same kernel raises an error).
sc = pk.SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [97]:
def spark_linear_regression( X_train, X_test, y_train, y_test):
    """
    Fit a Spark ML linear regression on the training split and score the test split.

        learning rate: cant configure it
        epochs: 10 (maxiter)
        root mean squared error and mean absolute error for evaluating testdata

    NOTE(review): the earlier notes also listed "batch size 1"; nothing in this
    function sets a batch size.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Explanatory variables; every column is used as a feature.
    y_train, y_test : pandas.DataFrame or pandas.Series
        Single regression target. Its values are stored under the column name
        "label", which the estimator and the evaluators use by default.

    Returns
    -------
    (RMSE, MAE) evaluated on the test split.
    """
    from pyspark.ml import Pipeline

    feature_columns = list(X_train.columns)

    def _to_spark_frame(X, y):
        # Build a fresh pandas frame instead of calling reset_index(inplace=True)
        # on the caller's objects, so the inputs are left untouched.
        features = X.reset_index(drop=True)
        label = pd.DataFrame({"label": np.ravel(y)})
        return sqlContext.createDataFrame(pd.concat([features, label], axis=1))

    train_data = _to_spark_frame(X_train, y_train)
    test_data = _to_spark_frame(X_test, y_test)

    assembler = (VectorAssembler()
        .setInputCols(feature_columns) # everything excluding target
        .setOutputCol("features"))

    lr = pk.ml.regression.LinearRegression(maxIter=10)

    pipeline = Pipeline(stages=[assembler, lr])
    model = pipeline.fit(train_data)

    predictions = model.transform(test_data)

    RMSE = RegressionEvaluator(metricName="rmse").evaluate(predictions)
    MAE = RegressionEvaluator(metricName="mae").evaluate(predictions)

    return RMSE,MAE
    

In [112]:
def root_mean_squared_error(y_true, y_pred):
    """Keras-backend RMSE: square root of the mean squared residual over the last axis."""
    squared_residuals = K.square(y_pred - y_true)
    mean_squared = K.mean(squared_residuals, axis=-1)
    return K.sqrt(mean_squared)
def ols_error(y_true, y_pred):
    """Keras-backend least-squares loss: sum of squared residuals over the last axis."""
    squared_residuals = K.square(y_pred - y_true)
    return K.sum(squared_residuals, axis=-1)

def keras_linear_regression(X_train, X_test, y_train, y_test):
    """
    Fit a single-unit linear Keras model with SGD and score it on the test split.

        learning rate 0.001
        epochs 10
        objective function is ordinary least squared error (ols_error)
        root mean squared error and mean absolute error for evaluating testdata

    NOTE(review): the earlier notes listed "batch size 1", but no batch_size is
    passed to fit(), so the Keras default is used -- confirm which is intended.

    Parameters
    ----------
    X_train, X_test : array-like (e.g. pandas.DataFrame)
        Explanatory variables.
    y_train, y_test : array-like
        Regression target.

    Returns
    -------
    (rmse, mae) evaluated on the test split.
    """
    # np.asarray replaces DataFrame.as_matrix(), which newer pandas releases no
    # longer provide; it also accepts inputs that are already ndarrays.
    X_train = np.asarray(X_train)
    X_test = np.asarray(X_test)
    y_train = np.asarray(y_train)
    y_test = np.asarray(y_test)

    cols = X_train.shape[1] # number of input features

    model = Sequential([
        Dense(1, activation='linear', input_dim=cols)
    ])

    sgd=optimizers.SGD(lr=0.001)
    model.compile(optimizer=sgd ,loss=ols_error)
    model.fit(X_train,y_train, epochs=10, shuffle=False, verbose=0)
    preds = model.predict( X_test, verbose=0)

    mae =  sk.metrics.mean_absolute_error(y_test, preds)
    rmse = sk.metrics.mean_squared_error(y_test, preds)**0.5

    return (rmse,mae)

In [105]:
def sklearn_linear_regression( X_train, X_test, y_train, y_test): 
    """
    Fit scikit-learn's SGDRegressor and score it on the test split.

        learning rate 0.001 (constant)
        epochs 10
        batch size 1
        objective function is ordinary least squared error
        root mean squared error and mean absolute error for evaluating testdata

    Parameters
    ----------
    X_train, X_test : array-like
        Explanatory variables.
    y_train, y_test : array-like
        Regression target; a single-column frame is flattened before fitting.

    Returns
    -------
    (rmse, mae) evaluated on the test split.
    """
    # Local import so the estimator is available even if `import sklearn`
    # has not loaded the linear_model submodule.
    from sklearn.linear_model import SGDRegressor

    # max_iter replaces the removed n_iter argument; tol=None disables early
    # stopping so that exactly 10 passes over the data are made.
    clf = SGDRegressor(learning_rate='constant', eta0=0.001, max_iter=10, tol=None)
    # The estimator wants a 1-D target; passing the (n, 1) column frame is what
    # produced the column_or_1d DataConversionWarning.
    model = clf.fit(X_train, np.ravel(y_train))
    preds = model.predict(X_test)
    mae =  sk.metrics.mean_absolute_error(y_test, preds)
    rmse = sk.metrics.mean_squared_error(y_test, preds)**0.5
    return (rmse,mae)

In [86]:
def model_cross_validation( X, y, dataset_name, kfolds,algorithm): 
    """
    Run k-fold cross-validation of `algorithm` and print the mean RMSE and MAE.

    Parameters
    ----------
    X, y : pandas.DataFrame
        Explanatory variables and target; rows are selected positionally (.iloc).
    dataset_name : str
        Label used in the printed report.
    kfolds : int
        Number of folds.
    algorithm : callable
        Called as algorithm(X_train, X_test, y_train, y_test) and expected to
        return (rmse, mae) for that fold.
    """
    # Report which implementation was evaluated, as model_test_split does;
    # fall back to repr() for callables without __name__ (e.g. functools.partial).
    algorithm_n = getattr(algorithm, "__name__", repr(algorithm))

    kf = KFold(n_splits=kfolds)
    rmse = np.zeros(kfolds)
    mae = np.zeros(kfolds)

    for i, (train_index, test_index) in enumerate(kf.split(X)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # one (rmse, mae) pair per fold
        # RMSE https://www.kaggle.com/wiki/RootMeanSquaredError
        rmse[i], mae[i] = algorithm( X_train, X_test, y_train, y_test)

    # average the per-fold errors
    print ("Algorithm: {}\nDataset: {}\nRMSE: {}\nMAE: {}\n".format( algorithm_n,
                                                                     dataset_name,
                                                                     rmse.mean(),
                                                                     mae.mean()))
        

In [87]:
# 10-fold cross-validation of the scikit-learn SGD implementation on the noisy SUM data.
model_cross_validation( sumdata_noise_X, sumdata_noise_reg_Y, "Sum data with noise",10 ,sklearn_linear_regression)

  y = column_or_1d(y, warn=True)


Dataset: Sum data with noise
RMSE: 0.12082316746374525
MAE: 0.09157470959364587



In [103]:
# 10-fold cross-validation of the Spark ML implementation on the noisy SUM data.
model_cross_validation( sumdata_noise_X, sumdata_noise_reg_Y, "Sum data with noise",10 , spark_linear_regression)

Dataset: Sum data with noise
RMSE: 0.11997793554549654
MAE: 0.089825622765171



In [104]:
# 10-fold cross-validation of the Keras implementation on the noisy SUM data.
model_cross_validation( sumdata_noise_X, sumdata_noise_reg_Y, "Sum data with noise",10, keras_linear_regression)

Dataset: Sum data with noise
RMSE: 0.12099053657889425
MAE: 0.09056510972566621



### 70/30 split 

In [109]:
def model_test_split( X, y, dataset_name, algorithm, test_size=0.30, random_state=None): 
    """
    Evaluate `algorithm` on a single random train/test split and print RMSE and MAE.

    Parameters
    ----------
    X, y : array-like
        Explanatory variables and target.
    dataset_name : str
        Label used in the printed report.
    algorithm : callable
        Called as algorithm(X_train, X_test, y_train, y_test) and expected to
        return (rmse, mae).
    test_size : float, optional
        Fraction of the rows held out for testing (default 0.30, i.e. a 70/30 split).
    random_state : int or None, optional
        Seed forwarded to train_test_split. The default None keeps the previous
        unseeded behaviour; pass an integer to make the split reproducible.
    """
    # fall back to repr() for callables without __name__ (e.g. functools.partial)
    algorithm_n = getattr(algorithm, "__name__", repr(algorithm))

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=test_size,
                                                        random_state=random_state)
    rmse,mae = algorithm( X_train, X_test, y_train, y_test)              
    print ("Algorithm: {}\nDataset: {}\nRMSE: {}\nMAE: {}\n".format( algorithm_n,
                                                                     dataset_name,
                                                                     rmse,
                                                                     mae))

In [110]:
# Single 70/30 split evaluation of the scikit-learn SGD implementation.
model_test_split( sumdata_noise_X, sumdata_noise_reg_Y, "Sum data with noise", sklearn_linear_regression)

Algorithm: sklearn_linear_regression
Dataset: Sum data with noise
RMSE: 0.12022280408086701
MAE: 0.09026104039063863



  y = column_or_1d(y, warn=True)


In [111]:
# Single 70/30 split evaluation of the Spark ML implementation.
model_test_split( sumdata_noise_X, sumdata_noise_reg_Y, "Sum data with noise", spark_linear_regression)

Algorithm: spark_linear_regression
Dataset: Sum data with noise
RMSE: 0.11993502599931348
MAE: 0.089765625839784



In [113]:
# Single 70/30 split evaluation of the Keras implementation.
model_test_split( sumdata_noise_X, sumdata_noise_reg_Y, "Sum data with noise", keras_linear_regression)

Algorithm: keras_linear_regression
Dataset: Sum data with noise
RMSE: 0.12412924843785682
MAE: 0.09206387844999342

