In [34]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn.linear_model  as sk  # namespace conflict with pyspark
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import scale
from sklearn.model_selection import KFold , cross_val_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
from helper import *
import tensorflow as tf

pd.options.display.max_columns = None
import findspark
import pyspark as pk
from pyspark.sql import SQLContext
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD
from pyspark.sql.functions import col

from pyspark.ml.linalg import VectorUDT
from pyspark.sql.functions import udf
from pyspark.ml.evaluation import RegressionEvaluator

from keras.layers import Dense
from keras.models import Model, Sequential
from keras import initializers, optimizers
from keras import backend as K

from math import sqrt

# Stuff that needs to be done

- Find another framework.
    - Sklearn
    - Spark
    - Tensorflow/R
- Algorithms
    - Linear Regression
    - Logistic Regression
    - Maybe SVM /Random forest
- Find two metrics 
    - RMSE 
    - Some other
- Find another dataset 
    - Sum data with noise 
    -  Maybe housing data set
    

## Import DataSet

In [4]:
sumdata_url = "https://www.dropbox.com/sh/euppz607r6gsen2/AABABUTdx7YqCeBquA1Ky7z8a/The%20SUM%20dataset?dl=1#"
housing_price_url = "https://www.dropbox.com/sh/euppz607r6gsen2/AAAVLZzU4E7ro0BiRzPG3pP8a/House%20Prices?dl=1"
all_urls = [sumdata_url, housing_price_url]

In [5]:
get_data(all_urls) # retrieves the data if there is no data folder

In [40]:
sumdata_noise_path = "data/with noise/The SUM dataset, with noise.csv"
sumdata_path = "data/without noise/The SUM dataset, without noise.csv"
housing_price_path = "data/housing dataset.csv" # has more than 30 features
# need one more
# what a brilliant idea to name files with space

## Sum Data set size 100,000

In [41]:
SUM_DATA_SIZE = 100000

## Load datasets sum_noise

In [42]:
sumdata_noise = pd.read_csv(sumdata_noise_path, delimiter=";")
sumdata_noise = sumdata_noise.head(n=SUM_DATA_SIZE)
sumdata_noise.head(n=1)

Unnamed: 0,Instance,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5 (meaningless but please still use it),Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Noisy Target,Noisy Target Class
0,1,62485,58472,84200,86181,75529,136939,150633,230058,246491,257336,1352179,Very Large Number


## Preprocess sum_noise dataset

- Remove 'Instance' as it simply represents the row number
- Extract 'Nosiy Target' as regression target
- Extract 'Nosiy Class' as classification target
- Extract rest columns as explananatory variables
- Apply Feature Scaling to the dataset 

- Ensure all dataframe has been converted to numpy array


### Create same base line for Spark, Keras, Sklearn implementation
- Using the same feature scaled data for all implementation
- Same number iterations
- Same cost function for minimising(for gradient descent)

In [43]:
# Remove 'Instance' as it simply represents the row number

from sklearn import preprocessing
le = preprocessing.LabelEncoder()

sumdata_noise.drop('Instance', axis = 1)

# Extract 'Nosiy Target' as regression target
sumdata_noise_reg_Y = sumdata_noise.loc[:, ['Noisy Target']]
# Extract 'Nosiy Target Class' as regression target
sumdata_noise_classif_Y = sumdata_noise.loc[:, ['Noisy Target Class']] 
 
 
 
sumdata_noise_classif_Y ['Noisy Target Class'] = le.fit_transform( sumdata_noise_classif_Y ['Noisy Target Class'].astype('str')) 
 
# Extract rest columns as explananatory variables
sumdata_noise_X =sumdata_noise.drop(['Instance','Noisy Target Class','Noisy Target'
                                    ], axis = 1) 
# our SPARK NEEDS target column to be named "label"
sumdata_noise_reg_Y.rename(columns={'Noisy Target': 'label'}, inplace=True)
sumdata_noise_classif_Y.rename(columns={'Noisy Target Class': 'label'}, inplace=True)



### Feature scaling
- Keeping them as dataframes (transform returns ndarray)

In [44]:
scX = StandardScaler()
scY = StandardScaler()
columns = sumdata_noise_X.columns

sumdata_noise_X[columns] =  scX.fit_transform(sumdata_noise_X)
columns = sumdata_noise_reg_Y.columns
sumdata_noise_reg_Y[columns] = scX.fit_transform(sumdata_noise_reg_Y)


sumdata_noise_scaled = pd.concat([sumdata_noise_X,sumdata_noise_reg_Y])

### Linear Regression using pyspark

In [14]:
sc = pyspark.SparkContext()
sqlContext = SQLContext(sc)


In [48]:
def spark_linear_regression( X_train, X_test, y_train, y_test):
    """
        this uses the dataframe api
    """
    from pyspark.ml import Pipeline  
    X_train.reset_index(drop=True, inplace=True)
    y_train.reset_index(drop=True, inplace=True)
    
    X_test.reset_index(drop=True, inplace=True)
    y_test.reset_index(drop=True, inplace=True)
    
    train_data = pd.concat([X_train,y_train], axis =1)
    test_data  = pd.concat([X_test,y_test], axis =1)
    
 
    train_data = sqlContext.createDataFrame(train_data)
    test_data = sqlContext.createDataFrame(test_data) 
   
   
    assembler = (VectorAssembler()
        .setInputCols(train_data.columns[:-1]) # everything excluding target
        .setOutputCol("features"))
 
    lr = pk.ml.regression.LinearRegression()

    pipeline = Pipeline(stages=[assembler, lr])
    model = pipeline.fit(train_data)
    
    
    predictions = model.transform(test_data) 
    evaluator = RegressionEvaluator(metricName="rmse")
    RMSE = evaluator.evaluate(predictions)
    return RMSE
    

In [29]:
def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1)) 
def ols_error(y_true, y_pred):
    return K.sum(K.square(y_pred - y_true), axis=-1)

def keras_linear_regression_sgd(X_train, X_test, y_train, y_test):
    """
        learning rate 0.001
        epochs 1
        batch size 1
        root mean squared error for evaluating testdata
        objective function is ordinary least squared error
    """
    X_train = X_train.as_matrix()
    X_test = X_test.as_matrix()
    y_train = y_train.as_matrix()
    y_test = y_test.as_matrix()
    
    cols = X_train.shape[1] # no of training examples
    
    model = Sequential([
        Dense(1, activation='linear', input_dim=cols)
    ])


    sgd=optimizers.SGD(lr=0.001)
    model.compile(optimizer=sgd ,loss=root_mean_squared_error)
    model.fit(X_train,y_train, epochs=1, shuffle=False, verbose=0)
    preds = model.predict( X_test, verbose=0)
    
    #score = model.evaluate(X_test, y_test,verbose=0)

    return mean_squared_error(y_test, preds)**0.5

### Things to test out
- epoch and batches in sklearn sgd vs keras sgd

In [30]:
def sklearn_linear_regression_sgd( X_train, X_test, y_train, y_test): 
    """
        learning rate 0.001
        epochs ?
        batch size ?
        root mean squared error for evaluating testdata
       
    """
    clf = sk.SGDRegressor(learning_rate='constant', eta0=0.001)
    model = clf.fit(X_train, y_train)
    preds = model.predict(X_test)
    return mean_squared_error(y_test, preds)**0.5

In [31]:
def model_test_cross( data_x, data_y, dataset_name, algorithm): 
    algorithm_n = algorithm.__name__
    errors = np.zeros((1, 10)) #  
    for chunk in data_chunks:
        # if chunk is greater than the no. of examples
        if chunk > sumdata_noise.shape[0]: 
            break

        X = data_x.head(n = chunk)
        y = data_y.head(n = chunk) 
        
        kf = KFold(n_splits=10)
        
        
        for (i, (train_index, test_index)) in enumerate(kf.split(X)):
            
            
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            # accumulating errors for each fold
            errors[0][i] = algorithm( X_train, X_test, y_train, y_test) 
        
        # calculates mean across rows
      
        mean_error = np.mean(errors, axis=0)
         
        
        print ("Algorithm: {}  Data set: {} Chunk Size: {} Error: {}".format(algorithm_n,
                                                                             dataset_name,
                                                                             chunk, mean_error[0]))
        

In [23]:
model_test_cross( sumdata_noise_X, sumdata_noise_reg_Y, "Sum data with noise", sklearn_linear_regression_sgd)

  y = column_or_1d(y, warn=True)


Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 100 Error: 0.0820319460064361
Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 500 Error: 0.12281878960246315
Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 1000 Error: 0.12490249652357245
Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 5000 Error: 0.12262182845357127
Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 10000 Error: 0.11845299163109775
Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 50000 Error: 0.11913036630681935
Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 100000 Error: 0.12217924257008063


In [49]:
model_test_cross( sumdata_noise_X, sumdata_noise_reg_Y, "Sum data with noise", spark_linear_regression)

Algorithm: spark_linear_regression  Data set: Sum data with noise Chunk Size: 100 Error: 0.05459002474018433
Algorithm: spark_linear_regression  Data set: Sum data with noise Chunk Size: 500 Error: 0.12149470663180127
Algorithm: spark_linear_regression  Data set: Sum data with noise Chunk Size: 1000 Error: 0.12265910595453326
Algorithm: spark_linear_regression  Data set: Sum data with noise Chunk Size: 5000 Error: 0.12148908721999893
Algorithm: spark_linear_regression  Data set: Sum data with noise Chunk Size: 10000 Error: 0.11830148154531654
Algorithm: spark_linear_regression  Data set: Sum data with noise Chunk Size: 50000 Error: 0.11904218848285923
Algorithm: spark_linear_regression  Data set: Sum data with noise Chunk Size: 100000 Error: 0.11972228305364346


In [66]:
model_test_cross( sumdata_noise_X, sumdata_noise_reg_Y, "Sum data with noise", keras_linear_regression_sgd)

Algorithm: keras_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 100 Error: 0.6988921384122013
Algorithm: keras_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 500 Error: 2.1474811607486948
Algorithm: keras_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 1000 Error: 0.46595074518216867
Algorithm: keras_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 5000 Error: 0.28701945681251456
Algorithm: keras_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 10000 Error: 0.42347089941847105
Algorithm: keras_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 50000 Error: 0.32804712913819417


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

### 70/30 split 

In [17]:
def model_test_split( data_x, data_y, dataset_name, algorithm): 
    algorithm_n = algorithm.__name__
  
    for chunk in data_chunks:
        # if chunk is greater than the no. of examples
        if chunk > sumdata_noise.shape[0]: 
            break

        X = data_x.head(n = chunk)
        y = data_y.head(n = chunk) 
        
        kf = KFold(n_splits=10)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
        
       
            # accumulating errors for each fold
        error = algorithm( X_train, X_test, y_train, y_test) 
        
        print ("Algorithm: {}  Data set: {} Chunk Size: {} Error: {}".format(algorithm_n,
                                                                             dataset_name,
                                                                             chunk, error))
        

In [18]:
model_test_split( sumdata_noise_X, sumdata_noise_reg_Y, "Sum data with noise", sklearn_linear_regression_sgd)

  y = column_or_1d(y, warn=True)


Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 100 Error: 0.12851116699238616
Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 500 Error: 0.11938306709459479
Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 1000 Error: 0.1053070868693961
Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 5000 Error: 0.08468309994156151
Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 10000 Error: 0.06733267756643017
Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 50000 Error: 0.012541256149980499
Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 100000 Error: 0.0020504251684084897
Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 500000 Error: 0.0008324379449694423
