In [12]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn.linear_model  as sk  # namespace conflict with pyspark
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import scale
from sklearn.model_selection import KFold , cross_val_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
from helper import *
import tensorflow as tf

pd.options.display.max_columns = None
import findspark
import pyspark
from pyspark.sql import SQLContext
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD
from pyspark.sql.functions import col

from pyspark.ml.linalg import VectorUDT
from pyspark.sql.functions import udf
from pyspark.ml.evaluation import RegressionEvaluator

from keras.layers import Dense
from keras.models import Model, Sequential
from keras import initializers, optimizers
from keras import backend as K

# Stuff that needs to be done

- Find another framework.
    - Sklearn
    - Spark
    - Tensorflow/R
- Algorithms
    - Linear Regression
    - Logistic Regression
    - Maybe SVM /Random forest
- Find two metrics 
    - RMSE 
    - Some other
- Do 70/30 split training and 10 cross validation
    - only did cross validation
- Find another dataset 
    - Sum data with noise 
    -  Maybe housing data set
    

## Import DataSet

In [13]:
sumdata_url = "https://www.dropbox.com/sh/euppz607r6gsen2/AABABUTdx7YqCeBquA1Ky7z8a/The%20SUM%20dataset?dl=1#"
housing_price_url = "https://www.dropbox.com/sh/euppz607r6gsen2/AAAVLZzU4E7ro0BiRzPG3pP8a/House%20Prices?dl=1"
all_urls = [sumdata_url, housing_price_url]

In [14]:
get_data(all_urls) # retrieves the data if there is no data folder

In [15]:
sumdata_noise_path = "data/with noise/The SUM dataset, with noise.csv"
sumdata_path = "data/without noise/The SUM dataset, without noise.csv"
housing_price_path = "data/housing dataset.csv" # has more than 30 features
# need one more
# what a brilliant idea to name files with space

In [16]:
data_chunks = [100, 500, 1000, 5000, 10000, 50000, 100000, 500000,
1000000, 5000000, 10000000, 50000000, 100000000]

## Load datasets sum_noise

In [17]:
sumdata_noise = pd.read_csv(sumdata_noise_path, delimiter=";")
sumdata_noise.head(n=1)

Unnamed: 0,Instance,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5 (meaningless but please still use it),Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Noisy Target,Noisy Target Class
0,1,62485,58472,84200,86181,75529,136939,150633,230058,246491,257336,1352179,Very Large Number


## Preprocess sum_noise dataset

- Remove 'Instance' as it simply represents the row number
- Extract 'Nosiy Target' as regression target
- Extract 'Nosiy Class' as classification target
- Extract rest columns as explananatory variables
- Apply Feature Scaling to the dataset 

- Ensure all dataframe has been converted to numpy array


### Create same base line for Spark, Keras, Sklearn implementation
- Using the same feature scaled data for all implementation
- Same number iterations
- Same cost function for minimising(for gradient descent)

In [18]:
# Remove 'Instance' as it simply represents the row number



sumdata_noise.drop('Instance', axis = 1)

# Extract 'Nosiy Target' as regression target
sumdata_noise_reg_Y = sumdata_noise.loc[:, ['Noisy Target']]
# Extract 'Nosiy Target Class' as regression target
sumdata_noise_classif_Y = sumdata_noise.loc[:, ['Noisy Target Class']]

# Extract rest columns as explananatory variables
sumdata_noise_X =sumdata_noise.drop(['Instance','Noisy Target Class'
                                    ], axis = 1)


### Feature scaling
- Keeping them as dataframes (transform returns ndarray)

In [19]:
scX = StandardScaler()
scY = StandardScaler()
columns = sumdata_noise_X.columns

sumdata_noise_X[columns] = scX.fit_transform(sumdata_noise_X)
columns = sumdata_noise_reg_Y.columns
sumdata_noise_reg_Y[columns] = scX.fit_transform(sumdata_noise_reg_Y)


sumdata_noise_scaled = pd.concat([sumdata_noise_X,sumdata_noise_reg_Y])

### Linear Regression using pyspark

In [20]:
sc = pyspark.SparkContext()
sqlContext = SQLContext(sc)


In [21]:
def diff_sq(n):
    return (n[0] - n[1])**2

In [22]:
def sum_1(n,n2):
    return (n + n2)

In [57]:
def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1)) 
def ols_error(y_true, y_pred):
    return K.sum(K.square(y_pred - y_true), axis=-1)

def keras_linear_regression_sgd(X_train, X_test, y_train, y_test):
    """
        learning rate 0.001
        epochs 15
        batch size 1
    """
    X_train = X_train.as_matrix()
    X_test = X_test.as_matrix()
    y_train = y_train.as_matrix()
    y_test = y_test.as_matrix()
    
    cols = X_train.shape[1] # no of training examples
    
    model = Sequential([
        Dense(1, activation='linear', input_dim=cols)
    ])

    sgd = optimizers.SGD(lr=0.001)
    model.compile(loss=ols_error, optimizer=sgd)

    model.fit(X_train, y_train, epochs=15, batch_size=1,verbose=0) 
    score = model.evaluate(X_test, y_test)

    return score

### Things to test out
- epoch and batches in sklearn sgd vs keras sgd
- all the loss functions used for gradient descent make sure they are same

In [24]:
from math import sqrt
def spark_linear_regression_sgd( X_train, X_test, y_train, y_test):
    """
        This is using RDD API, which is going to be deprecated 
    
    """
    train_data = pd.concat([X_train,y_train])
    test_data  = pd.concat([X_test,y_test])
    
    # converting them into spark dataframe
    train_data = sqlContext.createDataFrame(train_data)
    test_data = sqlContext.createDataFrame(test_data) 
    
    # converting
    train_data = train_data.rdd\
      .map(lambda line:LabeledPoint(line[-1],line[1:-1]))
    
    test_data = test_data.rdd\
        .map(lambda line:LabeledPoint(line[-1],line[1:-1]))
    
    lr = LinearRegressionWithSGD()

    # Fit 2 models, using different regularization parameters
    model = lr.train(train_data,iterations=10, step = 0.001)
    
    # Evaluate the model on test data
    valuesAndPreds = train_data.map(lambda p: (p.label, model.predict(p.features))) 
     
    RMSE = sqrt(valuesAndPreds\
        .map(diff_sq)\
        .reduce(sum_1) / valuesAndPreds.count()
              )
  

    return RMSE

In [25]:
def spark_linear_regression( X_train, X_test, y_train, y_test):
    """
        this uses the dataframe api
    """
    train_data = pd.concat([X_train,y_train])
    test_data  = pd.concat([X_test,y_test])
    
    # converting them into spark dataframe
    train_data = sqlContext.createDataFrame(train_data)
    test_data = sqlContext.createDataFrame(test_data) 
    
    # converting
    train_data = train_data.rdd\
        .map(lambda line:LabeledPoint(line[-1],line[1:-1]))\
        .toDF()
    
    test_data = test_data.rdd\
        .map(lambda line:LabeledPoint(line[-1],line[1:-1]))\
        .toDF()
 
    
    # converting into vector type to make it suitable for ml library
    as_ml = udf(lambda v: v.asML() if v is not None else None, VectorUDT())
    train_data = train_data.withColumn("features", as_ml("features"))
    test_data  = test_data.withColumn("features", as_ml("features"))
    lr = LinearRegression( solver="normal")

    # Fit 2 models, using different regularization parameters
    modelA = lr.fit(train_data, {lr.regParam:0.0})
    modelB = lr.fit(train_data, {lr.regParam:100.0})
    
    
    predictionsA = modelA.transform(test_data)
    
    evaluator = RegressionEvaluator(metricName="rmse")
    RMSE = evaluator.evaluate(predictionsA)
    
    return RMSE
    

In [30]:
def sklearn_linear_regression_sgd( X_train, X_test, y_train, y_test): 
    """
       learning rate 0.001
       
    """
    clf = sk.SGDRegressor(learning_rate='constant', eta0=0.001)
    model = clf.fit(X_train, y_train)
    preds = model.predict(X_test)
    return mean_squared_error(y_test, preds)**0.5

In [31]:
def sklearn_linear_regression( X_train, X_test, y_train, y_test): 
    """
        This might not be much of use as I wasn't able to control 
        the learning rate, number of iterations
    """
    lm = sk.LinearRegression()
    model = lm.fit(X_train, y_train)
    predictions = lm.predict(X_test) 
    return mean_squared_error(y_test, predictions)**0.5

In [32]:
def model_test_cross( data_x, data_y, dataset_name, algorithm): 
    algorithm_n = algorithm.__name__
    errors = np.zeros((1, 10)) #  
    for chunk in data_chunks:
        # if chunk is greater than the no. of examples
        if chunk > sumdata_noise.shape[0]: 
            break

        X = data_x.head(n = chunk)
        y = data_y.head(n = chunk) 
        
        kf = KFold(n_splits=10)
        
        
        for (i, (train_index, test_index)) in enumerate(kf.split(X)):
            
            
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            # accumulating errors for each fold
            errors[0][i] = algorithm( X_train, X_test, y_train, y_test) 
        
        # calculates mean across rows
      
        mean_error = np.mean(errors, axis=0)
         
        
        print ("Algorithm: {}  Data set: {} Chunk Size: {} Error: {}".format(algorithm_n,
                                                                             dataset_name,
                                                                             chunk, mean_error[0]))
        

In [55]:
model_test_cross( sumdata_noise_X, sumdata_noise_reg_Y, "Sum data with noise", sklearn_linear_regression_sgd)

  y = column_or_1d(y, warn=True)


Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 100 Error: 0.07171844094857885
Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 500 Error: 0.1055249649382724
Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 1000 Error: 0.10624172349819838
Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 5000 Error: 0.08136902161125027
Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 10000 Error: 0.05886943699884657
Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 50000 Error: 0.0069231202508455774
Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 100000 Error: 0.0011749624297445987
Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 500000 Error: 0.0008308767998693266


In [27]:
model_test_cross( sumdata_noise_X, sumdata_noise_reg_Y, "Sum data with noise", spark_linear_regression)

nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
Algorithm: spark_linear_regression  Data set: Sum data with noise Chunk Size: 100 Error: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
Algorithm: spark_linear_regression  Data set: Sum data with noise Chunk Size: 500 Error: nan
nan
nan
nan
nan
nan
nan


KeyboardInterrupt: 

In [None]:
model_test_cross( sumdata_noise_X, sumdata_noise_reg_Y, "Sum data with noise", keras_linear_regression_sgd)

Algorithm: keras_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 100 Error: 0.02451152913272381
 32/500 [>.............................] - ETA: 1sAlgorithm: keras_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 5000 Error: 0.0013573664715513588
  32/1000 [..............................] - ETA: 3sAlgorithm: keras_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 10000 Error: 1.028794116791687e-06