In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn.linear_model  as sk  # namespace conflict with pyspark
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import scale
from sklearn.model_selection import KFold , cross_val_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
from helper import *
import tensorflow as tf

pd.options.display.max_columns = None
import findspark
import pyspark
from pyspark.sql import SQLContext
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD
from pyspark.sql.functions import col

from pyspark.ml.linalg import VectorUDT
from pyspark.sql.functions import udf
from pyspark.ml.evaluation import RegressionEvaluator

from keras.layers import Dense
from keras.models import Model, Sequential
from keras import initializers, optimizers
from keras import backend as K

Using TensorFlow backend.


# Stuff that needs to be done

- Find another framework.
    - Sklearn
    - Spark
    - Tensorflow/R
- Algorithms
    - Linear Regression
    - Logistic Regression
    - Maybe SVM /Random forest
- Find two metrics 
    - RMSE 
    - Some other
- Find another dataset 
    - Sum data with noise 
    -  Maybe housing data set
    

## Import DataSet

In [3]:
sumdata_url = "https://www.dropbox.com/sh/euppz607r6gsen2/AABABUTdx7YqCeBquA1Ky7z8a/The%20SUM%20dataset?dl=1#"
housing_price_url = "https://www.dropbox.com/sh/euppz607r6gsen2/AAAVLZzU4E7ro0BiRzPG3pP8a/House%20Prices?dl=1"
all_urls = [sumdata_url, housing_price_url]

In [4]:
get_data(all_urls) # retrieves the data if there is no data folder

In [5]:
sumdata_noise_path = "data/with noise/The SUM dataset, with noise.csv"
sumdata_path = "data/without noise/The SUM dataset, without noise.csv"
housing_price_path = "data/housing dataset.csv" # has more than 30 features
# need one more
# what a brilliant idea to name files with space

In [6]:
data_chunks = [100, 500, 1000, 5000, 10000, 50000, 100000, 500000,
1000000, 5000000, 10000000, 50000000, 100000000]

## Load datasets sum_noise

In [7]:
sumdata_noise = pd.read_csv(sumdata_noise_path, delimiter=";")
sumdata_noise.head(n=1)

Unnamed: 0,Instance,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5 (meaningless but please still use it),Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Noisy Target,Noisy Target Class
0,1,62485,58472,84200,86181,75529,136939,150633,230058,246491,257336,1352179,Very Large Number


## Preprocess sum_noise dataset

- Remove 'Instance' as it simply represents the row number
- Extract 'Nosiy Target' as regression target
- Extract 'Nosiy Class' as classification target
- Extract rest columns as explananatory variables
- Apply Feature Scaling to the dataset 

- Ensure all dataframe has been converted to numpy array


### Create same base line for Spark, Keras, Sklearn implementation
- Using the same feature scaled data for all implementation
- Same number iterations
- Same cost function for minimising(for gradient descent)

In [29]:
# Remove 'Instance' as it simply represents the row number



sumdata_noise.drop('Instance', axis = 1)

# Extract 'Nosiy Target' as regression target
sumdata_noise_reg_Y = sumdata_noise.loc[:, ['Noisy Target']]
# Extract 'Nosiy Target Class' as regression target
sumdata_noise_classif_Y = sumdata_noise.loc[:, ['Noisy Target Class']] 
from sklearn import preprocessing
le = preprocessing.LabelEncoder() 
 
sumdata_noise_classif_Y ['Noisy Target Class'] = le.fit_transform( sumdata_noise_classif_Y ['Noisy Target Class'].astype('str')) 

# Extract rest columns as explananatory variables
sumdata_noise_X =sumdata_noise.drop(['Instance','Noisy Target Class'
                                    ], axis = 1) 


### Feature scaling
- Keeping them as dataframes (transform returns ndarray)

In [31]:
scX = StandardScaler()
scY = StandardScaler()
columns = sumdata_noise_X.columns

sumdata_noise_X[columns] = scX.fit_transform(sumdata_noise_X)
# columns = sumdata_noise_reg_Y.columns
# sumdata_noise_reg_Y[columns] = scX.fit_transform(sumdata_noise_reg_Y)


sumdata_noise_scaled = pd.concat([sumdata_noise_X,sumdata_noise_reg_Y])

In [32]:
sc = pyspark.SparkContext()
sqlContext = SQLContext(sc)


In [37]:
def spark_logistic_regression( X_train, X_test, y_train, y_test):
    """
        this uses the dataframe api
    """
    from pyspark.ml.classification import LogisticRegression
    train_data = pd.concat([X_train,y_train])
    test_data  = pd.concat([X_test,y_test])
    
    # converting them into spark dataframe
    train_data = sqlContext.createDataFrame(train_data)
    test_data = sqlContext.createDataFrame(test_data) 
    
    # converting
    train_data = train_data.rdd\
        .map(lambda line:LabeledPoint(line[-1],line[1:-1]))\
        .toDF()
    
    test_data = test_data.rdd\
        .map(lambda line:LabeledPoint(line[-1],line[1:-1]))\
        .toDF()
 
    
    # converting into vector type to make it suitable for ml library
#     as_ml = udf(lambda v: v.asML() if v is not None else None, VectorUDT())
#     train_data = train_data.withColumn("features", as_ml("features"))
#     test_data  = test_data.withColumn("features", as_ml("features"))
    lr = LogisticRegression()

    # Fit 2 models, using different regularization parameters
    modelA = lr.fit(train_data)
    #modelB = lr.fit(train_data, {lr.regParam:100.0})
    
    
    predictionsA = modelA.transform(test_data)
    
    evaluator = RegressionEvaluator(metricName="rmse")
    RMSE = evaluator.evaluate(predictionsA)
    
    return RMSE

In [38]:
def model_test_cross( data_x, data_y, dataset_name, algorithm): 
    algorithm_n = algorithm.__name__
    errors = np.zeros((1, 10)) #  
    for chunk in data_chunks:
        # if chunk is greater than the no. of examples
        if chunk > sumdata_noise.shape[0]: 
            break

        X = data_x.head(n = chunk)
        y = data_y.head(n = chunk) 
        
        kf = KFold(n_splits=10)
        
        
        for (i, (train_index, test_index)) in enumerate(kf.split(X)):
            
            
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            # accumulating errors for each fold
            errors[0][i] = algorithm( X_train, X_test, y_train, y_test) 
        
        # calculates mean across rows
      
        mean_error = np.mean(errors, axis=0)
         
        
        print ("Algorithm: {}  Data set: {} Chunk Size: {} Error: {}".format(algorithm_n,
                                                                             dataset_name,
                                                                             chunk, mean_error[0]))
        

In [39]:
model_test_cross( sumdata_noise_X, sumdata_noise_reg_Y, "Sum data with noise", spark_logistic_regression)

Py4JJavaError: An error occurred while calling o193.fit.
: org.apache.spark.SparkException: Classification labels should be in [0 to 2561633]. Found 90 invalid labels.
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:563)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:487)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:278)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)


### 70/30 split 

In [17]:
def model_test_split( data_x, data_y, dataset_name, algorithm): 
    algorithm_n = algorithm.__name__
  
    for chunk in data_chunks:
        # if chunk is greater than the no. of examples
        if chunk > sumdata_noise.shape[0]: 
            break

        X = data_x.head(n = chunk)
        y = data_y.head(n = chunk) 
        
        kf = KFold(n_splits=10)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
        
       
            # accumulating errors for each fold
        error = algorithm( X_train, X_test, y_train, y_test) 
        
        print ("Algorithm: {}  Data set: {} Chunk Size: {} Error: {}".format(algorithm_n,
                                                                             dataset_name,
                                                                             chunk, error))
        

In [18]:
model_test_split( sumdata_noise_X, sumdata_noise_reg_Y, "Sum data with noise", spark_logistic_regression)

  y = column_or_1d(y, warn=True)


Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 100 Error: 0.12851116699238616
Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 500 Error: 0.11938306709459479
Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 1000 Error: 0.1053070868693961
Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 5000 Error: 0.08468309994156151
Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 10000 Error: 0.06733267756643017
Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 50000 Error: 0.012541256149980499
Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 100000 Error: 0.0020504251684084897
Algorithm: sklearn_linear_regression_sgd  Data set: Sum data with noise Chunk Size: 500000 Error: 0.0008324379449694423
