In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn.linear_model  as sk  # namespace conflict with pyspark
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import scale
from sklearn.model_selection import KFold , cross_val_score

import pandas as pd
from helper import *
import tensorflow as tf

pd.options.display.max_columns = None
import findspark
import pyspark
from pyspark.sql import SQLContext
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql.functions import col

from pyspark.ml.linalg import VectorUDT
from pyspark.sql.functions import udf
from pyspark.ml.evaluation import RegressionEvaluator



# Stuff that needs to be done
- Find another framework.
    - Sklearn
    - Spark
    - Tensorflow/R
- Find two metrics 
    - RMSE 
    - Some other
- Do 70/30 split training and 10 cross validation
    - only did cross validation
- Find another dataset 
    - Sum data with noise 
    -  Maybe housing data set
    

## Import DataSet

In [2]:
sumdata_url = "https://www.dropbox.com/sh/euppz607r6gsen2/AABABUTdx7YqCeBquA1Ky7z8a/The%20SUM%20dataset?dl=1#"
housing_price_url = "https://www.dropbox.com/sh/euppz607r6gsen2/AAAVLZzU4E7ro0BiRzPG3pP8a/House%20Prices?dl=1"
all_urls = [sumdata_url, housing_price_url]

In [3]:
get_data(all_urls) # retrieves the data if there is no data folder

In [4]:
sumdata_noise_path = "data/with noise/The SUM dataset, with noise.csv"
sumdata_path = "data/without noise/The SUM dataset, without noise.csv"
housing_price_path = "data/housing dataset.csv" # has more than 30 features
# need one more
# what a brilliant idea to name files with space

In [5]:
data_chunks = [100, 500, 1000, 5000, 10000, 50000, 100000, 500000,
1000000, 5000000, 10000000, 50000000, 100000000]

## Load datasets sum_noise

In [6]:
sumdata_noise = pd.read_csv(sumdata_noise_path, delimiter=";")
sumdata_noise.head(n=1)

Unnamed: 0,Instance,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5 (meaningless but please still use it),Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Noisy Target,Noisy Target Class
0,1,62485,58472,84200,86181,75529,136939,150633,230058,246491,257336,1352179,Very Large Number


## Preprocess sum_noise dataset

- Remove 'Instance' as it simply represents the row number
- Extract 'Nosiy Target' as regression target
- Extract 'Nosiy Class' as classification target
- Extract rest columns as explananatory variables
- Apply Feature Scaling to the dataset 

- Ensure all dataframe has been converted to numpy array


In [33]:
# Remove 'Instance' as it simply represents the row number
sumdata_noise.drop('Instance', axis = 1)

# Extract 'Nosiy Target' as regression target
sumdata_noise_reg_Y = sumdata_noise[['Noisy Target']]

# Extract 'Nosiy Target Class' as regression target
sumdata_noise_classif_Y = sumdata_noise['Noisy Target Class']

# Extract rest columns as explananatory variables
sumdata_noise_X = sumdata_noise.iloc[:, 1:-2]


### Linear Regression using pyspark

In [8]:
sc = pyspark.SparkContext()
sqlContext = SQLContext(sc)


In [28]:
from  pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [30]:
def spark_linear_regression_cross(data_df):
    
    
    input_features = list(sumdata_noise.columns)[:-2]
    ouput_label = list(sumdata_noise.columns)[-2]
    
    data = data_df.select(input_features)\
      .rdd\
      .map(lambda line:LabeledPoint(line[-1],line[1:-1]))\
      .toDF()
    
  
    as_ml = udf(lambda v: v.asML() if v is not None else None, VectorUDT())
    dataset = data.withColumn("features", as_ml("features"))
    
    lr = LinearRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0,1]).build()
    evaluator = RegressionEvaluator(metricName="rmse")
    
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator, numFolds = 10)
    
    cvModel = cv.fit(dataset)
    print("Root Mean Squared Error = " + str(cvModel.avgMetrics[0]))
    
  
    

##### http://www.techpoweredmath.com/spark-dataframes-mllib-tutorial/

In [32]:
def spark_linear_regression(data_df):
    input_features = list(sumdata_noise.columns)[:-2]
    ouput_label = list(sumdata_noise.columns)[-2]
    
    data = data_df.select(input_features)\
      .rdd\
      .map(lambda line:LabeledPoint(line[-1],line[1:-1]))\
      .toDF()
    
    # converting RDD to a dataframe
    as_ml = udf(lambda v: v.asML() if v is not None else None, VectorUDT())
    result = data.withColumn("features", as_ml("features"))
    result
    
    lr = LinearRegression()

    # Fit 2 models, using different regularization parameters
    modelA = lr.fit(result, {lr.regParam:0.0})
    modelB = lr.fit(result, {lr.regParam:100.0})
    
    
    predictionsA = modelA.transform(result)
    
    evaluator = RegressionEvaluator(metricName="rmse")
    RMSE = evaluator.evaluate(predictionsA)
    print("ModelA: Root Mean Squared Error = " + str(RMSE))
    
    predictionsB = modelB.transform(result)
    RMSE = evaluator.evaluate(predictionsB)
    print("ModelB: Root Mean Squared Error = " + str(RMSE))
    
    

In [31]:
for chunk in data_chunks:

    if chunk > sumdata_noise.shape[0]: # if chunk is greater than the no. of examples
        break
        
    data = sumdata_noise[:chunk]
    data.drop('Noisy Target Class', axis=1)
    data_df = sqlContext.createDataFrame(data)
     
    
    spark_linear_regression_cross(data_df)
    print("\n")


Root Mean Squared Error = 35076.84237834721


Root Mean Squared Error = 37001.44638245123


Root Mean Squared Error = 36494.68981887479


Root Mean Squared Error = 36988.15101109642


Root Mean Squared Error = 37165.84094858587


Root Mean Squared Error = 37236.66504471149


Root Mean Squared Error = 37265.16554135001


Root Mean Squared Error = 37393.852666480605




### Linear regression using scikit learn

In [22]:
def sklearn_linear_regression_cross( data_x, data_y,  data_set, algorithm=sk.LinearRegression): 
    
    algo_name = algorithm.__name__
    
    for chunk in data_chunks:

        if chunk > sumdata_noise.shape[0]: # if chunk is greater than the no. of examples
            break

        X = data_x.head(n = chunk)
        y = data_y.head(n = chunk)
        
        X = X.as_matrix()
        y = y.as_matrix() 
        kf = KFold(n_splits=10)
        errors = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            lm = sk.LinearRegression()
            model = lm.fit(X_train, y_train)
            predictions = lm.predict(X_test) 
        
            errors.append(mean_squared_error(y_test, predictions)**0.5)  # RMSE https://www.kaggle.com/wiki/RootMeanSquaredError
        #http://statweb.stanford.edu/~tibs/sta306bfiles/cvwrong.pdf
        
        error = sum(errors)/len(errors)
        print ("Data set: {} Algorithm: {}  Chunk Size: {} Error: {}".format(data_set,
                                                                          algo_name,
                                                                          chunk,
                                                                           error
                                                                         ))
 
              

In [None]:

sklearn_linear_regression_cross(sumdata_noise_X, sumdata_noise_reg_Y,"Noisy Sum", sk.LinearRegression )

### Logistic regression