In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import scale
import pandas as pd
from helper import *
import tensorflow as tf

pd.options.display.max_columns = None
import findspark
import pyspark
from pyspark.sql import SQLContext
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql.functions import col


## Import DataSet

In [2]:
sumdata_url = "https://www.dropbox.com/sh/euppz607r6gsen2/AABABUTdx7YqCeBquA1Ky7z8a/The%20SUM%20dataset?dl=1#"
housing_price_url = "https://www.dropbox.com/sh/euppz607r6gsen2/AAAVLZzU4E7ro0BiRzPG3pP8a/House%20Prices?dl=1"
all_urls = [sumdata_url, housing_price_url]

In [3]:
get_data(all_urls) # retrieves the data if there is no data folder

In [4]:
sumdata_noise_path = "data/with noise/The SUM dataset, with noise.csv"
sumdata_path = "data/without noise/The SUM dataset, without noise.csv"
housing_price_path = "data/housing dataset.csv" # has more than 30 features
# need one more
# what a brilliant idea to name files with space

In [5]:
data_chunks = [100, 500, 1000, 5000, 10000, 50000, 100000, 500000,
1000000, 5000000, 10000000, 50000000, 100000000]

## Load datasets sum_noise

In [6]:
sumdata_noise = pd.read_csv(sumdata_noise_path, delimiter=";")
sumdata_noise.head(n=1)

Unnamed: 0,Instance,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5 (meaningless),Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Noisy Target,Noisy Target Class
0,1,66957,74432,96087,103120,64272,150633,181787,180349,216912,304071,1434819,Very Large Number


## Preprocess sum_noise dataset

- Remove 'Instance' as it simply represents the row number
- Extract 'Nosiy Target' as regression target
- Extract 'Nosiy Class' as classification target
- Extract rest columns as explananatory variables
- Apply Feature Scaling to the dataset 

- Ensure all dataframe has been converted to numpy array


In [18]:
# Remove 'Instance' as it simply represents the row number
sumdata_noise.drop('Instance', axis = 1)

# Extract 'Nosiy Target' as regression target
sumdata_noise_reg_Y = sumdata_noise['Noisy Target']

sumdata_noise_reg_Y = sumdata_noise_reg_Y.values.reshape(sumdata_noise_reg_Y.shape[0],1)
# Extract 'Nosiy Target Class' as regression target
sumdata_noise_classif_Y = sumdata_noise['Noisy Target Class']

# Extract rest columns as explananatory variables
sumdata_noise_X = sumdata_noise.iloc[:, 1:-2].as_matrix()

 = sumdata_noise.drop('Noisy Target Class', axis=1)[:100]
smalldata.head()

Unnamed: 0,Instance,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5 (meaningless),Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Noisy Target
0,1,66957,74432,96087,103120,64272,150633,181787,180349,216912,304071,1434819
1,2,96030,86875,108299,148025,16965,253819,258672,268851,404599,543092,2148748
2,3,26212,23398,27668,39678,23062,65873,65660,68508,82617,115418,476405
3,4,28363,33381,42447,35270,8980,52885,79144,85741,86806,147368,635169
4,5,38960,50255,79879,91885,64037,127193,115760,174069,184805,250659,1221471


In [8]:
sc = pyspark.SparkContext()
sqlContext = SQLContext(sc)


In [19]:
# converting pandas -> spark dataframe
 

#data_df = sqlContext.createDataFrame(sumdata_noise.drop('Noisy Target Class', axis=1))
data_df = sqlContext.createDataFrame(smalldata)

In [20]:
input_features = list(sumdata_noise.columns)[:-2]
ouput_label = list(sumdata_noise.columns)[-2]

##### http://www.techpoweredmath.com/spark-dataframes-mllib-tutorial/

In [35]:
data = data_df.select(input_features)\
  .rdd\
  .map(lambda line:LabeledPoint(line[-1],line[1:-1]))\
  .toDF()

In [36]:
# spark needs the dataframe to be labeled with "features" and "labels"
assembler = VectorAssembler(
    inputCols=input_features,
    outputCol="features")
transformed = assembler.transform(data_df)
from pyspark.mllib import linalg as mllib_linalg
from pyspark.ml import linalg as ml_linalg
from pyspark.ml.linalg import Vectors

def as_old(v):
    if isinstance(v, ml_linalg.SparseVector):
        return mllib_linalg.SparseVector(v.size, v.indices, v.values)
    if isinstance(v, ml_linalg.DenseVector):
        return  Vectors.sparse(v.values)
    raise ValueError("Unsupported type {0}".format(type(v)))

In [44]:
(transformed.select(col(ouput_label).alias("label"), col("features"))
  .rdd
  .map(lambda row: LabeledPoint(row.label, as_old(row.features))))

PythonRDD[122] at RDD at PythonRDD.scala:48

In [45]:
transformed

DataFrame[Instance: bigint, Feature 1: bigint, Feature 2: bigint, Feature 3: bigint, Feature 4: bigint, Feature 5 (meaningless): bigint, Feature 6: bigint, Feature 7: bigint, Feature 8: bigint, Feature 9: bigint, Feature 10: bigint, Noisy Target: bigint, features: vector]

In [42]:
lr = LinearRegression()

# Fit 2 models, using different regularization parameters
modelA = lr.fit(transformed, {lr.regParam:0.0})

IllegalArgumentException: 'Field "label" does not exist.'

In [None]:
from math import sqrt
def spark_linear_regression_sgd( X_train, X_test, y_train, y_test):
    """
        This is using RDD API, which is going to be deprecated 
    
    """
    train_data = pd.concat([X_train,y_train])
    test_data  = pd.concat([X_test,y_test])
    
    # converting them into spark dataframe
    train_data = sqlContext.createDataFrame(train_data)
    test_data = sqlContext.createDataFrame(test_data) 
    
    # converting
    train_data = train_data.rdd\
      .map(lambda line:LabeledPoint(line[-1],line[1:-1]))
    
    test_data = test_data.rdd\
        .map(lambda line:LabeledPoint(line[-1],line[1:-1]))
    
    lr = LinearRegressionWithSGD()

    # Fit 2 models, using different regularization parameters
    model = lr.train(train_data,iterations=10, step = 0.001)
    
    # Evaluate the model on test data
    valuesAndPreds = train_data.map(lambda p: (p.label, model.predict(p.features))) 
     
    RMSE = sqrt(valuesAndPreds\
        .map(diff_sq)\
        .reduce(sum_1) / valuesAndPreds.count()
              )
  

    return RMSE

In [None]:
def spark_linear_regression( X_train, X_test, y_train, y_test):
    """
        this uses the dataframe api
    """
    from pyspark.ml import Pipeline
   
    
    
    ## SHOULD FIND A BETTER WAY
    X_train.reset_index(drop=True, inplace=True)
    y_train.reset_index(drop=True, inplace=True)
    
    X_test.reset_index(drop=True, inplace=True)
    y_test.reset_index(drop=True, inplace=True)
    
    train_data = pd.concat([X_train,y_train], axis =1)
    test_data  = pd.concat([X_test,y_test], axis =1)
    
 
    train_data = sqlContext.createDataFrame(train_data)
    test_data = sqlContext.createDataFrame(test_data) 
   
   
    assembler = (VectorAssembler()
        .setInputCols(train_data.columns[:-1]) # everything excluding target
        .setOutputCol("features"))
 
    lr = LogisticRegression(maxIter=10, regParam=0.01)

    pipeline = Pipeline(stages=[assembler, lr])
    model = pipeline.fit(train_data)
    
    
    predictions = model.transform(test_data) 
    evaluator = RegressionEvaluator(metricName="rmse")
    RMSE = evaluator.evaluate(predictions)
    
    return RMSE
    