In [172]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn.linear_model  as sk  # namespace conflict with pyspark
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.model_selection import KFold , cross_val_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
from helper import *
import tensorflow as tf

from helper import get_news_dataset, get_data, create_classes

pd.options.display.max_columns = None
import findspark
import pyspark
from pyspark.sql import SQLContext
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD
from pyspark.sql.functions import col

from pyspark.ml.linalg import VectorUDT
from pyspark.sql.functions import udf
from pyspark.ml.evaluation import RegressionEvaluator

from keras.layers import Dense
from keras.models import Model, Sequential
from keras import initializers, optimizers
from keras.utils import np_utils
from keras import backend as K


# metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, precision_score, accuracy_score 

## Import DataSet

In [173]:
sumdata_url = "https://www.dropbox.com/sh/euppz607r6gsen2/AABABUTdx7YqCeBquA1Ky7z8a/The%20SUM%20dataset?dl=1#"
housing_url = "https://www.dropbox.com/sh/euppz607r6gsen2/AAD6JGlvG5XADIjg9SCojvpya/House%20Sales%20in%20King%20County%2C%20USA?dl=1&preview=kc_house_data.csv"
all_urls = [sumdata_url,housing_url]

In [174]:
get_data(all_urls) # retrieves the data if there is no data folder

In [175]:
sumdata_noise_path = "data/with noise/The SUM dataset, with noise.csv" 
housing_price_path ="data/kc_house_data.csv" 

## Sum Data restricted to 100,000

In [176]:
SUM_DATA_SIZE = 100000

## Housing restricted to 10,000

In [177]:
HOUSE_DATA_SIZE = 10000

## Load datasets sum_noise

In [178]:
sumdata_noise = pd.read_csv(sumdata_noise_path, delimiter=";")
sumdata_noise = sumdata_noise.head(n=SUM_DATA_SIZE)
sumdata_noise.head(n=1)

Unnamed: 0,Instance,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5 (meaningless but please still use it),Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Noisy Target,Noisy Target Class
0,1,62485,58472,84200,86181,75529,136939,150633,230058,246491,257336,1352179,Very Large Number


## Preprocess sum_noise dataset

- Remove 'Instance' as it simply represents the row number
- Extract 'Nosiy Target' as regression target
- Extract 'Nosiy Class' as classification target
- Extract rest columns as explananatory variables
- Apply Feature Scaling to the dataset 

- Ensure all dataframe has been converted to numpy array


### Create same base line for Spark, Keras, Sklearn implementation
- Using the same feature scaled data for all implementation
- Same number iterations
- Same cost function for minimising(for gradient descent)

In [179]:
# Remove 'Instance' as it simply represents the row number

from sklearn import preprocessing
le = preprocessing.LabelEncoder()

sumdata_noise.drop('Instance', axis = 1)

# Extract 'Nosiy Target' as regression target
sumdata_noise_reg_Y = sumdata_noise.loc[:, ['Noisy Target']] 

sumdata_noise_classif_Y = sumdata_noise.loc[:, ['Noisy Target Class']]
 
le.fit(sumdata_noise_classif_Y) 
sumdata_n_classes = len(le.classes_)
sumdata_noise_classif_Y['Noisy Target Class'] = le.transform(sumdata_noise_classif_Y) 
 
# Extract rest columns as explananatory variables
sumdata_noise_X =sumdata_noise.drop(['Instance','Noisy Target Class','Noisy Target'], axis = 1) 
 
sumdata_noise_classif_Y.rename(columns={'Noisy Target Class': 'label'}, inplace=True)



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


### Feature scaling
- Keeping them as dataframes (transform returns ndarray)

In [180]:
sumdata_n_classes

5

In [181]:
scX = StandardScaler()
scY = StandardScaler()
columns = sumdata_noise_X.columns

sumdata_noise_X[columns] = scX.fit_transform(sumdata_noise_X)
columns = sumdata_noise_reg_Y.columns
sumdata_noise_reg_Y[columns] = scX.fit_transform(sumdata_noise_reg_Y)


sumdata_noise_scaled = pd.concat([sumdata_noise_X,sumdata_noise_reg_Y])

## Load House Price dataset

In [217]:
housing_price = pd.read_csv(housing_price_path, delimiter=",")
housing_price = housing_price.head(n=HOUSE_DATA_SIZE)
housing_price.head(n=1)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650


## Preprocess housing price dataset

- Remove 'Id' as it simply represents the row number
- Use 'price' as regression target
- Use 'SaleCondition' as classification target
- For explananatory variables, use the following numerical variable and categorical variables
    - Numerical
        - bedrooms
        - bathrooms
        - sqft_living
        - sqft_lot
        - condition
        - sqft_above
        - yr_built
        - sqft_living15
        - sqft_lot15
- Apply Feature Scaling to the dataset 

- Ensure all dataframe has been converted to numpy array


In [218]:
#Use 'price' as regression target
HOUSING_NB_CLASSES = 2 # binary classifier
housing_price_reg_Y = housing_price.loc[:, ['price']]

# Use following features to predict price
features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'condition', 'sqft_above', 'yr_built',
                      'sqft_living15', 'sqft_lot15']

housing_price[features[0]]
filtered_table = housing_price[features[0]]
filtered_table

for c in features[1:]:
    filtered_table = pd.concat([filtered_table, housing_price[c]], axis=1)
filtered_table

# get explanatory variable
housing_price_reg_X = filtered_table[:].values
housing_price_classif_X =filtered_table[:]


## get the regression target and classification target

# I am encoding price as categorical variable as the classification target, e.g try to determine the house condition
housing_price_classif_Y = np.zeros(housing_price.shape[0])
house_price_mean = housing_price['price'].values.mean()
for i, r in enumerate(housing_price['price']):
    if r > house_price_mean:
        housing_price_classif_Y[i] = 1
    else:
        housing_price_classif_Y[i] = 0
        
# I am using price as the regression target
housing_price_reg_Y = housing_price['price'].values.reshape(-1,1)




columns = filtered_table.columns

 

# Apply Feature Scaling to the classification variable
scX = StandardScaler()
scY = StandardScaler() 

housing_price_classif_X[columns] = scX.fit_transform(housing_price_reg_X[:])

housing_price_classif_Y =  pd.DataFrame(housing_price_classif_Y, columns=["label"])

In [219]:
sc = pyspark.SparkContext()
sqlContext = SQLContext(sc)


ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=pyspark-shell, master=local[*]) created by __init__ at <ipython-input-20-8030b2fc72d9>:1 

In [220]:
def sklearn_logistic_regression( X_train, X_test, y_train, y_test,_):
    lm = sk.LogisticRegression()
    lm.fit(X_train, y_train)
    accuracy= accuracy_score(y_test, lm.predict(X_test))
            
    precision = precision_score(y_test, lm.predict(X_test),average="weighted")
    return accuracy, precision

In [221]:
def spark_logistic_regression( X_train, X_test, y_train, y_test,_):
    """
        this uses the dataframe api
    """
    from pyspark.ml import Pipeline
    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    # spark needs a target column
     
    
     
    X_train.reset_index(drop=True, inplace=True)
    y_train.reset_index(drop=True, inplace=True)
    
    X_test.reset_index(drop=True, inplace=True)
    y_test.reset_index(drop=True, inplace=True)
    
    train_data = pd.concat([X_train,y_train], axis =1)
    test_data  = pd.concat([X_test,y_test], axis =1)
    
 
    train_data = sqlContext.createDataFrame(train_data)
    test_data = sqlContext.createDataFrame(test_data) 
   
   
    assembler = (VectorAssembler()
        .setInputCols(train_data.columns[:-1]) # everything excluding target
        .setOutputCol("features"))
 
    lr = LogisticRegression(maxIter=10, regParam=0.01)

    pipeline = Pipeline(stages=[assembler, lr])
    model = pipeline.fit(train_data)
    
    
    predictions = model.transform(test_data) 
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
    accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
    precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
    return (accuracy, precision )


In [222]:
def precision(y_true, y_pred):
    """Precision metric.
    https://github.com/fchollet/keras/commit/a56b1a55182acf061b1eb2e2c86b48193a0e88f7
    Only computes a batch-wise average of precision.

    Computes the precision, a metric for multi-label classification of
    how many selected items are relevant.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

In [235]:
#keras log reg
def keras1DLogReg(X_train, X_test, y_train, y_test,classes):
    
    from sklearn import preprocessing
    encoder = preprocessing.LabelEncoder()
    
    #Reshape and normalise inputs
    X_train = X_train.as_matrix()
    X_test = X_test.as_matrix()   
    y_train = y_train.as_matrix()
    y_test  = y_test.as_matrix()
    # converting to one-hot encoding
    y_train = np_utils.to_categorical(y_train,classes)
    y_test = np_utils.to_categorical(y_test,classes)
    
    
    #Build the model
    output_dim = y_train.shape[1]
    model = Sequential()
    model.add(Dense(output_dim, input_dim = X_train.shape[1], activation='softmax'))

   
    batch_size = 1
    epochs = 10

    
  
    #Compile ze model
    model.compile(optimizer = 'sgd', 
                  loss='categorical_crossentropy', 
                  metrics=['accuracy', precision ])
    history = model.fit(X_train, 
                        y_train, 
                        batch_size = batch_size,
                        epochs = epochs,
                        verbose = 0)
     
    score = model.evaluate(X_test, y_test, verbose = 0)
    
    accuracy = score[0]
    precision1= score[1]
#     print ('Test score: ', score[0])
#     print ('Test accuracy', score[1])
     
    return (accuracy,precision1)

In [224]:
def model_cross_valdiation( X, y, dataset_name, algorithm, kfolds, classes=0): 
    
    kf = KFold(n_splits=kfolds)
    rmse = np.zeros((kfolds,1))
    mae = np.zeros((kfolds,1))
    accuracy = np.zeros((kfolds,1))
    precision = np.zeros((kfolds,1))
    
    algorithm_n = algorithm.__name__ 
 

    for (i, (train_index, test_index)) in enumerate(kf.split(X)):


        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        accuracy[i],precision[i] = algorithm( X_train, X_test, y_train, y_test,classes)
           
   
    print ("Dataset: {}\Accuracy: {}\nPrecision: {}\n".format(dataset_name, accuracy.mean(), precision.mean()))
    
          

## Sum data

In [197]:


model_cross_valdiation( sumdata_noise_X.head(n=100000), sumdata_noise_classif_Y.head(n=100000), "Sum data with noise", keras1DLogReg,10,sumdata_n_classes)


KeyboardInterrupt: 

In [236]:
model_cross_valdiation( sumdata_noise_X, sumdata_noise_classif_Y, "Sum data with noise", spark_logistic_regression,10)


Dataset: Sum data with noise\Accuracy: 0.95061
Precision: 0.932101779454789



In [237]:
model_cross_valdiation( sumdata_noise_X, sumdata_noise_classif_Y, "Sum data with noise", sklearn_logistic_regression,10)


  y = column_or_1d(y, warn=True)
  'precision', 'predicted', average, warn_for)


Dataset: Sum data with noise\Accuracy: 0.95062
Precision: 0.9321175698636524



## House data

In [233]:


model_cross_valdiation( housing_price_classif_X,housing_price_classif_Y,"Housing data", keras1DLogReg,10,HOUSING_NB_CLASSES)


Dataset: Housing data\Accuracy: 0.4264479509592056
Precision: 0.8019000000000001



In [227]:
model_cross_valdiation( housing_price_classif_X,housing_price_classif_Y,"Housing data", spark_logistic_regression,10)


Dataset: Housing data\Accuracy: 0.8048
Precision: 0.8034502255729242



In [228]:
model_cross_valdiation( housing_price_classif_X,housing_price_classif_Y, "Housing data", sklearn_logistic_regression,10)


Dataset: Housing data\Accuracy: 0.8055
Precision: 0.8037219112739109



  y = column_or_1d(y, warn=True)


### 70/30 split 

In [151]:
def model_test_split( X, y, dataset_name, algorithm,classes=0): 
    algorithm_n = algorithm.__name__
    rmse = 0
    mae = 0 
    accuracy = 0
    precision = 0  

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

 
    
    accuracy ,precision  = algorithm( X_train, X_test, y_train, y_test,classes)
    # accumulating errors for each fold
    print ("Algorithm: {} Dataset: {}\Accuracy: {}\nPrecision: {}\n".format(
                                                            algorithm_n,
                                                            dataset_name,
                                                            accuracy,
                                                            precision))



## Sum data

In [154]:
## Takes too long to get a result

model_test_split( sumdata_noise_X, sumdata_noise_classif_Y, "Sum data with noise", keras1DLogReg,sumdata_n_classes)

TypeError: to_categorical() got an unexpected keyword argument 'nb_classes'

In [152]:
model_test_split( sumdata_noise_X, sumdata_noise_classif_Y, "Sum data with noise", spark_logistic_regression)

Algorithm: spark_logistic_regression Dataset: Sum data with noise\Accuracy: 0.9500333333333333
Precision: 0.9321881375486843



In [231]:
model_test_split( sumdata_noise_X, sumdata_noise_classif_Y,"Sum data with noise", sklearn_logistic_regression)

  y = column_or_1d(y, warn=True)


Algorithm: sklearn_logistic_regression Dataset: Sum data with noise\Accuracy: 0.9525666666666667
Precision: 0.9357732500867642



  'precision', 'predicted', average, warn_for)


## House data

In [234]:
model_test_split( housing_price_classif_X,housing_price_classif_Y, "House prices", keras1DLogReg,HOUSING_NB_CLASSES)

Algorithm: keras1DLogReg Dataset: House prices\Accuracy: 0.43398685812950133
Precision: 0.794666666507721



In [229]:
model_test_split( housing_price_classif_X,housing_price_classif_Y, "Housing prices", spark_logistic_regression)

Algorithm: spark_logistic_regression Dataset: Housing prices\Accuracy: 0.794
Precision: 0.792293964931692



In [230]:
model_test_split( housing_price_classif_X,housing_price_classif_Y, "Housing prices", sklearn_logistic_regression)

Algorithm: sklearn_logistic_regression Dataset: Housing prices\Accuracy: 0.8033333333333333
Precision: 0.8015483849789916



  y = column_or_1d(y, warn=True)
