In [95]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import pandas as pd
from helper import *
pd.options.display.max_columns = None

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

## Import DataSet

In [96]:
sumdata_url = "https://www.dropbox.com/sh/euppz607r6gsen2/AABABUTdx7YqCeBquA1Ky7z8a/The%20SUM%20dataset?dl=1#"
housing_price_url = "https://www.dropbox.com/sh/euppz607r6gsen2/AAAVLZzU4E7ro0BiRzPG3pP8a/House%20Prices?dl=1"
census_data_url = "https://www.dropbox.com/sh/euppz607r6gsen2/AADfweTUjMlkayMQ6JvKUOGia/Census-Income%20(KDD)%20Data%20Set?dl=0"
all_urls = [sumdata_url, housing_price_url, census_data_url]

In [97]:
get_data(all_urls) # retrieves the data if there is no data folder

In [98]:
sumdata_noise_path = "data/with noise/The SUM dataset, with noise.csv"
sumdata_path = "data/without noise/The SUM dataset, without noise.csv"
housing_price_path = "data/housing dataset.csv" # has more than 30 features
census_path = "data/housing dataset.csv"
titanic_path = "data/"
# need one more
# what a brilliant idea to name files with space

## Load datasets sum_noise

In [99]:
sumdata_noise = pd.read_csv(sumdata_noise_path, delimiter=";")
sumdata_noise

Unnamed: 0,Instance,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5 (meaningless),Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Noisy Target,Noisy Target Class
0,1,66957,74432,96087,103120,64272,150633,181787,180349,216912,304071,1434819,Very Large Number
1,2,96030,86875,108299,148025,16965,253819,258672,268851,404599,543092,2148748,Very Large Number
2,3,26212,23398,27668,39678,23062,65873,65660,68508,82617,115418,476405,Very Large Number
3,4,28363,33381,42447,35270,8980,52885,79144,85741,86806,147368,635169,Very Large Number
4,5,38960,50255,79879,91885,64037,127193,115760,174069,184805,250659,1221471,Very Large Number
5,6,34616,45307,50710,58657,58330,79111,99885,111644,140952,210654,794117,Very Large Number
6,7,12888,20126,22753,23219,1766,35151,40627,43459,65339,78915,363026,Large Number
7,8,30922,38291,43157,53434,33790,95398,109501,114824,168799,160433,813129,Very Large Number
8,9,33869,47235,53800,69464,35334,84096,119056,141564,137810,151482,914668,Very Large Number
9,10,28247,24863,41941,38753,101615,56320,80638,103624,132245,149352,696654,Very Large Number


## Preprocess sum_noise dataset

- Remove 'Instance' as it simply represents the row number
- Extract 'Nosiy Target' as regression target
- Extract 'Nosiy Class' as classification target
- Extract rest columns as explananatory variables
- Apply Feature Scaling to the dataset 

- Ensure all dataframe has been converted to numpy array


In [100]:
# Remove 'Instance' as it simply represents the row number
sumdata_noise.drop('Instance', axis = 1)

# Use 'Nosiy Target' as regression target
sumdata_noise_reg_Y = sumdata_noise['Noisy Target'].values.reshape(-1, 1)

# Use 'Nosiy Target Class' as regression target
sumdata_noise_classif_Y = sumdata_noise['Noisy Target Class']

# Use rest columns as explananatory variables
sumdata_noise_X = sumdata_noise.iloc[:, 0:-2]

# Apply Feature Scaling to the dataset 
from sklearn.preprocessing import StandardScaler
scX = StandardScaler()
scY = StandardScaler()
sumdata_noise_X = scX.fit_transform(sumdata_noise_X)
sumdata_noise_reg_Y = scY.fit_transform(sumdata_noise_reg_Y)

sumdata_noise_X



array([[-1.73204902,  0.57148372,  0.40463575, ...,  0.01029815,
         0.01473561,  0.29953474],
       [-1.73204544,  1.55295829,  0.75447754, ...,  0.8441455 ,
         1.48881339,  1.86218921],
       [-1.73204186, -0.80402556, -1.03021326, ..., -1.04344437,
        -1.04000593, -0.93382741],
       ..., 
       [ 1.73204186,  0.03042869,  0.34654902, ...,  0.76615191,
         0.79476291,  0.37581042],
       [ 1.73204544,  0.73163628,  0.32251022, ...,  0.86467558,
         1.10936719,  0.36614112],
       [ 1.73204902, -0.98058498, -1.05430829, ..., -1.14767765,
        -1.17865895, -1.17127118]])

## Load datasets sumdata

In [101]:
sumdata = pd.read_csv(sumdata_path, delimiter=";")
sumdata.head(n=500)

Unnamed: 0,Instance,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5 (meaningless),Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Target,Target Class
0,1,57326,68791,82549,99059,72624,142645,171174,205409,246491,295789,1073444,Very Large Number
1,2,87859,105431,126517,151820,19982,218621,262345,314814,377777,453332,1645184,Very Large Number
2,3,23721,28465,34158,40990,20054,59026,70831,84997,101996,122395,444184,Large Number
3,4,24771,29725,35670,42804,7775,61638,73966,88759,106511,127813,463844,Large Number
4,5,47862,57434,68921,82705,60872,119095,142914,171497,205796,246955,896224,Very Large Number
5,6,35286,42343,50812,60974,51392,87803,105364,126437,151724,182069,660743,Very Large Number
6,7,14070,16884,20261,24313,1509,35011,42013,50416,60499,72599,263467,Large Number
7,8,34018,40822,48986,58783,38750,84648,101578,121894,146273,175528,637002,Very Large Number
8,9,36379,43655,52386,62863,29843,90523,108628,130354,156425,187710,681213,Very Large Number
9,10,25867,31040,37248,44698,92630,64366,77239,92687,111224,133469,484369,Large Number


## Preprocess sumdata dataset

- Remove 'Instance' as it simply represents the row number
- Extract 'Nosiy Target' as regression target
- Extract 'Nosiy Class' as classification target
- Extract rest of the columns as explananatory variables
- Apply Feature Scaling to the dataset 

- Ensure all dataframe has been converted to numpy array


In [102]:
# Remove 'Instance' as it simply represents the row number
sumdata.drop('Instance', axis = 1)

# Use 'Nosiy Target' as regression target
sumdata_reg_Y = sumdata['Target'].values.reshape(-1, 1)

# Use 'Nosiy Target Class' as classification target
sumdata_classif_Y = sumdata['Target Class']

# Use rest columns as explananatory variables
sumdata_X = sumdata.iloc[:, 0:-2].values
# Apply Feature Scaling to the dataset 
from sklearn.preprocessing import StandardScaler
scX = StandardScaler()
scY = StandardScaler()
sumdata_X = scX.fit_transform(sumdata_X)
sumdata_reg_Y = scY.fit_transform(sumdata_reg_Y)
sumdata_X.head(n=500)




AttributeError: 'numpy.ndarray' object has no attribute 'head'

## Load housing price dataset

In [None]:
housing_price = pd.read_csv(housing_price_path)
housing_price = housing_price
housing_price

## Preprocess housing price dataset

- Remove 'Id' as it simply represents the row number
- Use 'SalePrice' as regression target
- Use 'SaleCondition' as classification target
- For explananatory variables, use the following numerical variable and categorical variables
    - Numerical
        - GarageArea
        - GarageCars
        - LotArea
        - YearBuilt
        - 1stFlrSF
        - 2ndFlrSF
    - Categorical
        - Neighborhood, as the career of the neighbour impacts on factors such as the consuming power of the area 
        - HouseStyle, well-designed house is more expensive
- Apply Feature Scaling to the dataset 

- Ensure all dataframe has been converted to numpy array


In [None]:
# Remove 'Instance' as it simply represents the row number
housing_price.drop('Id', axis = 1)

## filter unused categorical columns and drop NaN, we only want to keep 'neighborhood' and 'housestyle' 
## in our explanantory vairbale

# get all the numerical data
filtered_table = housing_price['GarageArea']
filtered_table = pd.concat([filtered_table, housing_price['GarageCars']], axis=1)
filtered_table = pd.concat([filtered_table, housing_price['LotArea']], axis=1)
filtered_table = pd.concat([filtered_table, housing_price['YearBuilt']], axis=1)
filtered_table = pd.concat([filtered_table, housing_price['1stFlrSF']], axis=1)
filtered_table = pd.concat([filtered_table, housing_price['2ndFlrSF']], axis=1)
filtered_table

# get the two categorical variable that we wants to use
filtered_table = pd.concat([filtered_table, pd.get_dummies(housing_price['Neighborhood']).iloc[:, :-1]], axis=1)
filtered_table = pd.concat([filtered_table, pd.get_dummies(housing_price['HouseStyle']).iloc[:, :-1]], axis=1)

# get the regression target and classification target
filtered_table = pd.concat([filtered_table, housing_price['SalePrice']], axis=1)
filtered_table = pd.concat([filtered_table, housing_price['SaleCondition']], axis=1)

# drop NaN
filtered_table = filtered_table.dropna()

# Use 'Nosiy Target' as regression target
housing_price_reg_Y = filtered_table['SalePrice'].reshape(-1, 1)

# Use 'Nosiy Target Class' as classification target
housing_price_classif_Y = filtered_table['SaleCondition'].reshape(-1, 1)

# Get other columns as the explanatory variables
filtered_table = filtered_table.drop('SalePrice', 1)
housing_price_X = filtered_table.drop('SaleCondition', 1)
housing_price_X

# Apply Feature Scaling to the numeric variables and regression target 
scX = StandardScaler()
scY = StandardScaler()
housing_price_X = scX.fit_transform(housing_price_X)
housing_price_reg_Y = scY.fit_transform(housing_price_reg_Y)
housing_price_classif_Y = housing_price_classif_Y

housing_price_X

# Fits Algorithms to datasets

In [None]:
data_chunks = [100, 500, 1000, 5000, 10000, 50000, 100000, 500000,
1000000, 5000000, 10000000, 50000000, 100000000]

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def root_mean_square_error(y_actual, y_predicted):
    return sqrt(mean_squared_error(y_actual, y_predicted))

In [None]:
def model( X, y, dataset_name, algorithm): 
    
    algo_name = algorithm.__name__
    for chunk in data_chunks:
        
        # if chunk is greater than the no. of examples quite from the chunking
        if chunk > X.shape[0]: 
            chunk = X.shape[0]
        
        # generate the chunk file
        X = X[:, 0:chunk]
        y = y[:, 0:chunk]
        
        # fits algorithm to the dateset chunk
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        lm = algorithm()
        model = lm.fit(X_train, y_train)
        
        # do the prediction
        predictions = model.predict(X_test)
        # Validate the prediction using mean square error        
        error = root_mean_square_error(y_test, predictions)  # RMSE https://www.kaggle.com/wiki/RootMeanSquaredError
        
        # print the result, we will need to have method that genereates the result csv file required.
        print ("Data set: {}\nAlgorithm: {}\nChunk Size: {}\nRMSE: {}".format(dataset_name,
                                                                          algo_name,
                                                                          chunk,
                                                                           error
                                                                         ))
        if chunk == X.shape[0]:
            break
              
         
    

In [103]:
def kFoldModel (X, y, dataset_name, kfolds, algorithm=LinearRegression):
    
    algo_name = algorithm.__name__
    
    kf = KFold(n_splits=kfolds)
    
    for train, test in kf.split(X, y):
        model(X,y, dataset_name, algorithm)

## Fits Regression Algorithms to datasets

    - Linear Regression
    - Random Forest Regression
    

In [104]:
# kFoldModel(sumdata_noise_X, sumdata_noise_reg_Y, "The Sum Dataset(with noise)", 10, LinearRegression)
# kFoldModel(sumdata_X, sumdata_reg_Y, "The Sum Dataset(without noise)", 10,  LinearRegression )
kFoldModel(housing_price_X, housing_price_reg_Y, "Housing Dataset", 10, LinearRegression )

housing_price_X.shape
# kFoldModel(sumdata_noise_X, sumdata_noise_reg_Y,"The Sum Dataset(with noise)",10,  RandomForestRegressor )
# kFoldModel(sumdata_X, sumdata_reg_Y, "The Sum Dataset(with noise)",10,  RandomForestRegressor )
# kFoldModel(housing_price_X, housing_price_reg_Y, "The Sum Dataset(with noise)",10, RandomForestRegressor )

Data set: Housing Dataset
Algorithm: LinearRegression
Chunk Size: 100
RMSE: 0.5199576276043364
Data set: Housing Dataset
Algorithm: LinearRegression
Chunk Size: 500
RMSE: 0.43896260638927687
Data set: Housing Dataset
Algorithm: LinearRegression
Chunk Size: 1000
RMSE: 0.7452794230502054
Data set: Housing Dataset
Algorithm: LinearRegression
Chunk Size: 100
RMSE: 0.43294755438782156
Data set: Housing Dataset
Algorithm: LinearRegression
Chunk Size: 500
RMSE: 0.5575951248008337
Data set: Housing Dataset
Algorithm: LinearRegression
Chunk Size: 1000
RMSE: 0.39681776175065553
Data set: Housing Dataset
Algorithm: LinearRegression
Chunk Size: 100
RMSE: 0.5624406968139155
Data set: Housing Dataset
Algorithm: LinearRegression
Chunk Size: 500
RMSE: 0.5676502554628778
Data set: Housing Dataset
Algorithm: LinearRegression
Chunk Size: 1000
RMSE: 0.49907245251138277
Data set: Housing Dataset
Algorithm: LinearRegression
Chunk Size: 100
RMSE: 0.3929855793545279
Data set: Housing Dataset
Algorithm: Linear

(1460, 37)