In [1]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import pandas as pd
from helper import *
from sklearn.preprocessing import LabelEncoder
pd.options.display.max_columns = None

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier

# MAKE SURE THE DATASETS ARE CORRECT

 ![caption](files/requirements.png)

## Import DataSet

In [2]:
sumdata_url = "https://www.dropbox.com/sh/euppz607r6gsen2/AABABUTdx7YqCeBquA1Ky7z8a/The%20SUM%20dataset?dl=1#"
census_url  = "https://www.dropbox.com/sh/euppz607r6gsen2/AADfweTUjMlkayMQ6JvKUOGia/Census-Income%20(KDD)%20Data%20Set?dl=1"
housing_url = "https://www.dropbox.com/sh/euppz607r6gsen2/AAD6JGlvG5XADIjg9SCojvpya/House%20Sales%20in%20King%20County%2C%20USA?dl=1&preview=kc_house_data.csv"
all_urls = [census_url,sumdata_url,housing_url]

In [3]:
get_data(all_urls) # retrieves the data if there is NO data folder

In [4]:
sumdata_noise_path = "data/with noise/The SUM dataset, with noise.csv"
sumdata_path = "data/without noise/The SUM dataset, without noise.csv" 
housing_sales_path ="data/kc_house_data.csv"

## Load datasets sum_noise

In [6]:
sumdata_noise = pd.read_csv(sumdata_noise_path, delimiter=";")

# Remove 'Instance' as it simply represents the row number
sumdata_noise = sumdata_noise.drop('Instance', axis = 1)
sumdata_noise.head(n=2)

Unnamed: 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5 (meaningless but please still use it),Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Noisy Target,Noisy Target Class
0,62485,58472,84200,86181,75529,136939,150633,230058,246491,257336,1352179,Very Large Number
1,75559,119137,146760,139674,19582,177083,217746,321110,434444,516798,1976446,Very Large Number


## Preprocess sum_noise dataset

- Remove 'Instance' as it simply represents the row number
- Extract 'Nosiy Target' as regression target
- Extract 'Nosiy Class' as classification target
- Extract rest columns as explananatory variables
- Apply Feature Scaling to the dataset 

- Ensure all dataframe has been converted to numpy array


In [8]:
# Use 'Nosiy Target' as regression target
sumdata_noise_reg_Y = sumdata_noise['Noisy Target'].values.reshape(-1, 1)

# Use 'Nosiy Target Class' Large Number as regression target
sumdata_noise_classif_Y = pd.get_dummies(sumdata_noise['Noisy Target Class']).iloc[:, 0]
sumdata_noise_classif_Y = sumdata_noise_classif_Y.values.astype(int).reshape(-1,1)

# Use rest columns as explananatory variables
# We can simply use the same features for both as Noisy Target and Noisy Target Class are representing the samething
sumdata_noise_reg_X = sumdata_noise.iloc[:, 0:-2].values
sumdata_noise_classif_X = sumdata_noise.iloc[:, 0:-2].values

# Apply Feature Scaling for the classification variable
# As we are using KNN 
from sklearn.preprocessing import StandardScaler
scX = StandardScaler()
sumdata_noise_classif_X = scX.fit_transform(sumdata_noise_classif_X)




## Load datasets sumdata

In [10]:
sumdata = pd.read_csv(sumdata_path, delimiter=";")

# Remove 'Instance' as it simply represents the row number
sumdata = sumdata.drop('Instance', axis = 1)
sumdata.head(n=2)

Unnamed: 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5 (meaningless but please still use it),Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Target,Target Class
0,57326,68791,82549,99059,72624,142645,171174,205409,246491,295789,1369233,Very Large Number
1,87859,105431,126517,151820,19982,218621,262345,314814,377777,453332,2098516,Very Large Number


## Preprocess sumdata dataset

- Remove 'Instance' as it simply represents the row number
- Extract 'Nosiy Target' as regression target
- Extract 'Nosiy Class' as classification target
- Extract rest of the columns as explananatory variables
- Apply Feature Scaling to the dataset 

- Ensure all dataframe has been converted to numpy array


In [11]:
# Use 'Nosiy Target' as regression target
sumdata_reg_Y = sumdata['Target'].values.reshape(-1, 1)

# Use 'Nosiy Target Class' Large Number as classification target
sumdata_classif_Y = pd.get_dummies(sumdata['Target Class']).iloc[:, 0]
sumdata_classif_Y = sumdata_classif_Y.values.astype(int).reshape(-1,1)

# Use rest columns as explananatory variables
# We can simply use the same features for both as Target and Target Class are representing the same thing
sumdata_classif_X = sumdata.iloc[:, 0:-2].values
sumdata_reg_X = sumdata.iloc[:, 0:-2].values

# Apply Feature Scaling for the classification variable
# As we are using KNN 
from sklearn.preprocessing import StandardScaler
scX = StandardScaler()
scY = StandardScaler()
sumdata_classif_X = scX.fit_transform(sumdata_classif_X)





array([[0],
       [0],
       [0],
       ..., 
       [0],
       [0],
       [1]])

 ## Load census dataset

In [9]:
census = get_census_dataset()
 

census.head(n=10)

Unnamed: 0,age,class of worker,industry code,occupation code,adjusted gross income,education,wage per hour,enrolled in edu inst last wk,marital status,major industry code,major occupation code,mace,hispanic Origin,sex,member of a labor union,reason for unemployment,full or part time employment stat,capital gains,capital losses,divdends from stocks,federal income tax liability,tax filer status,region of previous residence,state of previous residence,detailed household and family stat,detailed household summary in household,instance weight,migration code-change in msa,migration code-change in reg,migration code-move within reg,live in this house 1 year ago,migration prev res in sunbelt,num persons worked for employer,family members under 18,total person earnings,country of birth father,country of birth mother,country of birth self,citizenship,total person income,own business or self employed,taxable income amount,fill inc questionnaire for veteran's admin,veterans benefits,weeks worked in year
0,73,Not in universe,0,0,High school graduate,0,Not in universe,Widowed,Not in universe or children,Not in universe,White,All other,Female,Not in universe,Not in universe,Not in labor force,0,0,0,Nonfiler,Not in universe,Not in universe,Other Rel 18+ ever marr not in subfamily,Other relative of householder,1700.09,?,?,?,Not in universe under 1 year old,?,0,Not in universe,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,0,95,- 50000.,,,
1,58,Self-employed-not incorporated,4,34,Some college but no degree,0,Not in universe,Divorced,Construction,Precision production craft & repair,White,All other,Male,Not in universe,Not in universe,Children or Armed Forces,0,0,0,Head of household,South,Arkansas,Householder,Householder,1053.55,MSA to MSA,Same county,Same county,No,Yes,1,Not in universe,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,- 50000.,,,
2,18,Not in universe,0,0,10th grade,0,High school,Never married,Not in universe or children,Not in universe,Asian or Pacific Islander,All other,Female,Not in universe,Not in universe,Not in labor force,0,0,0,Nonfiler,Not in universe,Not in universe,Child 18+ never marr Not in a subfamily,Child 18 or older,991.95,?,?,?,Not in universe under 1 year old,?,0,Not in universe,Vietnam,Vietnam,Vietnam,Foreign born- Not a citizen of U S,0,Not in universe,2,0,95,- 50000.,,,
3,9,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,White,All other,Female,Not in universe,Not in universe,Children or Armed Forces,0,0,0,Nonfiler,Not in universe,Not in universe,Child <18 never marr not in subfamily,Child under 18 never married,1758.14,Nonmover,Nonmover,Nonmover,Yes,Not in universe,0,Both parents present,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.,,,
4,10,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,White,All other,Female,Not in universe,Not in universe,Children or Armed Forces,0,0,0,Nonfiler,Not in universe,Not in universe,Child <18 never marr not in subfamily,Child under 18 never married,1069.16,Nonmover,Nonmover,Nonmover,Yes,Not in universe,0,Both parents present,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.,,,
5,48,Private,40,10,Some college but no degree,1200,Not in universe,Married-civilian spouse present,Entertainment,Professional specialty,Amer Indian Aleut or Eskimo,All other,Female,No,Not in universe,Full-time schedules,0,0,0,Joint both under 65,Not in universe,Not in universe,Spouse of householder,Spouse of householder,162.61,?,?,?,Not in universe under 1 year old,?,1,Not in universe,Philippines,United-States,United-States,Native- Born in the United States,2,Not in universe,2,52,95,- 50000.,,,
6,42,Private,34,3,Bachelors degree(BA AB BS),0,Not in universe,Married-civilian spouse present,Finance insurance and real estate,Executive admin and managerial,White,All other,Male,Not in universe,Not in universe,Children or Armed Forces,5178,0,0,Joint both under 65,Not in universe,Not in universe,Householder,Householder,1535.86,Nonmover,Nonmover,Nonmover,Yes,Not in universe,6,Not in universe,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,- 50000.,,,
7,28,Private,4,40,High school graduate,0,Not in universe,Never married,Construction,Handlers equip cleaners etc,White,All other,Female,Not in universe,Job loser - on layoff,Unemployed full-time,0,0,0,Single,Not in universe,Not in universe,Secondary individual,Nonrelative of householder,898.83,?,?,?,Not in universe under 1 year old,?,4,Not in universe,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,30,95,- 50000.,,,
8,47,Local government,43,26,Some college but no degree,876,Not in universe,Married-civilian spouse present,Education,Adm support including clerical,White,All other,Female,No,Not in universe,Full-time schedules,0,0,0,Joint both under 65,Not in universe,Not in universe,Spouse of householder,Spouse of householder,1661.53,?,?,?,Not in universe under 1 year old,?,5,Not in universe,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,95,- 50000.,,,
9,34,Private,4,37,Some college but no degree,0,Not in universe,Married-civilian spouse present,Construction,Machine operators assmblrs & inspctrs,White,All other,Male,Not in universe,Not in universe,Children or Armed Forces,0,0,0,Joint both under 65,Not in universe,Not in universe,Householder,Householder,1146.79,Nonmover,Nonmover,Nonmover,Yes,Not in universe,6,Not in universe,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,- 50000.,,,


## Preprocess censu dataset

- Remove 'Id' as it simply represents the row number
- Use 'SalePrice' as regression target
- Use 'SaleCondition' as classification target
- For explananatory variables, use the following numerical variable and categorical variables
    - Numerical
        - LotFrontage
        - OverallQual
        - OverallCond
        - YearBuilt
        - YearRemodAdd
        - TotalBsmtSF
        - GrLivArea
        - FullBath
        - TotRmsAbvGrd
        - GarageYrBlt
        - GarageCars
        - GarageArea
        - LotArea
        - 1stFlrSF
        - 2ndFlrSF
    - Categorical
        - YearSold
        - Neighborhood
- Apply Feature Scaling to the dataset 

- Ensure all dataframe has been converted to numpy array


In [54]:
## filter unused categorical columns and drop NaN, we only want to keep 'neighborhood' and 'housestyle' 
## in our explanantory vairbale

# get all the numerical data that is needed
explanatory_numeric = ['OverallQual', 'LotFrontage', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', 'GrLivArea',
                      'FullBath', 'TotRmsAbvGrd', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'LotArea', '1stFlrSF', '2ndFlrSF']

housing_price[explanatory_numeric[0]]
filtered_table = housing_price[explanatory_numeric[0]]
filtered_table

for c in explanatory_numeric[1:]:
    filtered_table = pd.concat([filtered_table, housing_price[c]], axis=1)
filtered_table

# get the two categorical variable that we wants to use, convert it to dummy variable and avoid dummy variable trap
filtered_table = pd.concat([filtered_table, pd.get_dummies(housing_price['Neighborhood']).iloc[:, :-1]], axis=1)
filtered_table = pd.concat([filtered_table, pd.get_dummies(housing_price['YrSold']).iloc[:, :-1]], axis=1)

# drop NaN
filtered_table = filtered_table.dropna()
filtered_table 

# get explanatory variable
housing_price_reg_X = filtered_table[:].values
housing_price_classif_X = filtered_table[:].values

# get the regression target and classification target
# I am using sales price as both regression and classification target
# sale_price_classi = SalePrice<Mean, SalePrice>=Mean
housing_price_reg_Y = housing_price['SalePrice'].values
housing_price_classif_Y = np.zeros(housing_price_reg_Y.shape)
house_price_mean = housing_price_reg_Y.mean()

for i, price in enumerate(housing_price_reg_Y):
    if price >= house_price_mean:
        housing_price_classi_Y[i] = 1
    else:
        housing_price_classi_Y[i] = 0

# Apply Feature Scaling to the numeric variables and regression target 
scX = StandardScaler()
scY = StandardScaler()
housing_price_classif_X = scX.fit_transform(housing_price_classif_X)
housing_price_classif_Y = scY.fit_transform(housing_price_classif_Y.reshape(-1, 1))




# Fits Algorithms to datasets

In [57]:
data_chunks = [100, 500, 1000, 5000, 10000, 50000, 100000, 500000,
1000000, 5000000, 10000000, 50000000, 100000000]

In [58]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import precision_score
from math import sqrt
def root_mean_square_error(y_actual, y_predicted):
    return sqrt(mean_squared_error(y_actual, y_predicted))

In [59]:
def model( X, y, dataset_name, algorithm, isReg): 
    
    print ("Algorithm: {}\nDataset: {}\n".format( algorithm.__name__, dataset_name))
    for chunk in data_chunks:
        
        # if chunk is greater than the no. of examples quite from the chunking
        if chunk > X.shape[0]: 
            chunk = X.shape[0]
        
        print ("Chunk Size: {}\n".format(chunk))
        
        # generate the chunk file
        current_X = X[0:chunk]
        current_y = y[0:chunk]
        
        kFoldModelling(current_X, current_y, 10, algorithm, isReg)
        
        if chunk == X.shape[0]:
            break
              
         
    

In [60]:
def kFoldModelling (X, y, kfolds, algorithm, isReg):
    
    kf = KFold(n_splits=kfolds)
    rmse = np.zeros((10,1))
    mae = np.zeros((10,1))
    accuracy = np.zeros((10,1))
    precision = np.zeros((10,1))
    
    for i, (train_index, test_index) in enumerate(kf.split(X)):

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # fit the model to the datset
        lm = algorithm()
        lm.fit(X_train, y_train)
        
        if isReg:     
            rmse[i] = root_mean_square_error(y_test, lm.predict(X_test))  # RMSE https://www.kaggle.com/wiki/RootMeanSquaredError
            mae[i] = mean_absolute_error(y_test, lm.predict(X_test))
        else:
            from sklearn.metrics import accuracy_score
            accuracy[i] = accuracy_score(y_test, lm.predict(X_test))
            precision[i] = precision_score(y_test, lm.predict(X_test))
        # print the result, we will need to have method that genereates the result csv file required.
    
    if isReg:     
        print ("Iteration: {}\nRMSE: {}\nMAE: {}\n".format( i, rmse.mean(), mae.mean()))
    else:
        print ("Iteration: {}\Accuracy: {}\nPrecision: {}\n".format( i, accuracy.mean(), precision.mean()))
    

## Fits Regression Algorithms to datasets

    - Linear Regression
    - Random Forest Regression
    

In [62]:
# model(sumdata_noise_reg_X, sumdata_noise_reg_Y, "The Sum Dataset(with noise)", LinearRegression, True)
# model(sumdata_reg_X, sumdata_reg_Y, "The Sum Dataset(without noise)",  LinearRegression, True)
# model(housing_price_reg_X, housing_price_reg_Y, "Housing Dataset",  LinearRegression, True)
# model(titanic_regression_X, titanic_regression_Y, "Titanic Dataset", LinearRegression, True)

# model(sumdata_noise_reg_X, sumdata_noise_reg_Y,"The Sum Dataset(with noise)",  RandomForestRegressor, True)
# model(sumdata_reg_X, sumdata_reg_Y, "The Sum Dataset(without noise)", RandomForestRegressor, True)
# model(housing_price_reg_X, housing_price_reg_Y, "Housing Dataset", RandomForestRegressor, True)
# model(titanic_regression_X, titanic_regression_Y, "Titanic Dataset", RandomForestRegressor, True)

from sklearn.linear_model import LogisticRegression
# model(sumdata_noise_classif_X, sumdata_noise_classif_Y, "The Sum Dataset(with noise)", LogisticRegression, False)
# model(sumdata_classif_X, sumdata_classif_Y, "The Sum Dataset(without noise)", LogisticRegression, False)
# model(housing_price_classif_X, housing_price_classif_Y, "Housing Dataset", LogisticRegression, False)
housing_price_classif_Y
# model(titanic_classification_X, titanic_classification_y, "Titanic Dataset", LogisticRegression, False)

# from sklearn.svm import SVC
# model(sumdata_noise_classi_X, sumdata_noise_classif_Y, "The Sum Dataset(with noise)", SVC, False)
# model(sumdata_classi_X, sumdata_classif_Y, "The Sum Dataset(without noise)", SVC, False)
# model(housing_price_classi_X, housing_price_classif_Y, "Housing Dataset", SVC, False)
# model(titanic_classification_X, titanic_classification_y, "Titanic Dataset", SVC, False)


array([[ 0.],
       [ 0.],
       [ 0.],
       ..., 
       [ 0.],
       [ 0.],
       [ 0.]])