In [1]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import pandas as pd
from helper import *
from sklearn.preprocessing import LabelEncoder
pd.options.display.max_columns = None

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier

## Import DataSet

In [2]:
sumdata_url = "https://www.dropbox.com/sh/euppz607r6gsen2/AABABUTdx7YqCeBquA1Ky7z8a/The%20SUM%20dataset?dl=1#"
housing_price_url = "https://www.dropbox.com/sh/euppz607r6gsen2/AAAVLZzU4E7ro0BiRzPG3pP8a/House%20Prices?dl=1"
titanic_data_url = "https://www.dropbox.com/sh/euppz607r6gsen2/AADfweTUjMlkayMQ6JvKUOGia/Wine%20Quality%20Ratings%20and%20Chemicals?dl=0"
all_urls = [sumdata_url, housing_price_url, titanic_data_url]

In [3]:
get_data(all_urls) # retrieves the data if there is no data folder

In [4]:
sumdata_noise_path = "data/with noise/The SUM dataset, with noise.csv"
sumdata_path = "data/without noise/The SUM dataset, without noise.csv"
housing_price_path = "data/housing dataset.csv" # has more than 30 features
titanic_path = "data/titanic dataset.csv"
# census_path = "data/adult.csv"
# need one more
# what a brilliant idea to name files with space

## Load datasets sum_noise

In [5]:
sumdata_noise = pd.read_csv(sumdata_noise_path, delimiter=";")

# Remove 'Instance' as it simply represents the row number
sumdata_noise = sumdata_noise.drop('Instance', axis = 1)
sumdata_noise.head(n=10)

Unnamed: 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5 (meaningless but please still use it),Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Noisy Target,Noisy Target Class
0,62485,58472,84200,86181,75529,136939,150633,230058,246491,257336,1352179,Very Large Number
1,75559,119137,146760,139674,19582,177083,217746,321110,434444,516798,1976446,Very Large Number
2,26568,30742,28693,36891,24065,60797,79331,84997,116275,129739,558391,Very Large Number
3,28982,26455,30320,36811,6298,51776,60652,92309,104381,102250,517918,Very Large Number
4,48819,64900,76502,66164,52350,142914,161493,174927,187274,283998,1255271,Very Large Number
5,28229,41920,53861,59145,61156,74633,100096,140345,160827,158400,792932,Very Large Number
6,13367,15027,18032,25529,1464,35011,47895,59491,50819,70421,322168,Large Number
7,32997,37556,39679,69952,34100,94806,95483,136521,144810,166752,875855,Very Large Number
8,37107,37107,47671,55948,33723,76945,92334,114712,140783,191464,746427,Very Large Number
9,20694,27936,42090,50509,105598,54067,86508,77857,93428,114783,550836,Very Large Number


## Preprocess sum_noise dataset

- Remove 'Instance' as it simply represents the row number
- Extract 'Nosiy Target' as regression target
- Extract 'Nosiy Class' as classification target
- Extract rest columns as explananatory variables
- Apply Feature Scaling to the dataset 

- Ensure all dataframe has been converted to numpy array


In [6]:
# Use 'Nosiy Target' as regression target
sumdata_noise_reg_Y = sumdata_noise['Noisy Target'].values.reshape(-1, 1)

# Use 'Nosiy Target Class' Large Number as regression target
sumdata_noise_classif_Y = pd.get_dummies(sumdata_noise['Noisy Target Class']).iloc[:, 0]
sumdata_noise_classif_Y = sumdata_noise_classif_Y.values.astype(int).reshape(-1,1)

# Use rest columns as explananatory variables
# We can simply use the same features for both as Noisy Target and Noisy Target Class are representing the samething
sumdata_noise_reg_X = sumdata_noise.iloc[:, 0:-2].values
sumdata_noise_classif_X = sumdata_noise.iloc[:, 0:-2].values

# Apply Feature Scaling for the classification variable
# As we are using KNN 
from sklearn.preprocessing import StandardScaler
scX = StandardScaler()
sumdata_noise_classif_X = scX.fit_transform(sumdata_noise_classif_X)
sumdata_noise_classif_Y



array([[0],
       [0],
       [0],
       ..., 
       [0],
       [0],
       [1]])

## Load datasets sumdata

In [7]:
sumdata = pd.read_csv(sumdata_path, delimiter=";")

# Remove 'Instance' as it simply represents the row number
sumdata = sumdata.drop('Instance', axis = 1)
sumdata.head(n=10)

Unnamed: 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5 (meaningless but please still use it),Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Target,Target Class
0,57326,68791,82549,99059,72624,142645,171174,205409,246491,295789,1369233,Very Large Number
1,87859,105431,126517,151820,19982,218621,262345,314814,377777,453332,2098516,Very Large Number
2,23721,28465,34158,40990,20054,59026,70831,84997,101996,122395,566579,Very Large Number
3,24771,29725,35670,42804,7775,61638,73966,88759,106511,127813,591657,Very Large Number
4,47862,57434,68921,82705,60872,119095,142914,171497,205796,246955,1143179,Very Large Number
5,35286,42343,50812,60974,51392,87803,105364,126437,151724,182069,842812,Very Large Number
6,14070,16884,20261,24313,1509,35011,42013,50416,60499,72599,336066,Large Number
7,34018,40822,48986,58783,38750,84648,101578,121894,146273,175528,812530,Very Large Number
8,36379,43655,52386,62863,29843,90523,108628,130354,156425,187710,868923,Very Large Number
9,25867,31040,37248,44698,92630,64366,77239,92687,111224,133469,617838,Very Large Number


## Preprocess sumdata dataset

- Remove 'Instance' as it simply represents the row number
- Extract 'Nosiy Target' as regression target
- Extract 'Nosiy Class' as classification target
- Extract rest of the columns as explananatory variables
- Apply Feature Scaling to the dataset 

- Ensure all dataframe has been converted to numpy array


In [8]:
# Use 'Nosiy Target' as regression target
sumdata_reg_Y = sumdata['Target'].values.reshape(-1, 1)

# Use 'Nosiy Target Class' Large Number as classification target
sumdata_classif_Y = pd.get_dummies(sumdata['Target Class']).iloc[:, 0]
sumdata_classif_Y = sumdata_classif_Y.values.astype(int).reshape(-1,1)

# Use rest columns as explananatory variables
# We can simply use the same features for both as Target and Target Class are representing the same thing
sumdata_classif_X = sumdata.iloc[:, 0:-2].values
sumdata_reg_X = sumdata.iloc[:, 0:-2].values

# Apply Feature Scaling for the classification variable
# As we are using KNN 
from sklearn.preprocessing import StandardScaler
scX = StandardScaler()
scY = StandardScaler()
sumdata_classif_X = scX.fit_transform(sumdata_classif_X)
sumdata_classif_Y




array([[0],
       [0],
       [0],
       ..., 
       [0],
       [0],
       [1]])

 ## Load House Price dataset

In [9]:
housing_price = pd.read_csv(housing_price_path)

# Remove 'Instance' as it simply represents the row number
housing_price = housing_price.drop('Id', axis = 1)
housing_price.head(n=10)

FileNotFoundError: File b'data/housing dataset.csv' does not exist

## Preprocess housing price dataset

- Remove 'Id' as it simply represents the row number
- Use 'SalePrice' as regression target
- Use 'SaleCondition' as classification target
- For explananatory variables, use the following numerical variable and categorical variables
    - Numerical
        - LotFrontage
        - OverallQual
        - OverallCond
        - YearBuilt
        - YearRemodAdd
        - TotalBsmtSF
        - GrLivArea
        - FullBath
        - TotRmsAbvGrd
        - GarageYrBlt
        - GarageCars
        - GarageArea
        - LotArea
        - 1stFlrSF
        - 2ndFlrSF
    - Categorical
        - YearSold
        - Neighborhood
- Apply Feature Scaling to the dataset 

- Ensure all dataframe has been converted to numpy array


In [10]:
## filter unused categorical columns and drop NaN, we only want to keep 'neighborhood' and 'housestyle' 
## in our explanantory vairbale

# get all the numerical data that is needed
explanatory_numeric = ['OverallQual', 'LotFrontage', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', 'GrLivArea',
                      'FullBath', 'TotRmsAbvGrd', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'LotArea', '1stFlrSF', '2ndFlrSF']

housing_price[explanatory_numeric[0]]
filtered_table = housing_price[explanatory_numeric[0]]
filtered_table

for c in explanatory_numeric[1:]:
    filtered_table = pd.concat([filtered_table, housing_price[c]], axis=1)
filtered_table

# get the two categorical variable that we wants to use, convert it to dummy variable and avoid dummy variable trap
filtered_table = pd.concat([filtered_table, pd.get_dummies(housing_price['Neighborhood']).iloc[:, :-1]], axis=1)
filtered_table = pd.concat([filtered_table, pd.get_dummies(housing_price['YrSold']).iloc[:, :-1]], axis=1)

# drop NaN
filtered_table = filtered_table.dropna()
filtered_table 

# get explanatory variable
housing_price_reg_X = filtered_table[:].values
housing_price_classif_X = filtered_table[:].values

# get the regression target and classification target
# I am using sales price as both regression and classification target
# sale_price_classi = SalePrice<Mean, SalePrice>=Mean
housing_price_reg_Y = housing_price['SalePrice'].values
housing_price_classif_Y = np.zeros(housing_price_reg_Y.shape)
house_price_mean = housing_price_reg_Y.mean()

for i, price in enumerate(housing_price_reg_Y):
    if price >= house_price_mean:
        housing_price_classi_Y[i] = 1
    else:
        housing_price_classi_Y[i] = 0

# Apply Feature Scaling to the numeric variables and regression target 
scX = StandardScaler()
scY = StandardScaler()
housing_price_classif_X = scX.fit_transform(housing_price_classif_X)
housing_price_classif_Y = scY.fit_transform(housing_price_classif_Y.reshape(-1, 1))




NameError: name 'housing_price' is not defined

## Load titanic dataset

In [55]:
titanic_dataset = pd.read_csv(titanic_path)

# Remove 'PassengerId' as it simply represents the row number
titanic_dataset = titanic_dataset.drop('PassengerId', axis = 1)
titanic_dataset = titanic_dataset.dropna()
titanic_dataset.head(n=10)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
6,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
10,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S
11,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S
21,1,2,"Beesley, Mr. Lawrence",male,34.0,0,0,248698,13.0,D56,S
23,1,1,"Sloper, Mr. William Thompson",male,28.0,0,0,113788,35.5,A6,S
27,0,1,"Fortune, Mr. Charles Alexander",male,19.0,3,2,19950,263.0,C23 C25 C27,S
52,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49.0,1,0,PC 17572,76.7292,D33,C
54,0,1,"Ostby, Mr. Engelhart Cornelius",male,65.0,0,1,113509,61.9792,B30,C



## Preprocess titanic dataset

- Use 'Fare' as regression target
    - explanatory variable
        - Age
        
- Use 'Survived' as classification target
    - explanatory variable
        - Sex, female get the chance to survive
        - Fare, rich people might get the chance to survive
        - Age, will be divided as >12 and (<12), child get the chance to survive
        
- Apply Feature Scaling to the dataset 

- Ensure all dataframe has been converted to numpy array


In [56]:
# Collect varaibles needed for regression
titanic_regression_X = titanic_dataset['Age'].values.reshape(-1, 1)
titanic_regression_Y = titanic_dataset['Fare'].values.reshape(-1,1)

scX = StandardScaler()
scY = StandardScaler()
titanic_regression_X = scX.fit_transform(titanic_regression_X)
titanic_regression_Y = scX.fit_transform(titanic_regression_Y)

# Collect varaibles needed for classification
titanic_classification_X = titanic_dataset['Fare']
titanic_classification_X = pd.concat([titanic_classification_X, pd.get_dummies(titanic_dataset['Sex']).iloc[:, :-1]], axis=1)
titanic_classification_X = pd.concat([titanic_classification_X, titanic_dataset['Age']], axis=1)
titanic_classification_y = titanic_dataset['Survived'].values.reshape(-1,1)

# Apply Feature Scaling to the explanatory variables for classification
scX = StandardScaler()
titanic_classification_X = scX.fit_transform(titanic_classification_X)
titanic_classification_X.shape


(183, 3)

# Fits Algorithms to datasets

In [57]:
data_chunks = [100, 500, 1000, 5000, 10000, 50000, 100000, 500000,
1000000, 5000000, 10000000, 50000000, 100000000]

In [58]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import precision_score
from math import sqrt
def root_mean_square_error(y_actual, y_predicted):
    return sqrt(mean_squared_error(y_actual, y_predicted))

In [59]:
def model( X, y, dataset_name, algorithm, isReg): 
    
    print ("Algorithm: {}\nDataset: {}\n".format( algorithm.__name__, dataset_name))
    for chunk in data_chunks:
        
        # if chunk is greater than the no. of examples quite from the chunking
        if chunk > X.shape[0]: 
            chunk = X.shape[0]
        
        print ("Chunk Size: {}\n".format(chunk))
        
        # generate the chunk file
        current_X = X[0:chunk]
        current_y = y[0:chunk]
        
        kFoldModelling(current_X, current_y, 10, algorithm, isReg)
        
        if chunk == X.shape[0]:
            break
              
         
    

In [60]:
def kFoldModelling (X, y, kfolds, algorithm, isReg):
    
    kf = KFold(n_splits=kfolds)
    rmse = np.zeros((10,1))
    mae = np.zeros((10,1))
    accuracy = np.zeros((10,1))
    precision = np.zeros((10,1))
    
    for i, (train_index, test_index) in enumerate(kf.split(X)):

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # fit the model to the datset
        lm = algorithm()
        lm.fit(X_train, y_train)
        
        if isReg:     
            rmse[i] = root_mean_square_error(y_test, lm.predict(X_test))  # RMSE https://www.kaggle.com/wiki/RootMeanSquaredError
            mae[i] = mean_absolute_error(y_test, lm.predict(X_test))
        else:
            from sklearn.metrics import accuracy_score
            accuracy[i] = accuracy_score(y_test, lm.predict(X_test))
            precision[i] = precision_score(y_test, lm.predict(X_test))
        # print the result, we will need to have method that genereates the result csv file required.
    
    if isReg:     
        print ("Iteration: {}\nRMSE: {}\nMAE: {}\n".format( i, rmse.mean(), mae.mean()))
    else:
        print ("Iteration: {}\Accuracy: {}\nPrecision: {}\n".format( i, accuracy.mean(), precision.mean()))
    

## Fits Regression Algorithms to datasets

    - Linear Regression
    - Random Forest Regression
    

In [62]:
# model(sumdata_noise_reg_X, sumdata_noise_reg_Y, "The Sum Dataset(with noise)", LinearRegression, True)
# model(sumdata_reg_X, sumdata_reg_Y, "The Sum Dataset(without noise)",  LinearRegression, True)
# model(housing_price_reg_X, housing_price_reg_Y, "Housing Dataset",  LinearRegression, True)
# model(titanic_regression_X, titanic_regression_Y, "Titanic Dataset", LinearRegression, True)

# model(sumdata_noise_reg_X, sumdata_noise_reg_Y,"The Sum Dataset(with noise)",  RandomForestRegressor, True)
# model(sumdata_reg_X, sumdata_reg_Y, "The Sum Dataset(without noise)", RandomForestRegressor, True)
# model(housing_price_reg_X, housing_price_reg_Y, "Housing Dataset", RandomForestRegressor, True)
# model(titanic_regression_X, titanic_regression_Y, "Titanic Dataset", RandomForestRegressor, True)

from sklearn.linear_model import LogisticRegression
# model(sumdata_noise_classif_X, sumdata_noise_classif_Y, "The Sum Dataset(with noise)", LogisticRegression, False)
# model(sumdata_classif_X, sumdata_classif_Y, "The Sum Dataset(without noise)", LogisticRegression, False)
# model(housing_price_classif_X, housing_price_classif_Y, "Housing Dataset", LogisticRegression, False)
housing_price_classif_Y
# model(titanic_classification_X, titanic_classification_y, "Titanic Dataset", LogisticRegression, False)

# from sklearn.svm import SVC
# model(sumdata_noise_classi_X, sumdata_noise_classif_Y, "The Sum Dataset(with noise)", SVC, False)
# model(sumdata_classi_X, sumdata_classif_Y, "The Sum Dataset(without noise)", SVC, False)
# model(housing_price_classi_X, housing_price_classif_Y, "Housing Dataset", SVC, False)
# model(titanic_classification_X, titanic_classification_y, "Titanic Dataset", SVC, False)


array([[ 0.],
       [ 0.],
       [ 0.],
       ..., 
       [ 0.],
       [ 0.],
       [ 0.]])