In [3]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
from helper import *

#### Data urls

In [4]:
sumdata_url = "https://www.dropbox.com/sh/euppz607r6gsen2/AABABUTdx7YqCeBquA1Ky7z8a/The%20SUM%20dataset?dl=1#"
housing_price_url = "https://www.dropbox.com/sh/euppz607r6gsen2/AAAVLZzU4E7ro0BiRzPG3pP8a/House%20Prices?dl=1"
all_urls = [sumdata_url, housing_price_url]

In [5]:
get_data(all_urls) # retrieves the data if there is no data folder

In [12]:
sumdata_noise_path = "data/with noise/The SUM dataset, with noise.csv"
sumdata_path = "data/without noise/The SUM dataset, without noise.csv"
housing_price_path = "data/housing dataset.csv" # has more than 30 features
# need one more
# what a brilliant idea to name files with space

In [13]:
data_chunks = [100, 500, 1000, 5000, 10000, 50000, 100000, 500000,
1000000, 5000000, 10000000, 50000000, 100000000]

## Load data sets sum_noise, sumdata, house_price

In [14]:
sumdata_noise = pd.read_csv(sumdata_noise_path, delimiter=";")

In [15]:
sumdata = pd.read_csv(sumdata_path, delimiter=";")

In [16]:
housing_price = pd.read_csv(housing_price_path)

#### Looking at sum data with noise

Dropping the noisy target class as its not numerical. Also feature 5 as its 'meaning less'

In [73]:
sumdata_noise = sumdata_noise.drop('Noisy Target Class', axis = 1)
sumdata_noise = sumdata_noise.drop('Feature 5 (meaningless)', axis = 1)

sum_n_data_y = sumdata_noise['Noisy Target']
sum_n_data_X = sumdata_noise.drop('Noisy Target', axis = 1) 

#### Looking at sum data without noise

In [74]:
sumdata = sumdata.drop('Target Class', axis = 1)
sumdata = sumdata.drop('Feature 5 (meaningless)', axis = 1)

sumdata_y = sumdata['Target']
sumdata_X = sumdata.drop('Target', axis = 1) 

#### House data

In [None]:
housedata 

# Didn't do cross validation

In [15]:
sumdata_noise.shape

(968135, 12)

In [89]:
def model( X, y,  data_set, algorithm=LinearRegression): 
    
    algo_name = algorithm.__name__
    
    for chunk in data_chunks:

        if chunk > sumdata_noise.shape[0]: # if chunk is greater than the no. of examples
            break

        X = X.head(n = chunk)
        y = y.head(n = chunk)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        lm = algorithm()
        model = lm.fit(X_train, y_train)
        predictions = lm.predict(X_test)

        #print(predictions)
        
        error = mean_squared_error(y_test, predictions)**0.5  # RMSE https://www.kaggle.com/wiki/RootMeanSquaredError
        print ("Data set: {} Algorithm: {}  Chunk Size: {} Error: {}".format(data_set,
                                                                          algo_name,
                                                                          chunk,
                                                                           error
                                                                         ))
              
         
    

In [90]:
model(sum_n_data_X, sum_n_data_y,"Noisy Sum", LinearRegression )

Data set: Noisy Sum Algorithm: LinearRegression  Chunk Size: 100 Error: 84705.66263898367
Data set: Noisy Sum Algorithm: LinearRegression  Chunk Size: 500 Error: 93989.71430173799
Data set: Noisy Sum Algorithm: LinearRegression  Chunk Size: 1000 Error: 85319.2616559062
Data set: Noisy Sum Algorithm: LinearRegression  Chunk Size: 5000 Error: 67642.42256245787
Data set: Noisy Sum Algorithm: LinearRegression  Chunk Size: 10000 Error: 102962.18215748943
Data set: Noisy Sum Algorithm: LinearRegression  Chunk Size: 50000 Error: 73623.02653977406
Data set: Noisy Sum Algorithm: LinearRegression  Chunk Size: 100000 Error: 70314.34490255873
Data set: Noisy Sum Algorithm: LinearRegression  Chunk Size: 500000 Error: 97231.64688961724


In [65]:
model(sumdata_X, sumdata_y,"Sum without noise", LinearRegression )

Data set: Sum without noise Algorithm: LinearRegression  Chunk Size: 100 Score: 1.0
Data set: Sum without noise Algorithm: LinearRegression  Chunk Size: 500 Score: 1.0
Data set: Sum without noise Algorithm: LinearRegression  Chunk Size: 1000 Score: 1.0
Data set: Sum without noise Algorithm: LinearRegression  Chunk Size: 5000 Score: 1.0
Data set: Sum without noise Algorithm: LinearRegression  Chunk Size: 10000 Score: 1.0
Data set: Sum without noise Algorithm: LinearRegression  Chunk Size: 50000 Score: 1.0
Data set: Sum without noise Algorithm: LinearRegression  Chunk Size: 100000 Score: 1.0
Data set: Sum without noise Algorithm: LinearRegression  Chunk Size: 500000 Score: 1.0
