In [2]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import pandas as pd
from helper import get_news_dataset,get_data, create_classes

pd.options.display.max_columns = None

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import LogisticRegression

# Preprocessing libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import precision_score
from math import sqrt

# MAKE SURE THE DATASETS ARE CORRECT

 ![caption](files/requirements.png)

## Import DataSet

In [3]:
# Dropbox download links for the three raw datasets, in the order
# SUM dataset, Online News Popularity, King County house sales.
all_urls = [
    "https://www.dropbox.com/sh/euppz607r6gsen2/AABABUTdx7YqCeBquA1Ky7z8a/The%20SUM%20dataset?dl=1#",
    "https://www.dropbox.com/sh/euppz607r6gsen2/AACq4aMWDOIw2I_SSGqJ-r2Oa/Online%20News%20Popularity%20(Mashable%20News)?dl=1",
    "https://www.dropbox.com/sh/euppz607r6gsen2/AAD6JGlvG5XADIjg9SCojvpya/House%20Sales%20in%20King%20County%2C%20USA?dl=1&preview=kc_house_data.csv",
]

# Individual names are kept so each link can still be referenced on its own.
sumdata_url, news_url, housing_url = all_urls

In [4]:
# Fetch the raw datasets from the URLs above. Per the original note, the data is
# only retrieved when there is no data folder yet (get_data is defined in
# helper.py, which is not shown here -- confirm the exact behaviour there).
get_data(all_urls)

In [5]:
# Relative locations of the CSV files that the loading cells below read.
sumdata_noise_path = "data/with noise/The SUM dataset, with noise.csv"  # SUM dataset, noisy target
sumdata_path = "data/without noise/The SUM dataset, without noise.csv"  # SUM dataset, clean target
housing_price_path = "data/kc_house_data.csv"                           # King County house sales

## Load datasets sum_noise

In [6]:
# The SUM dataset files use ';' as the field separator.
sumdata_noise = pd.read_csv(sumdata_noise_path, sep=";")
sumdata_noise.head(2)

Unnamed: 0,Instance,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5 (meaningless),Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Noisy Target,Noisy Target Class
0,1,66957,74432,96087,103120,64272,150633,181787,180349,216912,304071,1434819,Very Large Number
1,2,96030,86875,108299,148025,16965,253819,258672,268851,404599,543092,2148748,Very Large Number


## Preprocess sum_noise dataset

- Remove 'Instance', as it simply represents the row number
- Extract 'Noisy Target' as the regression target
- Extract 'Noisy Target Class' as the classification target
- Extract the remaining columns as explanatory variables
- Apply feature scaling to the dataset

- Ensure every dataframe has been converted to a numpy array


In [7]:
# Classification target: integer-encode the 'Noisy Target Class' labels.
le = LabelEncoder()
sumdata_noise_classif_Y = le.fit_transform(sumdata_noise['Noisy Target Class'])

# Explanatory variables: drop the first column ('Instance', just a row number)
# and the last two columns (the target and the target class), then standardise.
scX = StandardScaler()
sumdata_noise_X = scX.fit_transform(sumdata_noise.iloc[:, 1:-2].values)

# Regression target: 'Noisy Target' reshaped to a single column, standardised
# with its own scaler so the feature scaler stays fitted on the features.
scY = StandardScaler()
sumdata_noise_reg_Y = scY.fit_transform(
    sumdata_noise['Noisy Target'].values.reshape(-1, 1)
)




## Load datasets sumdata

In [8]:
# Same ';'-separated layout as the noisy variant of the SUM dataset.
sumdata = pd.read_csv(sumdata_path, sep=";")
sumdata.head(2)

Unnamed: 0,Instance,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5 (meaningless),Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Target,Target Class
0,1,57326,68791,82549,99059,72624,142645,171174,205409,246491,295789,1073444,Very Large Number
1,2,87859,105431,126517,151820,19982,218621,262345,314814,377777,453332,1645184,Very Large Number


## Preprocess sumdata dataset

- Remove 'Instance', as it simply represents the row number
- Extract 'Target' as the regression target
- Extract 'Target Class' as the classification target
- Extract the rest of the columns as explanatory variables
- Apply feature scaling to the dataset

- Ensure every dataframe has been converted to a numpy array


In [9]:
# Use 'Target' as the regression target (reshaped to a single column, as
# StandardScaler expects 2-D input).
sumdata_reg_Y = sumdata['Target'].values.reshape(-1, 1)

# Use 'Target Class' as the classification target, integer-encoded.
le = LabelEncoder()
sumdata_classif_Y = le.fit_transform(sumdata['Target Class'])

# Use the rest of the columns as explanatory variables: the first column is
# 'Instance' (a row number) and the last two are the target and target class.
sumdata_X = sumdata.iloc[:, 1:-2].values

scX = StandardScaler()  # standardises the features
scY = StandardScaler()  # standardises the regression target
sumdata_X = scX.fit_transform(sumdata_X)
# Fit the target with scY, not scX: refitting scX here would overwrite the
# feature statistics and leave scY unfitted (unusable for inverse_transform).
sumdata_reg_Y = scY.fit_transform(sumdata_reg_Y)






 ## Load House Price dataset

In [10]:
# The King County house sales file is a regular comma-separated CSV.
housing_price = pd.read_csv(housing_price_path, sep=",")
housing_price.head(10)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
5,7237550310,20140512T000000,1225000.0,4,4.5,5420,101930,1.0,0,0,3,11,3890,1530,2001,0,98053,47.6561,-122.005,4760,101930
6,1321400060,20140627T000000,257500.0,3,2.25,1715,6819,2.0,0,0,3,7,1715,0,1995,0,98003,47.3097,-122.327,2238,6819
7,2008000270,20150115T000000,291850.0,3,1.5,1060,9711,1.0,0,0,3,7,1060,0,1963,0,98198,47.4095,-122.315,1650,9711
8,2414600126,20150415T000000,229500.0,3,1.0,1780,7470,1.0,0,0,3,7,1050,730,1960,0,98146,47.5123,-122.337,1780,8113
9,3793500160,20150312T000000,323000.0,3,2.5,1890,6560,2.0,0,0,3,7,1890,0,2003,0,98038,47.3684,-122.031,2390,7570


## Preprocess housing price dataset

- Remove 'id', as it is only a record identifier
- Use 'price' as the regression target
- Use a binary label (price above the mean price or not) as the classification target
- For explanatory variables, use the following numerical variables
    - Numerical
        - bedrooms
        - bathrooms
        - sqft_living
        - sqft_lot
        - condition
        - sqft_above
        - yr_built
        - sqft_living15
        - sqft_lot15
- Apply Feature Scaling to the dataset 

- Ensure all dataframe has been converted to numpy array


In [15]:
# Features used to predict the price.
features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'condition',
            'sqft_above', 'yr_built', 'sqft_living15', 'sqft_lot15']

# Select every feature column in one indexing operation rather than growing
# the frame one column at a time with pd.concat.
filtered_table = housing_price[features]

# Explanatory variables as a numpy array.
housing_price_reg_X = filtered_table.values

# Classification target: 1 when the price is above the mean price, else 0.
# A single vectorised comparison replaces the per-row Python loop; the result
# stays a float array, like the np.zeros buffer it replaces.
house_price_mean = housing_price['price'].values.mean()
housing_price_classif_Y = (
    housing_price['price'].values > house_price_mean
).astype(float)

# Regression target: the price itself, as a single column.
housing_price_reg_Y = housing_price['price'].values.reshape(-1, 1)

# Apply feature scaling. The classification and regression models use the same
# standardised features, so the scaler is fitted once and the result is copied
# to keep the two arrays independent objects.
scX = StandardScaler()
scY = StandardScaler()
housing_price_reg_X = scX.fit_transform(housing_price_reg_X)
housing_price_classif_X = housing_price_reg_X.copy()
housing_price_reg_Y = scY.fit_transform(housing_price_reg_Y)
housing_price_reg_X

array([[-0.39873715, -1.44746357, -0.97983502, ..., -0.54489777,
        -0.9433552 , -0.26071541],
       [-0.39873715,  0.1756067 ,  0.53363434, ..., -0.6810785 ,
        -0.43268619, -0.18786773],
       [-1.47395936, -1.44746357, -1.42625404, ..., -1.29389179,
         1.07013975, -0.17237524],
       ..., 
       [-1.47395936, -1.77207762, -1.15404732, ...,  1.29354209,
        -1.41025258, -0.39414129],
       [-0.39873715,  0.50022075, -0.52252773, ...,  1.12331618,
        -0.8412214 , -0.42051149],
       [-1.47395936, -1.77207762, -1.15404732, ...,  1.25949691,
        -1.41025258, -0.41794772]])

## News dataset

In [16]:
# get_news_dataset comes from helper.py (not shown here); the result is used
# as a pandas DataFrame in the cells below.
news = get_news_dataset()
news.head(1)

Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,num_keywords,data_channel_is_lifestyle,data_channel_is_entertainment,data_channel_is_bus,data_channel_is_socmed,data_channel_is_tech,data_channel_is_world,kw_min_min,kw_max_min,kw_avg_min,kw_min_max,kw_max_max,kw_avg_max,kw_min_avg,kw_max_avg,kw_avg_avg,self_reference_min_shares,self_reference_max_shares,self_reference_avg_sharess,weekday_is_monday,weekday_is_tuesday,weekday_is_wednesday,weekday_is_thursday,weekday_is_friday,weekday_is_saturday,weekday_is_sunday,is_weekend,LDA_00,LDA_01,LDA_02,LDA_03,LDA_04,global_subjectivity,global_sentiment_polarity,global_rate_positive_words,global_rate_negative_words,rate_positive_words,rate_negative_words,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
0,http://mashable.com/2013/01/07/amazon-instant-...,731.0,12.0,219.0,0.663594,1.0,0.815385,4.0,2.0,1.0,0.0,4.680365,5.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,496.0,496.0,496.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500331,0.378279,0.040005,0.041263,0.040123,0.521617,0.092562,0.045662,0.013699,0.769231,0.230769,0.378636,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875,593


## Preprocess news dataset
- The columns listed below are selected; the last one, ` shares`, is the regression target.

In [17]:
# Columns kept from the news dataset. Every name starts with a space; keep it,
# because these strings are used verbatim to index the `news` frame.
columns = [
    # token / content columns
    ' timedelta', ' n_tokens_title', ' n_tokens_content',
    ' n_unique_tokens', ' n_non_stop_words', ' n_non_stop_unique_tokens',
    # num_* columns, token length and keyword count
    ' num_hrefs', ' num_self_hrefs', ' num_imgs', ' num_videos',
    ' average_token_length', ' num_keywords',
    # data_channel_* columns
    ' data_channel_is_lifestyle', ' data_channel_is_entertainment',
    ' data_channel_is_bus', ' data_channel_is_socmed',
    ' data_channel_is_tech', ' data_channel_is_world',
    # kw_* columns
    ' kw_min_min', ' kw_max_min', ' kw_avg_min',
    ' kw_min_max', ' kw_max_max', ' kw_avg_max',
    ' kw_min_avg', ' kw_max_avg', ' kw_avg_avg',
    # self_reference_* columns
    ' self_reference_min_shares', ' self_reference_max_shares',
    ' self_reference_avg_sharess',
    # subjectivity / polarity columns
    ' global_subjectivity', ' global_rate_positive_words',
    ' global_rate_negative_words', ' rate_positive_words',
    ' rate_negative_words', ' avg_positive_polarity',
    ' min_positive_polarity', ' max_positive_polarity',
    # title columns
    ' title_subjectivity', ' abs_title_subjectivity',
    ' abs_title_sentiment_polarity',
    # target column (must stay last: later cells slice it off by position)
    ' shares',
]

#### Classes
- 0 - 10            class 1
- 10 - 100          class 2 
- 101 - 1000        class 3
- 1001 - 10,000     class 4
- 10,001 - 100,000  class 5
- 100,001 -         class 6

In [18]:
# Keep only the selected columns. `.copy()` makes `news` an independent frame,
# so adding the 'label' column below does not write into a slice of the
# original frame (which triggers pandas' SettingWithCopyWarning).
news = news[columns].copy()

# Derive the classification target, one label per row. create_classes is
# defined in helper.py (not shown here) -- presumably it buckets ' shares'
# into the classes listed above; confirm there.
news['label'] = news.apply(create_classes, axis=1)


In [19]:
# Regression target: ' shares' as a single column.
news_reg_Y = news[' shares'].values.reshape(-1, 1)

# Explanatory variables: every column except the last two (' shares' and
# 'label'), which are the target values.
news_reg_X = news.iloc[:, 0:-2].values

# Classification target. Per the original note, the labels are already encoded,
# so no LabelEncoder is needed. Converted to a numpy array so it is indexed
# positionally, like the classification targets of the other datasets.
news_classif_Y = news['label'].values

scX = StandardScaler()  # standardises the features
scY = StandardScaler()  # standardises the regression target
news_reg_X = scX.fit_transform(news_reg_X)
# Fit the target with scY, not scX: refitting scX here would overwrite the
# feature statistics and leave scY unfitted (unusable for inverse_transform).
news_reg_Y = scY.fit_transform(news_reg_Y)
 



# Fit algorithms to the datasets

In [20]:
# Training-set sizes to evaluate: 1x and 5x each power of ten from 10^2 to
# 10^7, followed by 10^8 (100, 500, 1000, ..., 50000000, 100000000).
data_chunks = [m * 10 ** e for e in range(2, 8) for m in (1, 5)] + [10 ** 8]

In [21]:

def root_mean_square_error(y_actual, y_predicted):
    """Return the square root of the mean squared error.

    Both arguments are passed straight to ``mean_squared_error`` as the
    actual and predicted values, in that order.
    """
    mse = mean_squared_error(y_actual, y_predicted)
    return sqrt(mse)

In [22]:
def model(X, y, dataset_name, algorithm, isReg, algo_params=None):
    """Evaluate ``algorithm`` on growing prefixes of a dataset.

    For every size in the module-level ``data_chunks`` that does not exceed
    the number of rows in ``X``, the first ``chunk`` rows of ``X`` and ``y``
    are passed to ``kFoldModelling`` with 10 folds.

    Parameters
    ----------
    X, y : array-like supporting ``shape`` (X) and slicing
        Explanatory variables and target.
    dataset_name : str
        Name shown in the printed header.
    algorithm : class
        Estimator class; it is instantiated inside ``kFoldModelling``.
    isReg : bool
        True to report regression metrics, False for classification metrics.
    algo_params : dict, optional
        Keyword arguments for the estimator constructor. Defaults to no
        arguments.

    NOTE(review): the outputs saved in this notebook end with a chunk equal
    to the full dataset size (21613 for the House dataset), which this loop
    never produces because that size is not in ``data_chunks`` -- confirm
    whether a final full-dataset run is wanted.
    """
    # Avoid a mutable default argument: create a fresh dict per call.
    if algo_params is None:
        algo_params = {}

    print ("Algorithm: {}\nDataset: {}\n".format( algorithm.__name__, dataset_name))
    for chunk in data_chunks:

        # if the chunk is greater than the no. of examples, quit the chunking
        if chunk > X.shape[0]:
            break

        print ("Chunk Size: {}\n".format(chunk))

        # take the first `chunk` rows as the current dataset
        current_X = X[0:chunk]
        current_y = y[0:chunk]

        kFoldModelling(current_X, current_y, 10, algorithm, isReg, algo_params)
        
         
         
    

In [23]:
def kFoldModelling (X, y, kfolds, algorithm, isReg, algo_params, random_state=None):
    """Cross-validate ``algorithm`` on (X, y) and print the mean fold metrics.

    Parameters
    ----------
    X, y : numpy arrays
        Explanatory variables and target; rows are selected with the integer
        index arrays produced by ``KFold.split``.
    kfolds : int
        Number of folds.
    algorithm : class
        Estimator class, instantiated once per fold as
        ``algorithm(**algo_params)``.
    isReg : bool
        True: report RMSE and MAE. False: report accuracy and weighted
        precision.
    algo_params : dict or None
        Keyword arguments for the estimator constructor.
    random_state : int or None, optional
        Seed forwarded to ``KFold`` so the shuffled split can be reproduced.
        The default (None) keeps the previous unseeded behaviour.

    Returns
    -------
    dict
        Mean metrics over the folds: keys ``rmse`` and ``mae`` for
        regression, ``accuracy`` and ``precision`` for classification.
    """
    # accuracy_score is not part of the notebook's top-level imports, so it is
    # imported locally, as in the original implementation.
    from sklearn.metrics import accuracy_score

    kf = KFold(n_splits=kfolds, shuffle=True, random_state=random_state)

    # One slot per fold. Sizing by `kfolds` (instead of a fixed 10) keeps the
    # means correct for fewer folds and avoids an IndexError for more.
    rmse = np.zeros((kfolds, 1))
    mae = np.zeros((kfolds, 1))
    accuracy = np.zeros((kfolds, 1))
    precision = np.zeros((kfolds, 1))

    for i, (train_index, test_index) in enumerate(kf.split(X)):

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # fit a fresh model to the training part of this fold
        lm = algorithm(**(algo_params or {}))
        lm.fit(X_train, y_train)

        # predict once and reuse the result for every metric
        y_pred = lm.predict(X_test)

        if isReg:
            rmse[i] = root_mean_square_error(y_test, y_pred)  # RMSE https://www.kaggle.com/wiki/RootMeanSquaredError
            mae[i] = mean_absolute_error(y_test, y_pred)
        else:
            accuracy[i] = accuracy_score(y_test, y_pred)
            precision[i] = precision_score(y_test, y_pred, average="weighted")

    # `i` is the index of the last fold at this point.
    if isReg:
        print ("Iteration: {}\nRMSE: {}\nMAE: {}\n".format( i, rmse.mean(), mae.mean()))
        return {"rmse": rmse.mean(), "mae": mae.mean()}

    # "\n" before "Accuracy" (previously "\A", which printed a literal backslash).
    print ("Iteration: {}\nAccuracy: {}\nPrecision: {}\n".format( i, accuracy.mean(), precision.mean()))
    return {"accuracy": accuracy.mean(), "precision": precision.mean()}
    

## Fit regression algorithms to the datasets

    - Linear Regression
    - Random Forest Regression
    

In [26]:
# Regression experiments. Only the House dataset / LinearRegression run is
# enabled; uncomment one of the lines below to run another combination.

# model(sumdata_noise_X, sumdata_noise_reg_Y, "The Sum Dataset(with noise)", LinearRegression, True)
# model(sumdata_X, sumdata_reg_Y, "The Sum Dataset(without noise)",  LinearRegression, True)
# model(news_reg_X, news_reg_Y, "News dataset", LinearRegression, True)
model(
    X=housing_price_reg_X,
    y=housing_price_reg_Y,
    dataset_name="House dataset",
    algorithm=LinearRegression,
    isReg=True,
)

# model(sumdata_noise_X, sumdata_noise_reg_Y,"The Sum Dataset(with noise)",  RandomForestRegressor, True)
# model(sumdata_X, sumdata_reg_Y, "The Sum Dataset(without noise)", RandomForestRegressor, True)
# model(news_reg_X, news_reg_Y, "News dataset", RandomForestRegressor, True)



Algorithm: LinearRegression
Dataset: House dataset

Chunk Size: 100

Iteration: 9
RMSE: 0.6647217581335125
MAE: 0.4907503364653518

Chunk Size: 500

Iteration: 9
RMSE: 0.633835735238472
MAE: 0.4250613211814353

Chunk Size: 1000

Iteration: 9
RMSE: 0.5966280460993664
MAE: 0.4068109395465417

Chunk Size: 5000

Iteration: 9
RMSE: 0.6894764906664024
MAE: 0.4402858540189084

Chunk Size: 10000

Iteration: 9
RMSE: 0.6680853786524494
MAE: 0.42950714804040435

Chunk Size: 21613

Iteration: 9
RMSE: 0.6588125051124283
MAE: 0.433497291354014



### Classification algorithms

In [25]:

# Classification experiments. Only the House dataset / LogisticRegression run
# is enabled. The disabled calls below were corrected to pair each target with
# the features of the same dataset and to use variable names that exist in
# this notebook.

# model(sumdata_noise_X, sumdata_noise_classif_Y, "The Sum Dataset(with noise)", LogisticRegression, False)
# model(sumdata_X, sumdata_classif_Y, "The Sum Dataset(without noise)", LogisticRegression, False)
# model(news_reg_X, news_classif_Y, "News dataset", LogisticRegression, False)
model(
    X=housing_price_classif_X,
    y=housing_price_classif_Y,
    dataset_name="House dataset",
    algorithm=LogisticRegression,
    isReg=False,
)

# from sklearn.svm import SVC
# model(sumdata_noise_X, sumdata_noise_classif_Y, "The Sum Dataset(with noise)", SVC, False)
# model(sumdata_X, sumdata_classif_Y, "The Sum Dataset(without noise)", SVC, False)
# model(housing_price_classif_X, housing_price_classif_Y, "Housing Dataset", SVC, False)


Algorithm: LogisticRegression
Dataset: House dataset

Chunk Size: 100

Iteration: 9\Accuracy: 0.8
Precision: 0.8118333333333332

Chunk Size: 500

Iteration: 9\Accuracy: 0.808
Precision: 0.8082546136760685

Chunk Size: 1000

Iteration: 9\Accuracy: 0.8229999999999998
Precision: 0.8236591843018337

Chunk Size: 5000

Iteration: 9\Accuracy: 0.8089999999999999
Precision: 0.8074454614646053

Chunk Size: 10000



  'precision', 'predicted', average, warn_for)


Iteration: 9\Accuracy: 0.8091000000000002
Precision: 0.8068040859293317

Chunk Size: 21613

Iteration: 9\Accuracy: 0.8047017154236592
Precision: 0.8025499633711558

