In [20]:
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
# metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
# Preprocessing libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from helper import get_news_dataset,get_data, create_classes

pd.options.display.max_columns = None

# MAKE SURE THE DATASETS ARE CORRECT

 ![caption](files/requirements.png)

## Import DataSet

In [2]:
# Dropbox download links for the three datasets. Each literal is split into
# shared-folder prefix / folder id / file part purely for readability; the
# resulting strings are unchanged.
sumdata_url = (
    "https://www.dropbox.com/sh/euppz607r6gsen2/"
    "AABABUTdx7YqCeBquA1Ky7z8a/"
    "The%20SUM%20dataset?dl=1#"
)
news_url = (
    "https://www.dropbox.com/sh/euppz607r6gsen2/"
    "AACq4aMWDOIw2I_SSGqJ-r2Oa/"
    "Online%20News%20Popularity%20(Mashable%20News)?dl=1"
)
housing_url = (
    "https://www.dropbox.com/sh/euppz607r6gsen2/"
    "AAD6JGlvG5XADIjg9SCojvpya/"
    "House%20Sales%20in%20King%20County%2C%20USA?dl=1&preview=kc_house_data.csv"
)

# Order matters only to the consumer of this list (get_data).
all_urls = [sumdata_url, news_url, housing_url]

In [3]:
# Fetch the datasets behind all_urls via the project helper.
# Per the original note, the helper retrieves the data only if there is NO
# data folder yet -- TODO confirm against helper.get_data.
get_data(all_urls)

In [4]:
# Relative paths of the CSV files read by the cells below.
# NOTE(review): presumably these files are created by get_data -- confirm.
sumdata_noise_path = "data/with noise/The SUM dataset, with noise.csv"
sumdata_path = "data/without noise/The SUM dataset, without noise.csv"
# Not read by any cell visible in this notebook section.
housing_price_path ="data/kc_house_data.csv"

## Load datasets sum_noise

In [5]:
# The SUM CSV is semicolon-separated; show the first two rows as a sanity check.
sumdata_noise = pd.read_csv(sumdata_noise_path, sep=";")
sumdata_noise.head(2)

Unnamed: 0,Instance,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5 (meaningless but please still use it),Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Noisy Target,Noisy Target Class
0,1,62485,58472,84200,86181,75529,136939,150633,230058,246491,257336,1352179,Very Large Number
1,2,75559,119137,146760,139674,19582,177083,217746,321110,434444,516798,1976446,Very Large Number


## Preprocess sum_noise dataset

- Remove 'Instance', as it simply represents the row number
- Extract 'Noisy Target' as the regression target
- Extract 'Noisy Target Class' as the classification target
- Extract the remaining columns as explanatory variables
- Apply feature scaling to the dataset

- Ensure every dataframe has been converted to a numpy array


In [10]:
# Use 'Noisy Target' as the regression target, reshaped to a single column
# because StandardScaler expects 2-D input.
sumdata_noise_reg_Y = sumdata_noise['Noisy Target'].values.reshape(-1, 1)

# Use 'Noisy Target Class' as the classification target (label-encoded).
le = LabelEncoder()
sumdata_noise_classif_Y = le.fit_transform(sumdata_noise['Noisy Target Class'])

# Use the remaining columns as explanatory variables: the first column is
# 'Instance' (just a row number) and the last two columns are the target and
# the target class.
sumdata_noise_X = sumdata_noise.iloc[:, 1:-2].values

scX = StandardScaler()  # standardises the features
scY = StandardScaler()  # standardises the regression target
sumdata_noise_X = scX.fit_transform(sumdata_noise_X)
sumdata_noise_reg_Y = scY.fit_transform(sumdata_noise_reg_Y)

# The modelling calls further down refer to the features as
# `sumdata_noise_reg_X` / `sumdata_noise_classif_X`; expose those names so the
# calls do not raise NameError. Regression and classification share features.
sumdata_noise_reg_X = sumdata_noise_X
sumdata_noise_classif_X = sumdata_noise_X




## Load datasets sumdata

In [11]:
# Same semicolon-separated layout as the noisy file; preview the first two rows.
sumdata = pd.read_csv(sumdata_path, sep=";")
sumdata.head(2)

Unnamed: 0,Instance,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5 (meaningless but please still use it),Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Target,Target Class
0,1,57326,68791,82549,99059,72624,142645,171174,205409,246491,295789,1369233,Very Large Number
1,2,87859,105431,126517,151820,19982,218621,262345,314814,377777,453332,2098516,Very Large Number


## Preprocess sumdata dataset

- Remove 'Instance', as it simply represents the row number
- Extract 'Target' as the regression target
- Extract 'Target Class' as the classification target
- Extract the rest of the columns as explanatory variables
- Apply feature scaling to the dataset

- Ensure every dataframe has been converted to a numpy array


In [12]:
# Use 'Target' as the regression target, reshaped to a single column because
# StandardScaler expects 2-D input.
sumdata_reg_Y = sumdata['Target'].values.reshape(-1, 1)

# Use 'Target Class' as the classification target (label-encoded).
le = LabelEncoder()
sumdata_classif_Y = le.fit_transform(sumdata['Target Class'])

# Use the remaining columns as explanatory variables: the first column is
# 'Instance' (just a row number) and the last two columns are the target and
# the target class.
sumdata_X = sumdata.iloc[:, 1:-2].values

scX = StandardScaler()  # standardises the features
scY = StandardScaler()  # standardises the regression target
sumdata_X = scX.fit_transform(sumdata_X)
# Fit the *target* scaler here. Re-using scX would refit the feature scaler on
# the target (so scX could no longer transform features) and leave scY unfitted.
sumdata_reg_Y = scY.fit_transform(sumdata_reg_Y)

# The modelling calls further down refer to the features as `sumdata_reg_X` /
# `sumdata_classif_X`; expose those names so the calls do not raise NameError.
# Regression and classification share the same features.
sumdata_reg_X = sumdata_X
sumdata_classif_X = sumdata_X






## News dataset

In [13]:
# Load the news frame through the project helper and preview a single row.
news = get_news_dataset()
news.head(1)

Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,num_keywords,data_channel_is_lifestyle,data_channel_is_entertainment,data_channel_is_bus,data_channel_is_socmed,data_channel_is_tech,data_channel_is_world,kw_min_min,kw_max_min,kw_avg_min,kw_min_max,kw_max_max,kw_avg_max,kw_min_avg,kw_max_avg,kw_avg_avg,self_reference_min_shares,self_reference_max_shares,self_reference_avg_sharess,weekday_is_monday,weekday_is_tuesday,weekday_is_wednesday,weekday_is_thursday,weekday_is_friday,weekday_is_saturday,weekday_is_sunday,is_weekend,LDA_00,LDA_01,LDA_02,LDA_03,LDA_04,global_subjectivity,global_sentiment_polarity,global_rate_positive_words,global_rate_negative_words,rate_positive_words,rate_negative_words,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
0,http://mashable.com/2013/01/07/amazon-instant-...,731.0,12.0,219.0,0.663594,1.0,0.815385,4.0,2.0,1.0,0.0,4.680365,5.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,496.0,496.0,496.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500331,0.378279,0.040005,0.041263,0.040123,0.521617,0.092562,0.045662,0.013699,0.769231,0.230769,0.378636,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875,593


## Preprocess news dataset
- The features listed below are the ones selected for modelling.

In [14]:
# Columns kept from the news frame. Every name starts with a single space and
# has to match the frame's column labels exactly. The final entry, ' shares',
# is the regression target.
columns = [
    ' timedelta',
    # token / content counts
    ' n_tokens_title',
    ' n_tokens_content',
    ' n_unique_tokens',
    ' n_non_stop_words',
    ' n_non_stop_unique_tokens',
    ' num_hrefs',
    ' num_self_hrefs',
    ' num_imgs',
    ' num_videos',
    ' average_token_length',
    ' num_keywords',
    # data_channel_* indicator columns
    ' data_channel_is_lifestyle',
    ' data_channel_is_entertainment',
    ' data_channel_is_bus',
    ' data_channel_is_socmed',
    ' data_channel_is_tech',
    ' data_channel_is_world',
    # kw_* columns
    ' kw_min_min',
    ' kw_max_min',
    ' kw_avg_min',
    ' kw_min_max',
    ' kw_max_max',
    ' kw_avg_max',
    ' kw_min_avg',
    ' kw_max_avg',
    ' kw_avg_avg',
    # self_reference_* columns
    ' self_reference_min_shares',
    ' self_reference_max_shares',
    ' self_reference_avg_sharess',
    # subjectivity / polarity columns
    ' global_subjectivity',
    ' global_rate_positive_words',
    ' global_rate_negative_words',
    ' rate_positive_words',
    ' rate_negative_words',
    ' avg_positive_polarity',
    ' min_positive_polarity',
    ' max_positive_polarity',
    ' title_subjectivity',
    ' abs_title_subjectivity',
    ' abs_title_sentiment_polarity',
    # target
    ' shares',
]

#### Classes
- 0 - 10            class 1
- 10 - 100          class 2 
- 101 - 1000        class 3
- 1001 - 10,000     class 4
- 10,001 - 100,000  class 5
- 100,001 -         class 6

In [17]:
# Keep only the selected columns. The explicit .copy() makes `news` an
# independent frame, so adding a column below cannot trigger pandas'
# SettingWithCopyWarning.
news = news[columns].copy()
# Derive the classification target: create_classes is applied to each row
# (axis=1) and its result is stored in the new 'label' column.
news['label'] = news.apply(create_classes, axis=1)


In [18]:
# Regression target: ' shares' as a single column (StandardScaler expects 2-D).
news_reg_Y = news[' shares'].values.reshape(-1, 1)

# Features: every column except the last two, which are the targets
# (' shares' and 'label').
news_reg_X = news.iloc[:, 0:-2].values
# The labels are used as produced by create_classes (no LabelEncoder); they are
# converted to a numpy array so positional indexing with fold indices is safe.
news_classif_Y = news['label'].values

scX = StandardScaler()  # standardises the features
scY = StandardScaler()  # standardises the regression target
news_reg_X = scX.fit_transform(news_reg_X)
# Fit the *target* scaler here. Re-using scX would refit the feature scaler on
# the target (so scX could no longer transform features) and leave scY unfitted.
news_reg_Y = scY.fit_transform(news_reg_Y)
 



# Fits Algorithms to datasets

In [70]:
# Training-set sizes to evaluate: 1x and 5x each power of ten from 10^2 up to
# 5 * 10^7, followed by 10^8 (100, 500, 1000, 5000, ..., 50000000, 100000000).
data_chunks = [
    multiplier * 10 ** exponent
    for exponent in range(2, 8)
    for multiplier in (1, 5)
] + [10 ** 8]

In [71]:

def root_mean_square_error(y_actual, y_predicted):
    """Return the square root of the mean squared error of the predictions.

    Both arguments are forwarded unchanged to ``mean_squared_error``.
    """
    mse = mean_squared_error(y_actual, y_predicted)
    return sqrt(mse)

In [72]:
def model(X, y, dataset_name, algorithm, isReg, kfolds=10, chunk_sizes=None):
    """Cross-validate ``algorithm`` on growing prefixes of a dataset.

    For each requested chunk size the first ``chunk`` rows of ``X`` and ``y``
    are passed to ``kFoldModelling``. A size larger than the dataset is clamped
    to the full dataset, and the loop stops once the full dataset has been
    evaluated.

    Parameters
    ----------
    X : array-like exposing ``.shape`` and slice indexing
        Explanatory variables.
    y : sliceable array-like
        Targets, aligned with ``X``.
    dataset_name : str
        Name used in the printed header.
    algorithm : class
        Estimator class (not an instance); its ``__name__`` is printed.
    isReg : bool
        Forwarded to ``kFoldModelling`` to select regression or
        classification metrics.
    kfolds : int, optional
        Number of cross-validation folds (default 10, the value that used to
        be hard-coded).
    chunk_sizes : iterable of int, optional
        Sizes to evaluate; defaults to the module-level ``data_chunks``.
    """
    sizes = data_chunks if chunk_sizes is None else chunk_sizes
    n_samples = X.shape[0]

    print ("Algorithm: {}\nDataset: {}\n".format( algorithm.__name__, dataset_name))
    for chunk in sizes:

        # If the chunk is larger than the number of examples, use the whole
        # dataset for this (final) round.
        chunk = min(chunk, n_samples)

        print ("Chunk Size: {}\n".format(chunk))

        # Take the first `chunk` rows as the working subset.
        current_X = X[0:chunk]
        current_y = y[0:chunk]

        kFoldModelling(current_X, current_y, kfolds, algorithm, isReg)

        # The full dataset has been evaluated; larger sizes would repeat it.
        if chunk == n_samples:
            break
              
         
    

In [73]:
def kFoldModelling (X, y, kfolds, algorithm, isReg, model_params=None, random_state=None):
    """Run shuffled k-fold cross-validation and print the mean fold metrics.

    A fresh ``algorithm(**model_params)`` estimator is fitted on every training
    split and scored on the matching test split.

    Parameters
    ----------
    X, y : array-like
        Features and targets; converted with ``np.asarray`` so the fold indices
        are always applied positionally (also for pandas objects).
    kfolds : int
        Number of folds.
    algorithm : class
        Estimator class providing ``fit`` and ``predict``.
    isReg : bool
        True -> report RMSE and MAE; False -> report accuracy and weighted
        precision.
    model_params : dict, optional
        Keyword arguments for the estimator constructor. When omitted,
        ``LogisticRegression`` keeps the multinomial/lbfgs settings this
        function always used and every other estimator is built with defaults.
    random_state : int or None, optional
        Seed for the fold shuffling; None (default) keeps the previous
        unseeded behaviour.

    Returns
    -------
    dict
        Mean metrics over the folds: keys ``rmse``/``mae`` for regression,
        ``accuracy``/``precision`` for classification.
    """
    if model_params is None:
        if algorithm is LogisticRegression:
            # NOTE(review): `multi_class` is deprecated in recent scikit-learn
            # releases -- confirm against the installed version.
            model_params = {"multi_class": "multinomial", "solver": "lbfgs"}
        else:
            # These kwargs are specific to logistic regression; passing them to
            # e.g. LinearRegression or RandomForestRegressor raises TypeError.
            model_params = {}

    X = np.asarray(X)
    y = np.asarray(y)

    kf = KFold(n_splits=kfolds, shuffle=True, random_state=random_state)
    # One slot per fold, so the means are correct for any value of `kfolds`.
    rmse = np.zeros(kfolds)
    mae = np.zeros(kfolds)
    accuracy = np.zeros(kfolds)
    precision = np.zeros(kfolds)

    for i, (train_index, test_index) in enumerate(kf.split(X)):

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Fit a fresh model on this fold and predict once for all metrics.
        lm = algorithm(**model_params)
        lm.fit(X_train, y_train)
        y_pred = lm.predict(X_test)

        if isReg:
            rmse[i] = root_mean_square_error(y_test, y_pred)  # RMSE https://www.kaggle.com/wiki/RootMeanSquaredError
            mae[i] = mean_absolute_error(y_test, y_pred)
        else:
            accuracy[i] = accuracy_score(y_test, y_pred)
            precision[i] = precision_score(y_test, y_pred, average="weighted")

    # `i` is the index of the last fold. The printed figures are fold means.
    if isReg:
        print ("Iteration: {}\nRMSE: {}\nMAE: {}\n".format( i, rmse.mean(), mae.mean()))
        return {"rmse": rmse.mean(), "mae": mae.mean()}

    # The original format string had "\Accuracy" (missing "n"), which glued the
    # accuracy onto the iteration line.
    print ("Iteration: {}\nAccuracy: {}\nPrecision: {}\n".format( i, accuracy.mean(), precision.mean()))
    return {"accuracy": accuracy.mean(), "precision": precision.mean()}
    

## Fits Regression Algorithms to datasets

    - Linear Regression
    - Random Forest Regression
    

In [74]:
# model(sumdata_noise_reg_X, sumdata_noise_reg_Y, "The Sum Dataset(with noise)", LinearRegression, True)
# model(sumdata_reg_X, sumdata_reg_Y, "The Sum Dataset(without noise)",  LinearRegression, True) 
# model(news_reg_X, news_reg_Y, "News dataset", LinearRegression, True)


# model(sumdata_noise_reg_X, sumdata_noise_reg_Y,"The Sum Dataset(with noise)",  RandomForestRegressor, True)
# model(sumdata_reg_X, sumdata_reg_Y, "The Sum Dataset(without noise)", RandomForestRegressor, True) 
# model(news_reg_X, news_reg_Y, "News dataset", RandomForestRegressor, True)



Algorithm: LogisticRegression
Dataset: News dataset

Chunk Size: 100

3 1
3 1
3 1
3 1
3 1
3 1
3 1
3 1
3 1
3 1


  'precision', 'predicted', average, warn_for)


Iteration: 9\Accuracy: 0.48999999999999994
Precision: 0.5284880952380953

Chunk Size: 500

3 1
3 1
3 1
3 1
3 1
3 1
3 1
3 1
3 1
3 1
Iteration: 9\Accuracy: 0.5980000000000001
Precision: 0.5241612835050995

Chunk Size: 1000

3 1
3 1
3 1
3 1
3 1
3 1
3 1
3 1
3 1
3 1
Iteration: 9\Accuracy: 0.642
Precision: 0.5732598401009589

Chunk Size: 5000

4 0
4 0
4 0
4 0
4 0
4 0
4 0
4 0
4 1
4 0
Iteration: 9\Accuracy: 0.696
Precision: 0.48555465324866603

Chunk Size: 10000

4 0
4 0
4 0
4 0
4 0
4 0
4 0
4 0
4 0
4 0
Iteration: 9\Accuracy: 0.6611999999999999
Precision: 0.5275562432303633

Chunk Size: 39644

4 0
4 0
4 0
4 0
4 0
4 0
4 0
4 0
4 0
4 0
Iteration: 9\Accuracy: 0.6310414219781311
Precision: 0.40256987234603486



### Classification algorithms

In [None]:

# model(sumdata_noise_classif_X, sumdata_noise_classif_Y, "The Sum Dataset(with noise)", LogisticRegression, False)
#model(sumdata_classif_X, sumdata_classif_Y, "The Sum Dataset(without noise)", LogisticRegression, False)
#model(news_reg_X, news_classif_Y, "News dataset", LogisticRegression, False)
 
# from sklearn.svm import SVC
# model(sumdata_noise_classi_X, sumdata_noise_classif_Y, "The Sum Dataset(with noise)", SVC, False)
# model(sumdata_classi_X, sumdata_classif_Y, "The Sum Dataset(without noise)", SVC, False)
# model(housing_price_classi_X, housing_price_classif_Y, "Housing Dataset", SVC, False)
# model(titanic_classification_X, titanic_classification_y, "Titanic Dataset", SVC, False)
