In [92]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold
from sklearn.svm import OneClassSVM, SVR
from sklearn.linear_model import Ridge, SGDRegressor, Lasso

# Random seed reused by the shuffled KFold in split_data so that the
# train/test partition is the same on every run.
RSEED = 123

In [2]:
# Smoke test: make sure each raw dataset can be loaded, then preview the metro table.
# (read_csv uses a comma separator by default.)
data_metro = pd.read_csv("data/dataset_MetroInterstateTrafficVolume.csv")
data_news = pd.read_csv("data/dataset_OnlineNewsPopularity.csv")
data_realestate = pd.read_excel("data/dataset_RealEstateValuation.xlsx")
data_metro

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00,5545
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00,4516
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00,4767
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00,5026
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00,4918
...,...,...,...,...,...,...,...,...,...
48199,,283.45,0.0,0.0,75,Clouds,broken clouds,2018-09-30 19:00:00,3543
48200,,282.76,0.0,0.0,90,Clouds,overcast clouds,2018-09-30 20:00:00,2781
48201,,282.73,0.0,0.0,90,Thunderstorm,proximity thunderstorm,2018-09-30 21:00:00,2159
48202,,282.09,0.0,0.0,90,Clouds,overcast clouds,2018-09-30 22:00:00,1450


In [135]:
def train_and_predict(model, X_train, X_test, y_train):
    """Fit ``model`` on the training data and return its predictions for ``X_test``.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, X_test : feature tables passed to the model unchanged.
    y_train : target values; a single-column DataFrame, a Series, an array or a
        list are all accepted.

    Returns
    -------
    Whatever ``model.predict(X_test)`` returns.
    """
    # Flatten to 1-D: estimators expect shape (n_samples,), not (n_samples, 1).
    # np.ravel works for DataFrames as well as for plain sequences/arrays.
    target = np.ravel(y_train)
    model.fit(X_train, target)
    return model.predict(X_test)

def evaluate_prediction(y_test, y_pred, data_name, model_name):
    """Print the mean squared error and R2 score of ``y_pred`` against ``y_test``.

    ``model_name`` and ``data_name`` are only used to label the printed report.
    Nothing is returned.
    """
    score = r2_score(y_test, y_pred)
    error = mean_squared_error(y_test, y_pred)

    print(model_name, ', ', data_name)
    print("MSE: ", error, "\nR2 Score: ", score)
    print("-" * 50)

In [127]:
def scale_data(X_train, X_test):
    """Standardize the numeric columns of both tables using training statistics.

    The scaler is fit on ``X_train`` only and then applied to both tables, so no
    information from the test rows leaks into the scaling parameters.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        ``X_test`` must contain every column of ``X_train`` that gets scaled.

    Returns
    -------
    (scaled_train, scaled_test) : new DataFrames with a fresh 0..n-1 index.
        The inputs are not modified.
    """
    # reset_index(drop=True) returns a new frame and discards the old index
    # whatever its name is (dropping a column literally called 'index' would
    # fail for a named index and could delete a real feature).
    scaled_features_train = X_train.reset_index(drop=True)
    scaled_features_test = X_test.reset_index(drop=True)

    # Only these dtypes are scaled; uint8 is not in the list, so uint8-encoded
    # columns are passed through untouched.
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    columns_to_scale = scaled_features_train.select_dtypes(include=numerics).columns
    if len(columns_to_scale) == 0:
        # Nothing to scale; fitting a scaler on zero columns would raise.
        return scaled_features_train, scaled_features_test

    scaler = StandardScaler()
    scaler.fit(scaled_features_train[columns_to_scale])

    for frame in (scaled_features_train, scaled_features_test):
        # Re-label the scaler output with the frame's own rows and columns so the
        # assignment does not depend on positional column matching.
        frame[columns_to_scale] = pd.DataFrame(
            scaler.transform(frame[columns_to_scale]),
            columns=columns_to_scale,
            index=frame.index,
        )

    return scaled_features_train, scaled_features_test

def process_missing_values(X_train, y_train):
    """Drop every training row that has a missing feature OR a missing target.

    Rows are removed from ``X_train`` and ``y_train`` together (matched by
    position), so both results always have the same length and stay in step.
    Dropping NaNs from each table independently would desynchronize them
    whenever the NaNs sit in different rows.

    Parameters
    ----------
    X_train : pandas.DataFrame of features.
    y_train : pandas.DataFrame or Series of targets with one entry per row of
        ``X_train``.

    Returns
    -------
    (X_train, y_train) restricted to the complete rows; original index labels
    are kept.
    """
    features_present = X_train.notna().values
    target_present = y_train.notna().values

    # Collapse 2-D "cell is present" tables to one flag per row.
    if features_present.ndim > 1:
        features_present = features_present.all(axis=1)
    if target_present.ndim > 1:
        target_present = target_present.all(axis=1)

    complete_rows = features_present & target_present
    return X_train[complete_rows], y_train[complete_rows]

def process_outliers(X_train, y_train, nu=0.02):
    """Remove training rows that a one-class SVM flags as outliers.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Only columns with one of the numeric dtypes listed below are kept; the
        returned feature table therefore contains just those columns.
    y_train : targets with one entry per row of ``X_train``.
    nu : float, default 0.02
        Passed to ``OneClassSVM(nu=...)``.

    Returns
    -------
    (X_train, y_train) restricted to the rows not labelled as outliers.
    """
    # only select numeric columns
    numerics = ['uint8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numeric_features = X_train.select_dtypes(include=numerics)

    # fit_predict labels outliers as -1
    detector = OneClassSVM(nu=nu)
    labels = detector.fit_predict(numeric_features)

    # keep every row that is not an outlier; the boolean array is applied by
    # position to both tables
    inlier_mask = labels != -1
    return numeric_features[inlier_mask], y_train[inlier_mask]


def preprocess_data(X_train, X_test, y_train, y_test):
    """Clean the training split and scale both splits.

    Steps, in order:
      1. drop training rows with missing values,
      2. drop training rows flagged as outliers,
      3. restrict ``X_test`` to the columns that survived in ``X_train``,
      4. standardize numeric columns with statistics from ``X_train``.

    ``y_test`` is returned unchanged; test rows are never removed.
    """
    # Missing values must go first: the outlier detector cannot be fit on NaNs.
    X_train, y_train = process_missing_values(X_train, y_train)
    X_train, y_train = process_outliers(X_train, y_train)

    # process_outliers keeps numeric columns only; give the test table the same
    # columns (and order) so the model sees identical features at predict time.
    X_test = X_test.loc[:, X_train.columns]

    X_train, X_test = scale_data(X_train, X_test)
    return X_train, X_test, y_train, y_test

In [128]:
def split_data(X, y, n_splits=10, fold=-1):
    """Return ONE train/test partition taken from a shuffled K-fold split.

    This is a single hold-out split, not cross-validation: of the ``n_splits``
    folds only the one selected by ``fold`` is used as the test set. The default
    (``fold=-1``, the last fold of 10) gives a 90/10 split and matches the
    partition this function has always returned.

    Parameters
    ----------
    X, y : pandas DataFrame/Series with the same number of rows.
    n_splits : int, default 10
        Number of folds passed to ``KFold``.
    fold : int, default -1
        Index of the fold used as the test set.

    Returns
    -------
    X_train, X_test, y_train, y_test
    """
    kfold = KFold(n_splits=n_splits, random_state=RSEED, shuffle=True)
    splits = list(kfold.split(X))
    train_index, test_index = splits[fold]

    # Row-only .iloc works for both DataFrames and Series.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    return X_train, X_test, y_train, y_test

In [129]:
def prepare_metro_data(data):
    """Turn the raw metro traffic table into model-ready features and target.

    * ``date_time`` is parsed and replaced by ``year``, ``month``, ``week``
      (ISO week number), ``day`` and ``hour`` columns.
    * ``holiday``, ``weather_main`` and ``weather_description`` are one-hot
      encoded (missing values produce no indicator column).
    * ``traffic_volume`` is the target.

    The input frame is not modified.

    Returns
    -------
    data : the fully prepared table (features + target)
    X : ``data`` without ``traffic_volume``
    y : single-column DataFrame holding ``traffic_volume``

    NOTE(review): every other input column is kept as a feature, including an
    ``Unnamed: 0`` column if the CSV carries one — confirm that is intended.
    """
    timestamps = pd.to_datetime(data['date_time'])
    # drop() returns a new frame, so the caller's table is left untouched.
    data = data.drop(['date_time'], axis=1)

    parts = timestamps.dt
    data['year'] = parts.year
    data['month'] = parts.month
    if hasattr(parts, 'isocalendar'):
        # .dt.week is gone in recent pandas. isocalendar() yields a nullable
        # UInt32 column, which the numeric dtype filters used downstream would
        # not recognise, hence the cast back to int64.
        data['week'] = parts.isocalendar().week.astype('int64')
    else:
        data['week'] = parts.week
    data['day'] = parts.day
    data['hour'] = parts.hour

    dummy_columns = ['holiday', 'weather_main', 'weather_description']
    # columns= forces encoding regardless of inferred dtype; dtype='uint8' pins
    # the indicator dtype so it does not vary with the pandas version.
    dummies = pd.get_dummies(data[dummy_columns], columns=dummy_columns, dtype='uint8')
    data = data.drop(dummy_columns, axis=1).join(dummies)

    X = data.drop(['traffic_volume'], axis=1)
    y = data[['traffic_volume']]
    return data, X, y

def prepare_news_data(data):
    """Split the online-news table into features and target by column position.

    The first two columns are discarded; of the remaining ones, columns 0-57
    become the features and column 58 the target.
    NOTE(review): this assumes the raw file has exactly that column layout —
    verify against the CSV.

    Returns
    -------
    data : the table without its first two columns
    X : DataFrame with the 58 feature columns
    y : single-column DataFrame with the target
    """
    # Use the frame that was passed in rather than the notebook-level
    # `data_news` global, which may be undefined or stale.
    data = data.iloc[:, 2:]
    X = data.iloc[:, 0:58]
    y = data.iloc[:, 58:59]
    return data, X, y

def prepare_real_estate_data(data):
    """Prepare the real-estate table: drop the first column and tidy the headers.

    After the first column is removed, the next six columns are the features
    (their names lose the first three characters) and the seventh is the target
    (its name loses the first two characters).

    Returns the renamed table, the feature DataFrame and the single-column
    target DataFrame.
    """
    trimmed = data.iloc[:, 1:]

    feature_names = trimmed.columns[:6].str[3:]
    target_name = trimmed.columns[6:7].str[2:]
    trimmed.columns = feature_names.append(target_name)

    features = trimmed.iloc[:, :6]
    target = trimmed.iloc[:, 6:7]
    return trimmed, features, target

In [130]:
def get_data(filename, file_type):
    """Load ``filename`` into a DataFrame.

    ``file_type == 'csv'`` reads a comma-separated file; any other value is
    treated as an Excel workbook.
    """
    is_csv = file_type == 'csv'
    if not is_csv:
        return pd.read_excel(filename)
    return pd.read_csv(filename, sep=",")

In [133]:
def _evaluate_model_on_dataset(model, model_name, dataset_name, file_type, prepare, data_label):
    """Run load -> prepare -> split -> preprocess -> fit -> report for one dataset.

    ``prepare`` is the dataset-specific function returning ``(data, X, y)``;
    ``data_label`` is the dataset name shown in the printed report.
    """
    data = get_data(dataset_name, file_type)
    _, X, y = prepare(data)
    X_train, X_test, y_train, y_test = split_data(X, y)
    X_train, X_test, y_train, y_test = preprocess_data(X_train, X_test, y_train, y_test)
    predictions = train_and_predict(model, X_train, X_test, y_train)
    evaluate_prediction(y_test, predictions, data_label, model_name)


def evaluate_model_on_metro_data(model, model_name):
    """Fit ``model`` on the metro traffic dataset and print its test metrics."""
    _evaluate_model_on_dataset(
        model, model_name,
        "data/dataset_MetroInterstateTrafficVolume.csv", 'csv',
        prepare_metro_data, 'Metro Traffic Data',
    )


def evaluate_model_on_news_data(model, model_name):
    """Fit ``model`` on the online news dataset and print its test metrics."""
    _evaluate_model_on_dataset(
        model, model_name,
        "data/dataset_OnlineNewsPopularity.csv", 'csv',
        prepare_news_data, 'Online News Data',
    )


def evaluate_model_on_real_estate_data(model, model_name):
    """Fit ``model`` on the real estate dataset and print its test metrics."""
    _evaluate_model_on_dataset(
        model, model_name,
        "data/dataset_RealEstateValuation.xlsx", 'xlsx',
        prepare_real_estate_data, 'Real Estate Data',
    )


def evaluate_model(model, model_name):
    """Evaluate ``model`` on all three datasets: metro, news, then real estate.

    The same ``model`` object is refit for each dataset.
    """
    evaluate_model_on_metro_data(model, model_name)
    evaluate_model_on_news_data(model, model_name)
    evaluate_model_on_real_estate_data(model, model_name)

In [136]:
# Estimators under comparison, all with their default hyper-parameters
model_svr = SVR()
model_ridge = Ridge()
model_lasso = Lasso()

# Report every model on every dataset: Lasso first, then Ridge, then SVR
for estimator, label in ((model_lasso, 'Lasso'), (model_ridge, 'Ridge'), (model_svr, 'SVR')):
    evaluate_model(estimator, label)

Lasso ,  Metro Traffic Data
MSE:  3425543.4567353027 
R2 Score:  0.13794854248935695
--------------------------------------------------


  positive)


Lasso ,  Online News Data
MSE:  55343111.87012531 
R2 Score:  -0.007111987771488915
--------------------------------------------------
Lasso ,  Real Estate Data
MSE:  116.47192611377926 
R2 Score:  0.45201503285664735
--------------------------------------------------
Ridge ,  Metro Traffic Data
MSE:  3418693.946038037 
R2 Score:  0.13967224874338302
--------------------------------------------------
Ridge ,  Online News Data
MSE:  55318562.30385242 
R2 Score:  -0.006665244506563139
--------------------------------------------------
Ridge ,  Real Estate Data
MSE:  122.68646047793098 
R2 Score:  0.42277647277630515
--------------------------------------------------
SVR ,  Metro Traffic Data
MSE:  3289440.322996806 
R2 Score:  0.172199430353689
--------------------------------------------------
SVR ,  Online News Data
MSE:  57343813.60153968 
R2 Score:  -0.04351996393284141
--------------------------------------------------
SVR ,  Real Estate Data
MSE:  108.62199884867087 
R2 Score:  0.4