In [96]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold
from sklearn.svm import OneClassSVM, SVR
from sklearn.linear_model import Ridge, SGDRegressor, Lasso

from own_gradient_descent_regressor import OwnGradientDescentRegressor
from own_knn_regressor import OwnKNeighborsRegressor

# Fixed random seed so the shuffled KFold split in split_data is reproducible across runs.
RSEED = 123

In [97]:
# Smoke test: confirm that each of the three raw dataset files can be read.
# (comma is read_csv's default separator, so it is not spelled out)
data_metro = pd.read_csv("data/dataset_MetroInterstateTrafficVolume.csv")
data_news = pd.read_csv("data/dataset_OnlineNewsPopularity.csv")
data_realestate = pd.read_excel("data/dataset_RealEstateValuation.xlsx")

In [98]:
def train_and_predict(model, X_train, X_test, y_train, own_knn):
    """Fit ``model`` on the training data and return its predictions for ``X_test``.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``/``predict``, or ``findknearestNeighbors``
        when ``own_knn`` is truthy.
    X_train, X_test : feature tables for training and prediction.
    y_train : training targets; must expose ``.values`` (e.g. a DataFrame).
    own_knn : bool
        When truthy, delegate to ``model.findknearestNeighbors`` instead of
        the fit/predict pair.

    Returns
    -------
    Whatever the model returns as predictions for ``X_test``.
    """
    if not own_knn:
        # Flatten the target to 1-D before handing it to fit().
        flat_targets = y_train.values.ravel()
        model.fit(X_train, flat_targets)
        return model.predict(X_test)

    # The own KNN implementation trains and predicts in a single call.
    return model.findknearestNeighbors(X_train, y_train, X_test)

def evaluate_prediction(y_test, y_pred, data_name, model_name):
    """Print MSE, RMSE and R2 for one model/dataset combination.

    Parameters
    ----------
    y_test : true target values.
    y_pred : predicted target values.
    data_name : str
        Label of the dataset, used only in the printed header.
    model_name : str
        Label of the model, used only in the printed header.

    Returns
    -------
    None. The metrics are written to stdout.
    """
    mse = mean_squared_error(y_test, y_pred)
    # The ``squared`` keyword of mean_squared_error was deprecated and later
    # removed from scikit-learn, so the RMSE is derived from the MSE instead.
    # For a single target column this is the same value.
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    print(model_name, ', ', data_name)
    print("MSE: ",mse,"\nRMSE: ",rmse,"\nR2 Score: ",r2)
    print("-"*50)
    # Optional diagnostic plot (disabled):
    # g=plt.scatter(y_test, y_pred)
    # g.axes.set_xlabel('True Values ')
    # g.axes.set_ylabel('Predictions ')

In [99]:
def scale_data(X_train, X_test, y_train, y_test):
    """Standardise the numeric feature columns and renumber all rows from 0.

    The scaler is fitted on the training features only and then applied to
    both the training and the test features. Columns whose dtype is not in
    the numeric list below are passed through unchanged.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature tables with the same columns.
    y_train, y_test : pandas.DataFrame (a Series is promoted to a frame)
        Target tables; they are only re-indexed, never scaled.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, y_train, y_test)``, each with a
        fresh 0..n-1 index.
    """
    def _with_fresh_index(table):
        # drop=True discards the old index directly. The previous
        # reset_index().drop(['index']) round-trip failed for a named index
        # and removed a genuine feature column that happened to be called
        # 'index'. A Series still comes back as a one-column frame.
        if isinstance(table, pd.Series):
            table = table.to_frame()
        return table.reset_index(drop=True)

    X_train, X_test, y_train, y_test = (
        _with_fresh_index(table) for table in (X_train, X_test, y_train, y_test)
    )

    scaled_features_train = X_train.copy()
    scaled_features_test = X_test.copy()

    # only select numeric columns
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    columns_to_scale = X_train.select_dtypes(include=numerics).columns
    if len(columns_to_scale) == 0:
        # Nothing to standardise; the scaler would reject an empty feature set.
        return scaled_features_train, scaled_features_test, y_train, y_test

    scaler = StandardScaler()
    scaler.fit(X_train[columns_to_scale])

    # Label the scaled arrays explicitly so the assignment back into the
    # frames does not depend on implicit positional column matching.
    scaled_features_train[columns_to_scale] = pd.DataFrame(
        scaler.transform(X_train[columns_to_scale]),
        columns=columns_to_scale,
        index=X_train.index,
    )
    scaled_features_test[columns_to_scale] = pd.DataFrame(
        scaler.transform(X_test[columns_to_scale]),
        columns=columns_to_scale,
        index=X_test.index,
    )

    return scaled_features_train, scaled_features_test, y_train, y_test

def process_missing_values(X_train, y_train):
    """Drop every training row that has a missing value in X or in y.

    The same row mask is applied to both tables. Dropping NaNs from each
    table independently would leave features and targets with different
    rows whenever the missing values sit in different rows.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    y_train : pandas.DataFrame or pandas.Series
        Training targets, paired row-by-row (by position) with ``X_train``.

    Returns
    -------
    tuple
        ``(X_train, y_train)`` restricted to the rows complete in both.
    """
    def _complete_rows(table):
        present = table.notna()
        if present.ndim > 1:
            # A frame row counts as complete only if all its cells are present.
            present = present.all(axis=1)
        return present.to_numpy()

    keep = _complete_rows(X_train) & _complete_rows(y_train)
    return X_train[keep], y_train[keep]

def process_outliers(X_train, y_train, nu=0.02):
    """Remove training rows that a One-Class SVM flags as outliers.

    The detector is fitted on the numeric columns only, but the rows that
    survive keep all of their columns. Dropping the non-numeric columns from
    the training set alone would leave it with a different feature set than
    the test set.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    y_train : pandas.DataFrame or pandas.Series
        Training targets, paired row-by-row (by position) with ``X_train``.
    nu : float, default 0.02
        Passed to ``OneClassSVM(nu=...)``.

    Returns
    -------
    tuple
        ``(X_train, y_train)`` without the rows predicted as ``-1``.
    """
    # only the numeric columns are shown to the outlier detector
    numerics = ['uint8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numeric_features = X_train.select_dtypes(include=numerics)

    # identify outliers in the training dataset
    outlier_predictor = OneClassSVM(nu=nu)
    y_hat = outlier_predictor.fit_predict(numeric_features)

    # select all rows that are not outliers (fit_predict marks outliers with -1)
    inlier_mask = y_hat != -1
    return X_train[inlier_mask], y_train[inlier_mask]


def preprocess_data(X_train, X_test, y_train, y_test):
    """Clean the training split and standardise both splits.

    Steps, in order: drop missing values from the training data, remove
    training outliers, then scale features and reset all indices.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` after preprocessing.
    """
    # Missing values are removed first: the outlier step fits a OneClassSVM,
    # which cannot be fitted on data containing NaN.
    X_train, y_train = process_missing_values(X_train, y_train)
    X_train, y_train = process_outliers(X_train, y_train)
    X_train, X_test, y_train, y_test = scale_data(X_train, X_test, y_train, y_test)
    return X_train, X_test, y_train, y_test

In [100]:
def split_data(X,y):
    """Return a single train/test partition: the last fold of a shuffled 10-fold split.

    NOTE: this is one hold-out split (about 90 % train, 10 % test), not a
    cross-validation over all folds. Only the final fold is ever returned.

    Parameters
    ----------
    X : pandas.DataFrame
        Features.
    y : pandas.DataFrame
        Targets with the same row order as ``X`` (indexed with two-axis ``iloc``).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    kfold = KFold(n_splits=10, random_state=RSEED, shuffle=True)
    # Take the index arrays of the last fold only, instead of slicing the
    # frames for all ten folds and discarding nine of the results.
    *_, (train_index, test_index) = kfold.split(X)

    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    y_train, y_test = y.iloc[train_index,:], y.iloc[test_index,:]
    return X_train, X_test, y_train, y_test

In [101]:
def prepare_metro_data(data):
    """Build the feature and target tables for the metro traffic dataset.

    ``date_time`` is expanded into ``year``, ``month``, ``week`` (ISO week
    number), ``day`` and ``hour``. ``holiday``, ``weather_main`` and
    ``weather_description`` are replaced by one-hot columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table with at least the columns ``date_time``, ``holiday``,
        ``weather_main``, ``weather_description`` and ``traffic_volume``.
        It is not modified.

    Returns
    -------
    tuple
        ``(data, X, y)``: the fully prepared table, the features (everything
        except ``traffic_volume``) and a one-column target frame.
    """
    # Work on a copy so the caller's frame is left untouched.
    data = data.copy()

    timestamps = pd.to_datetime(data['date_time'])
    data['year'] = timestamps.dt.year
    data['month'] = timestamps.dt.month
    # Series.dt.week no longer exists in current pandas; isocalendar().week is
    # its replacement. It is cast to int64 because the nullable UInt32 it
    # returns is not in the numeric dtype lists used by the later steps.
    # Assumes no missing timestamps (the cast would fail on <NA>).
    data['week'] = timestamps.dt.isocalendar().week.astype('int64')
    data['day'] = timestamps.dt.day
    data['hour'] = timestamps.dt.hour
    data = data.drop(['date_time'], axis=1)

    data['weather_main'] = data['weather_main'].astype('category')
    data['weather_description'] = data['weather_description'].astype('category')

    dummy_columns = ['holiday', 'weather_main', 'weather_description']
    # Pin the indicator dtype: the default differs between pandas versions
    # (bool in 2.x) and the outlier step lists 'uint8' among its numeric dtypes.
    dummies = pd.get_dummies(data[dummy_columns], dtype='uint8')
    data = data.join(dummies).drop(dummy_columns, axis=1)

    X = data.drop(['traffic_volume'], axis=1)
    y = pd.DataFrame(data['traffic_volume'])
    return data, X, y

def prepare_news_data(data):
    """Build the feature and target tables for the online news dataset.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table. The first two columns are discarded, the next 58 columns
        become the features and the column after those becomes the target.

    Returns
    -------
    tuple
        ``(data, X, y)``: the table without its first two columns, the
        58 feature columns and a one-column target frame.
    """
    # Slice the frame that was passed in. The function previously read the
    # notebook-level ``data_news`` global and ignored its argument.
    data = data.iloc[:,2:]
    X = data.iloc[:,0:58]
    y = data.iloc[:,58:59]
    return data, X, y

def prepare_real_estate_data(data):
    """Build the feature and target tables for the real estate dataset.

    The first column is discarded. The next six columns become the features,
    with the first three characters of each name removed. The column after
    those becomes the target, with the first two characters of its name removed.

    Returns
    -------
    tuple
        ``(data, X, y)``: the renamed table, its six feature columns and a
        one-column target frame.
    """
    trimmed = data.iloc[:, 1:]

    # Strip the leading label prefix from the header names.
    feature_names = trimmed.columns[0:6].str[3:]
    target_name = trimmed.columns[6:7].str[2:]
    trimmed.columns = feature_names.append(target_name)

    features = trimmed.iloc[:, 0:6]
    target = trimmed.iloc[:, 6:7]
    return trimmed, features, target

In [102]:
def get_data(filename, file_type):
    """Load a dataset file into a DataFrame.

    Parameters
    ----------
    filename : path of the file to read.
    file_type : str
        ``'csv'`` selects the comma-separated reader; any other value is
        read as an Excel workbook.
    """
    if file_type != 'csv':
        return pd.read_excel(filename)
    return pd.read_csv(filename, sep=",")

In [103]:
def _evaluate_model_on_dataset(model, model_name, own_knn, dataset_name, file_type, prepare, label):
    """Run load -> prepare -> split -> preprocess -> predict -> report for one dataset.

    Parameters
    ----------
    model, model_name, own_knn : forwarded to train_and_predict / evaluate_prediction.
    dataset_name : path of the dataset file.
    file_type : file type understood by get_data ('csv' or anything else for Excel).
    prepare : callable taking the raw frame and returning ``(data, X, y)``.
    label : dataset label used in the printed report.
    """
    data = get_data(dataset_name, file_type)
    # The prepared full table is not needed here, only features and target.
    _, X, y = prepare(data)
    X_train, X_test, y_train, y_test = split_data(X,y)
    X_train, X_test, y_train, y_test = preprocess_data(X_train, X_test, y_train, y_test)
    predictions = train_and_predict(model, X_train, X_test, y_train, own_knn)
    evaluate_prediction(y_test, predictions, label, model_name)


def evaluate_model_on_metro_data(model, model_name, own_knn):
    """Evaluate ``model`` on the metro interstate traffic volume dataset."""
    _evaluate_model_on_dataset(
        model, model_name, own_knn,
        "data/dataset_MetroInterstateTrafficVolume.csv", 'csv',
        prepare_metro_data, 'Metro Traffic Data',
    )


def evaluate_model_on_news_data(model, model_name, own_knn):
    """Evaluate ``model`` on the online news popularity dataset."""
    _evaluate_model_on_dataset(
        model, model_name, own_knn,
        "data/dataset_OnlineNewsPopularity.csv", 'csv',
        prepare_news_data, 'Online News Data',
    )


def evaluate_model_on_real_estate_data(model, model_name, own_knn):
    """Evaluate ``model`` on the real estate valuation dataset."""
    _evaluate_model_on_dataset(
        model, model_name, own_knn,
        "data/dataset_RealEstateValuation.xlsx", 'xlsx',
        prepare_real_estate_data, 'Real Estate Data',
    )


def evaluate_model(model, model_name, own_knn=False):
    """Evaluate ``model`` on the currently enabled datasets and print the metrics.

    Only the real estate dataset is enabled; uncomment the other calls to
    include the metro and news datasets.
    """
    # evaluate_model_on_metro_data(model, model_name, own_knn)
    # evaluate_model_on_news_data(model, model_name, own_knn)
    evaluate_model_on_real_estate_data(model, model_name, own_knn)

In [104]:
# Candidate regressors: scikit-learn baselines first, then the project's own implementations.
model_svr, model_ridge, model_lasso = SVR(), Ridge(), Lasso()
gdr_own = OwnGradientDescentRegressor()
model_own_knn_1, model_own_knn_5, model_own_knn_10 = (
    OwnKNeighborsRegressor(n_neighbors=k) for k in (1, 5, 10)
)

# Evaluation runs are currently disabled; uncomment the ones to execute.
# evaluate_model(model_lasso, 'Lasso')
# evaluate_model(model_ridge, 'Ridge')
# evaluate_model(model_svr, 'SVR')
# evaluate_model(model_own_knn_1, 'Own_KNN N=1', True)
# evaluate_model(model_own_knn_5, 'Own_KNN N=5', True)
# evaluate_model(model_own_knn_10, 'Own_KNN N=10', True)
# evaluate_model(gdr_own, 'SGDR')