In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.io.arff import loadarff

In [2]:
# Mount Google Drive at /content/drive/ so the dataset folder used below is reachable.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
# List the dataset files available in the mounted Drive folder.
!ls drive/MyDrive/KP_datasets

dataset_2183_cpu_act.arff    house_16H.arff
dataset_2202_elevators.arff  houses.arff


In [4]:
# Relative path of the Drive folder that holds the ARFF datasets (note the trailing
# slash: file names are appended to it by plain string concatenation).
DATA_PATH = 'drive/MyDrive/KP_datasets/'
# Available files and the target column to predict in each of them:
# dataset_2183_cpu_act.arff       target: usr
# house_16H.arff                  target: price
# dataset_2202_elevators.arff     target: Goal
# houses.arff                     target: median_house_value

In [5]:
# Dataset evaluated in this run. `target_col` must name the target column of that
# file; change both values together when switching datasets.
DATASET_NAME = 'dataset_2202_elevators.arff'
target_col = 'Goal'

In [6]:
# loadarff returns a (records, metadata) pair. Unpack it into dedicated names rather
# than `data`, which the notebook uses for the feature DataFrame: re-running this cell
# on its own must not replace that frame with a tuple.
arff_records, arff_meta = loadarff(DATA_PATH + DATASET_NAME)
df = pd.DataFrame(arff_records)

In [7]:
# Preview the first rows to check the columns read from the ARFF file.
df.head()

Unnamed: 0,climbRate,Sgz,'p','q',curRoll,absRoll,diffClb,diffRollRate,diffDiffClb,SaTime1,SaTime2,SaTime3,SaTime4,diffSaTime1,diffSaTime2,diffSaTime3,diffSaTime4,Sa,Goal
0,118.0,-55.0,-0.28,-0.08,-0.2,-11.0,11.0,0.005,-0.2,-0.001,-0.001,-0.001,-0.001,0.0,0.0,0.0,0.0,-0.001,0.031
1,390.0,-45.0,-0.06,-0.07,-0.6,-12.0,11.0,0.01,-0.2,-0.0008,-0.0008,-0.0008,-0.0008,0.0,0.0,0.0,0.0,-0.0008,0.034
2,68.0,6.0,0.11,0.15,0.6,-10.0,-9.0,-0.003,-0.2,-0.0011,-0.001,-0.001,-0.001,-0.0002,0.0,0.0,0.0,-0.001,0.033
3,-358.0,-12.0,-0.2,0.13,-0.3,-11.0,-7.0,0.001,-0.1,-0.001,-0.001,-0.001,-0.001,0.0,0.0,0.0,0.0,-0.001,0.032
4,-411.0,-19.0,-0.18,0.02,-0.5,-11.0,-3.0,0.002,1.2,-0.001,-0.001,-0.001,-0.001,0.0,0.0,0.0,0.0,-0.001,0.03


In [8]:
# Separate the loaded frame into model inputs (`data`) and the regression target (`y`).
data = df.drop(columns=target_col)
y = df[target_col]

In [9]:
# y

In [10]:
# Display the feature frame (every column except the target).
data

Unnamed: 0,climbRate,Sgz,'p','q',curRoll,absRoll,diffClb,diffRollRate,diffDiffClb,SaTime1,SaTime2,SaTime3,SaTime4,diffSaTime1,diffSaTime2,diffSaTime3,diffSaTime4,Sa
0,118.0,-55.0,-0.28,-0.08,-0.2,-11.0,11.0,0.005,-0.2,-0.0010,-0.0010,-0.0010,-0.0010,0.0000,0.0,0.0,0.0,-0.0010
1,390.0,-45.0,-0.06,-0.07,-0.6,-12.0,11.0,0.010,-0.2,-0.0008,-0.0008,-0.0008,-0.0008,0.0000,0.0,0.0,0.0,-0.0008
2,68.0,6.0,0.11,0.15,0.6,-10.0,-9.0,-0.003,-0.2,-0.0011,-0.0010,-0.0010,-0.0010,-0.0002,0.0,0.0,0.0,-0.0010
3,-358.0,-12.0,-0.20,0.13,-0.3,-11.0,-7.0,0.001,-0.1,-0.0010,-0.0010,-0.0010,-0.0010,0.0000,0.0,0.0,0.0,-0.0010
4,-411.0,-19.0,-0.18,0.02,-0.5,-11.0,-3.0,0.002,1.2,-0.0010,-0.0010,-0.0010,-0.0010,0.0000,0.0,0.0,0.0,-0.0010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16594,299.0,-28.0,0.08,-0.12,-0.3,-9.0,15.0,0.010,-0.2,-0.0005,-0.0005,-0.0005,-0.0005,0.0000,0.0,0.0,0.0,-0.0005
16595,84.0,0.0,0.14,0.14,1.1,-8.0,-11.0,-0.014,-0.6,-0.0009,-0.0009,-0.0009,-0.0009,0.0000,0.0,0.0,0.0,-0.0009
16596,-208.0,-6.0,-0.48,0.09,0.2,-9.0,-7.0,-0.010,-0.1,-0.0009,-0.0009,-0.0009,-0.0009,0.0000,0.0,0.0,0.0,-0.0009
16597,-146.0,-14.0,-0.38,-0.03,-0.8,-10.0,10.0,0.010,-1.0,-0.0005,-0.0005,-0.0005,-0.0005,0.0000,0.0,0.0,0.0,-0.0005


In [11]:
from sklearn.model_selection import KFold

from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error, mean_squared_error

# Random seed used for the shuffled K-fold split, so fold assignment is repeatable.
SEED = 42

In [14]:
# %pip install -q catboost==1.2  # uncomment on a fresh runtime; 1.2 is the version this notebook was run with

Installing collected packages: catboost
Successfully installed catboost-1.2


In [15]:
# Shuffled 3-fold splitter with a fixed seed; the same object is passed to every
# train() call in this notebook.
kf = KFold(n_splits=3, shuffle=True, random_state=SEED)

def train(model, kf, features=None, target=None):
  """Cross-validate ``model`` and collect regression metrics for every fold.

  Args:
    model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. The same object
      is fitted again on each fold's training split.
    kf: Splitter exposing ``split(X)`` that yields ``(train_index, test_index)``
      pairs of positional indices.
    features: Feature table supporting ``.iloc``. When omitted, the
      notebook-level ``data`` frame is used.
    target: Target values supporting ``.iloc``, row-aligned with ``features``.
      When omitted, the notebook-level ``y`` is used.

  Returns:
    dict mapping ``'fold_<i>'`` to a dict with the keys ``'r2'``, ``'MAE'``,
    ``'MSE'`` and ``'MAPE'`` computed on that fold's held-out split.
  """
  # Fall back to the notebook globals so existing ``train(model, kf)`` calls keep
  # working; passing the data explicitly avoids depending on hidden state.
  if features is None:
    features = data
  if target is None:
    target = y

  metrics_dict = {}

  for i, (train_index, test_index) in enumerate(kf.split(features)):
    train_data, test_data = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]

    model.fit(train_data, y_train)
    y_pred = model.predict(test_data)

    fold_metrics = {'r2': r2_score(y_test, y_pred),
                    'MAE': mean_absolute_error(y_test, y_pred),
                    'MSE': mean_squared_error(y_test, y_pred),
                    'MAPE': mean_absolute_percentage_error(y_test, y_pred)}
    metrics_dict[f'fold_{i}'] = fold_metrics
    # Print as each fold finishes so progress is visible during long fits.
    print(fold_metrics)
  return metrics_dict

def get_average_metrics(metrics_dict):
  """Average every metric over all folds.

  Args:
    metrics_dict: Mapping of fold name -> ``{metric name: value}``, i.e. the
      structure returned by ``train``.

  Returns:
    dict mapping each metric name to its arithmetic mean across the folds.
    The metric names and their order come from the first fold. An empty
    ``metrics_dict`` yields an empty dict.

  Raises:
    KeyError: if a fold lacks one of the metrics present in the first fold.
  """
  # Nothing to average: return an empty result instead of failing on the
  # first-fold lookup below.
  if not metrics_dict:
    return {}

  folds = list(metrics_dict.values())
  # The first fold decides which metrics are reported.
  return {metric: sum(fold[metric] for fold in folds) / len(folds)
          for metric in folds[0]}


In [25]:
from catboost import Pool, CatBoostRegressor

# Gradient-boosting model under evaluation; verbose=0 keeps the cell output free
# of per-iteration training logs.
model = CatBoostRegressor(
    loss_function='RMSE',
    learning_rate=0.01,
    depth=7,
    iterations=3000,
    verbose=0,
)

In [26]:
# Start from an empty per-fold metrics mapping.
metrics_dict = {}

In [27]:
# Cross-validate the CatBoost model; train() prints the metrics of each fold.
metrics_dict = train(model, kf)

{'r2': 0.8993021688273953, 'MAE': 0.0015403798477585406, 'MSE': 4.53492352405859e-06, 'MAPE': 0.07235414927287098}
{'r2': 0.9009813245328222, 'MAE': 0.001538980338477244, 'MSE': 4.553640363241046e-06, 'MAPE': 0.07185691609302762}
{'r2': 0.8970336008580198, 'MAE': 0.0015397099827191576, 'MSE': 4.5638973437617145e-06, 'MAPE': 0.07201129458280883}


In [28]:
# Mean of each metric over the folds for the CatBoost run.
get_average_metrics(metrics_dict)

{'r2': 0.8991056980727458,
 'MAE': 0.0015396900563183138,
 'MSE': 4.5508204103537835e-06,
 'MAPE': 0.07207411998290249}

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# Linear baseline: rebinds `model` to a Lasso regressor with a fixed alpha.
model = Lasso(fit_intercept=True, alpha=0.1)

In [None]:
# Cross-validate the Lasso baseline on the same folds.
# NOTE(review): the stored output reports MAE in the tens of thousands, which does not
# match the scale of the 'Goal' target previewed earlier; it looks like it was produced
# with a different dataset selected. Re-run to refresh it. TODO confirm.
metrics_dict = train(model, kf)

{'r2': 0.2565351482739987, 'MAE': 25389.614236947265, 'MSE': 2098186171.7538147, 'MAPE': 1.0632899170675162e+18}


  model = cd_fast.enet_coordinate_descent(


{'r2': 0.2427251635834795, 'MAE': 25591.705858659334, 'MSE': 1945771476.6312761, 'MAPE': 9.470652715349852e+17}
{'r2': 0.2623176425058168, 'MAE': 25511.916825866323, 'MSE': 2201992239.675188, 'MAPE': 9.013812445049292e+17}


In [None]:
# Mean of each metric over the folds for the Lasso run.
get_average_metrics(metrics_dict)

{'r2': 0.25385931812109835,
 'MAE': 25497.745640490975,
 'MSE': 2081983296.020093,
 'MAPE': 9.705788110358102e+17}

In [44]:
# NOTE(review): the stored output is a per-dataset r2/MAE summary of the CatBoost runs,
# but no visible cell builds that table. On a fresh Restart & Run All, `df` is still the
# raw frame loaded from the ARFF file. TODO: add the cell that assembles the summary.
df # catboost

Unnamed: 0,r2,MAE
houses.arff,0.847188,29578.008853
dataset_2183_cpu_act.arff,0.983986,1.59156
house_16H.arff,0.655133,15810.46931
dataset_2202_elevators.arff,0.899106,0.00154
