In [1]:
import os
import pandas as pd

# Data Preprocessing

## Read Data

In [2]:
# All three input tables are read from the same trace directory, in the order
# rolled values -> extracted features -> labels.
base_folder = 'data/trace_201708/'
df_rolled, features, labels = (
    pd.read_csv(os.path.join(base_folder, csv_name))
    for csv_name in (
        'df_rolled_12.csv',
        'extracted_features_12_106.csv',
        'labels_12_106.csv',
    )
)

In [3]:
# Sanity check: display the (rows, columns) shape of each loaded table.
tuple(table.shape for table in (df_rolled, features, labels))

((172243, 14), (172243, 108), (172243, 3))

## Split Data

In [4]:
from sklearn.model_selection import train_test_split

# Split row positions rather than the tables themselves, so the same
# train/test partition can be applied to every table afterwards.
indices = list(range(len(df_rolled)))
train_indices, test_indices = train_test_split(
    indices,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Materialise NumPy arrays for each partition. The first two columns of every
# table are dropped (presumably identifier/index columns -- TODO confirm).
train_rolled, train_labels, train_features = (
    table.iloc[train_indices, 2:].values for table in (df_rolled, labels, features)
)
test_rolled, test_labels, test_features = (
    table.iloc[test_indices, 2:].values for table in (df_rolled, labels, features)
)

## Train Base Regressors

In [6]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [7]:
def cal_mae_and_rmse(preds, y_true=None):
    """Score regression predictions against ground-truth targets.

    Parameters
    ----------
    preds : array-like
        Predicted values, one per sample.
    y_true : numpy.ndarray, optional
        Ground-truth targets; flattened with ``reshape(-1)`` before scoring.
        Defaults to the notebook-level ``test_labels``.

    Returns
    -------
    tuple
        ``(rmse, mae)`` -- RMSE comes first, despite the order suggested by
        the function name. The order is kept for existing callers.
    """
    targets = test_labels if y_true is None else y_true
    targets = targets.reshape(-1)
    # The `squared=False` keyword of mean_squared_error is deprecated and
    # removed in recent scikit-learn releases; taking the square root of the
    # MSE gives the same RMSE on every version.
    rmse = mean_squared_error(targets, preds) ** 0.5
    mae = mean_absolute_error(targets, preds)
    return rmse, mae

In [8]:
from sklearn.linear_model import LinearRegression

# Base model: ordinary least squares on the rolled values.
lr_model = LinearRegression()
lr_model.fit(train_rolled, train_labels.ravel())
lr_preds = lr_model.predict(test_rolled).ravel()

# Displays (rmse, mae) on the test split.
cal_mae_and_rmse(lr_preds)


(4.2173217517683455, 3.201971369031407)

In [9]:
from sklearn.svm import SVR

# Base model: support vector regression with the solver capped at 10000 iterations.
svm_model = SVR(max_iter=10000)
svm_model.fit(train_rolled, train_labels.ravel())
svm_preds = svm_model.predict(test_rolled).ravel()

# Displays (rmse, mae) on the test split.
cal_mae_and_rmse(svm_preds)



(6.007801623069151, 4.928788550227052)

In [10]:
from sklearn.ensemble import GradientBoostingRegressor

# Base model: gradient-boosted trees with a fixed seed for repeatability.
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(train_rolled, train_labels.ravel())
gb_preds = gb_model.predict(test_rolled).ravel()

# Displays (rmse, mae) on the test split.
cal_mae_and_rmse(gb_preds)

(3.864784760570757, 2.9205734275326143)

In [11]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel

# Base model: Gaussian process with a linear (dot-product) kernel plus a
# white-noise term. Only the first `fit_num` training rows are used for fitting
# (presumably to keep the fit tractable -- TODO confirm).
fit_num = 1000
kernel = DotProduct() + WhiteKernel()
gpr_model = GaussianProcessRegressor(kernel=kernel, random_state=42)
gpr_model.fit(train_rolled[:fit_num], train_labels[:fit_num].ravel())
gpr_preds = gpr_model.predict(test_rolled).ravel()

# Displays (rmse, mae) on the test split.
cal_mae_and_rmse(gpr_preds)

(4.275240877472578, 3.2570387101339002)

In [12]:
import numpy as np

In [13]:
# Linear-regression predictions on the training rows, flattened to 1-D.
lr_p = lr_model.predict(train_rolled).ravel()

In [14]:
# SVR predictions on the training rows, flattened to 1-D.
svm_p = svm_model.predict(train_rolled).ravel()

In [15]:
# Gradient-boosting predictions on the training rows, flattened to 1-D.
gb_p = gb_model.predict(train_rolled).ravel()

In [16]:
# Gaussian-process predictions on the training rows, flattened to 1-D.
gpr_p = gpr_model.predict(train_rolled).ravel()

In [17]:
# For every training row, record which base model has the smallest absolute
# error. Columns are ordered lr, svm, gb, gpr, so the stored value is 0..3 in
# that order. train_labels is 2-D with one column, so it broadcasts across the
# four prediction columns.
identified_method = np.argmin(
    np.abs(np.column_stack((lr_p, svm_p, gb_p, gpr_p)) - train_labels),
    axis=1,
).reshape(-1)

In [49]:
# Same "closest model" labelling as for the training rows, but on the test
# split; columns are ordered lr, svm, gb, gpr.
identified_method_test = np.argmin(
    np.abs(np.column_stack((lr_preds, svm_preds, gb_preds, gpr_preds)) - test_labels),
    axis=1,
).reshape(-1)

array([2, 2, 2, ..., 1, 3, 2])

In [48]:
# df_features_train = features.iloc[train_indices, :]
# df_rolled_train = df_rolled.iloc[train_indices, :]
# df_labels_train = labels.iloc[train_indices, :]

# df_features_test = features.iloc[test_indices, :]
# df_rolled_test = df_rolled.iloc[test_indices, :]
# df_labels_test = labels.iloc[test_indices, :]

In [53]:
# df_features_train['method'] = identified_method

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_features_train['method'] = identified_method


## Train AMS

In [56]:
# train_features = df_features_train.iloc[:, 2:-1].values
# train_methods = df_features_train.iloc[:, -1].values

In [29]:
from sklearn.ensemble import RandomForestClassifier

# Selector model: given the extracted features of a row, predict which base
# regressor (0..3) was closest to the target on that row.
clf = RandomForestClassifier(random_state=42, n_estimators=50)
clf.fit(X=train_features, y=identified_method)

In [31]:
# Base-model id chosen by the selector for every test row.
test_method = clf.predict(X=test_features)

In [77]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Accuracy plus micro-averaged precision / recall / F1 of the selector.
# For single-label multiclass predictions the micro-averaged scores coincide
# with accuracy, which is why all four printed values are equal.
print(f"accuracy score is {accuracy_score(identified_method_test, test_method)}")
for metric_name, metric_fn in (
    ("precision", precision_score),
    ("recall", recall_score),
    ("f1", f1_score),
):
    print(f"{metric_name} score is {metric_fn(identified_method_test, test_method, average='micro')}")

accuracy score is 0.6584806525588551
precision score is 0.6584806525588551
recall score is 0.6584806525588551
f1 score is 0.6584806525588551


In [84]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(identified_method_test, test_method)

# One-vs-rest counts per class, pooled over all classes (micro-averaging).
# Rows of cm are true labels and columns are predictions, so a column sum minus
# the diagonal gives false positives and a row sum minus the diagonal gives
# false negatives for that class.
class_hits = np.diag(cm)
tp, fp, fn, tn = (
    per_class.sum()
    for per_class in (
        class_hits,
        cm.sum(axis=0) - class_hits,
        cm.sum(axis=1) - class_hits,
        cm.sum() - (cm.sum(axis=0) + cm.sum(axis=1) - class_hits),
    )
)

# Pooled rates derived from the pooled counts.
tpr, fnr = tp / (tp + fn), fn / (tp + fn)
tnr, fpr = tn / (fp + tn), fp / (fp + tn)

accuracy = np.sum(tp) / np.sum(cm)

for caption, value in (
    ("True Positives:", tp),
    ("True Negatives:", tn),
    ("False Positives:", fp),
    ("False Negatives:", fn),
    ("True Positive Rate:", tpr),
    ("False Positive Rate:", fpr),
    ("True Negative Rate:", tnr),
    ("False Negative Rate:", fnr),
    ("Accuracy:", accuracy),
):
    print(caption, value)

True Positives: 22684
True Negatives: 91582
False Positives: 11765
False Negatives: 11765
True Positive Rate: 0.6584806525588551
False Positive Rate: 0.11383978248038162
True Negative Rate: 0.8861602175196184
False Negative Rate: 0.34151934744114487
Accuracy: 0.6584806525588551


In [63]:
# One-hot encode the predicted and the reference model ids by selecting rows of
# the 4x4 identity matrix (one row per model id 0..3).
test_method_onehot, identified_method_test_onehot = (
    np.identity(4)[model_ids] for model_ids in (test_method, identified_method_test)
)

In [64]:
from sklearn.metrics import roc_curve, auc

# One-vs-rest ROC curve and AUC for each model id.
# ROC analysis needs continuous scores: feeding hard 0/1 predictions yields a
# single operating point per class, so the selector's class probabilities are
# used instead. predict_proba columns follow clf.classes_, hence the explicit
# column -> class pairing below.
class_scores = clf.predict_proba(test_features)

fpr = dict()
tpr = dict()
roc_auc = dict()
for column, class_label in enumerate(clf.classes_):
    key = int(class_label)  # plain-int keys, so fpr[0] .. fpr[3] keep working
    is_class = (identified_method_test == class_label).astype(int)
    fpr[key], tpr[key], _ = roc_curve(is_class, class_scores[:, column])
    roc_auc[key] = auc(fpr[key], tpr[key])

In [68]:
# Inspect the raw (fpr, tpr, thresholds) triple for class 0 when the scores are
# the hard one-hot predictions.
roc_curve(y_true=identified_method_test_onehot[:, 0], y_score=test_method_onehot[:, 0])

(array([0.        , 0.05573254, 1.        ]),
 array([0.        , 0.37491502, 1.        ]),
 array([2., 1., 0.]))

In [75]:
from sklearn.metrics import f1_score

# Micro-averaged F1 computed on the one-hot (multilabel-indicator) encoding.
f1_score(y_true=identified_method_test_onehot, y_pred=test_method_onehot, average='micro')


0.6584806525588551

In [32]:
# Inspect the held-out targets. The array is 2-D with a single column, which is
# why other cells flatten it with reshape(-1) before comparing to predictions.
test_labels

array([[37.14000053],
       [25.6       ],
       [39.        ],
       ...,
       [30.56000023],
       [10.1       ],
       [ 2.04000001]])

In [33]:
# Model id -> name of the notebook variable holding that fitted regressor.
# Ids follow the column order used when labelling the closest model:
# lr, svm, gb, gpr.
method_mapping = dict(enumerate(('lr_model', 'svm_model', 'gb_model', 'gpr_model')))

In [46]:
# Route every test row to the base model the selector picked for it.
# Rows are grouped by selected model so each model predicts once on a batch
# instead of once per row, and the model object is looked up by name in the
# notebook namespace instead of being resolved through eval().
routed_preds = np.empty(len(test_method), dtype="float64")
for mid in np.unique(test_method):
    rows = np.flatnonzero(test_method == mid)
    routed_preds[rows] = globals()[method_mapping[mid]].predict(test_rolled[rows])

# Kept as a list of per-row values, in test-row order, as before.
preds = list(routed_preds)


In [47]:
# Convert the routed predictions into a float64 NumPy array for scoring.
preds = np.array(preds, dtype=np.float64)

In [40]:
# Spot check: prediction of the model with id 1 for the second test row.
# The model is fetched by name from the notebook namespace; eval() is not
# needed for a plain variable lookup.
globals()[method_mapping[1]].predict(test_rolled[1:2])

array([19.55963268])

In [39]:
# Side-by-side look at the first ten routed predictions and their targets.
(preds[:10], test_labels[:10].ravel())

(array([35.46302906, 35.46302906, 29.09794276, 29.09794276, 35.46302906,
        35.46302906, 29.09794276, 19.55963268, 17.35624317, 35.46302906]),
 array([37.14000053, 25.6       , 39.        , 18.67999992, 35.6       ,
        32.08000031, 47.63999939, 28.88000031, 13.06000023, 39.68000031]))

In [48]:
# Score the routed predictions on the test split; displays (rmse, mae).
cal_mae_and_rmse(preds=preds)

(3.3361972737711034, 2.3122183815733344)