# Flat DataFrame Creation for TSFresh

In [1]:
import os

path_data = {}

path_data["train"] = [os.path.join(root, file) for root, dirs, files in os.walk(
    "DataSet/trainset") for file in files]  # all train csv file paths
path_data["val"] = [os.path.join(root, file) for root, dirs, files in os.walk(
    "DataSet/validationset") for file in files]  # all val csv file paths
path_data["test"] = [os.path.join(root, file) for root, dirs, files in os.walk(
    "DataSet/testset") for file in files]  # all test csv file paths

In [2]:
import pandas as pd
import numpy as np

# combined list of all paths
all_paths = path_data["train"] + path_data["val"] + path_data["test"]

# dframe to store all X data
X = pd.DataFrame(columns=["filepath", "user_id", "time", "indicator1", "indicator2", "indicator3", 
                          "indicator4", "indicator5", "indicator6", "indicator7", "indicator8"])
# array to store all y data
y = []

for user_idx, f_path in enumerate(all_paths):
    data_df = pd.read_csv(f_path).iloc[:, 3:]  # gets X data for current user csv (excluding first 3 columns)
    data_df["filepath"] = f_path  # adds file path to df
    data_df["user_id"] = int(user_idx)  # adds unique user id
    data_df['time'] = data_df.index  # gets index as time stamp (tsfresh)
    X = pd.concat([X, data_df])  # adds to X dframe
    
    # gets y variable from file path
    if "UGE" in f_path:
        y.append(1)
    elif "UBE" in f_path:
        y.append(0)
    
X = X.infer_objects()  # infers dtypes of X dframe
y = np.array(y)  # converts y list to ndarray

In [9]:
X

Unnamed: 0,filepath,user_id,time,indicator1,indicator2,indicator3,indicator4,indicator5,indicator6,indicator7,indicator8
0,DataSet/trainset/UGE/user1.csv,0,0,46.0,910.5,3.50,47.67,48.0,52.0,0.2745,0.0750
1,DataSet/trainset/UGE/user1.csv,0,1,0.0,0.0,0.00,0.00,0.0,0.0,0.0000,0.4286
2,DataSet/trainset/UGE/user1.csv,0,2,5.0,6.0,2.67,4.67,0.0,4.0,0.0000,0.0000
3,DataSet/trainset/UGE/user1.csv,0,3,46.0,1971.0,7.33,46.00,7.0,46.0,0.1000,0.0000
4,DataSet/trainset/UGE/user1.csv,0,4,25.0,3.0,9.00,25.00,3.0,48.0,0.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...
23958,DataSet/testset/UBE/user248.csv,499,23958,21.0,130.0,1.00,21.00,63.0,21.0,0.1538,0.0000
23959,DataSet/testset/UBE/user248.csv,499,23959,0.0,0.0,0.00,160.00,62.0,22.0,0.0000,0.0000
23960,DataSet/testset/UBE/user248.csv,499,23960,4.0,249.0,5.00,5.00,252.0,5.0,0.0000,0.0000
23961,DataSet/testset/UBE/user248.csv,499,23961,11.0,65.0,8.00,10.00,42.0,33.0,0.0526,0.0000


In [10]:
y.shape

(500,)

# TSFresh Feature Extraction

In [11]:
from tsfresh import extract_features
from tsfresh.feature_extraction import EfficientFCParameters
from tsfresh.utilities.dataframe_functions import impute

# extracts tsfresh features
extracted_features = extract_features(X.iloc[:, 1:],  # process without filepath column
                                      column_id="user_id",
                                      column_sort="time",
                                      default_fc_parameters=EfficientFCParameters(),  # efficient parameters to cut down computation time
                                      impute_function=impute)

Feature Extraction: 100%|██████████| 20/20 [3:03:53<00:00, 551.68s/it]  


In [12]:
extracted_features.head()

Unnamed: 0,indicator1__variance_larger_than_standard_deviation,indicator1__has_duplicate_max,indicator1__has_duplicate_min,indicator1__has_duplicate,indicator1__sum_values,indicator1__abs_energy,indicator1__mean_abs_change,indicator1__mean_change,indicator1__mean_second_derivative_central,indicator1__median,...,indicator8__permutation_entropy__dimension_6__tau_1,indicator8__permutation_entropy__dimension_7__tau_1,indicator8__query_similarity_count__query_None__threshold_0.0,"indicator8__matrix_profile__feature_""min""__threshold_0.98","indicator8__matrix_profile__feature_""max""__threshold_0.98","indicator8__matrix_profile__feature_""mean""__threshold_0.98","indicator8__matrix_profile__feature_""median""__threshold_0.98","indicator8__matrix_profile__feature_""25""__threshold_0.98","indicator8__matrix_profile__feature_""75""__threshold_0.98",indicator8__mean_n_absolute_max__number_of_maxima_7
0,1.0,0.0,1.0,1.0,56425.82,4426622.0,20.551836,-0.005151,0.013581,23.73,...,4.655726,5.615643,0.0,2.492759,5.15215,4.230855,4.275636,4.004746,4.520421,0.472286
1,1.0,0.0,1.0,1.0,278008.73,1055541000.0,24.88125,0.000334,0.000204,16.6,...,1.417866,1.709721,0.0,4.974313,19.317951,16.271742,17.079996,15.681324,17.894127,0.4737
2,1.0,0.0,1.0,1.0,313018.09,21431410000.0,84.307619,0.001619,-8.1e-05,12.0,...,2.261113,2.73403,0.0,4.694198,24.345355,20.799397,21.931388,19.671169,22.981645,0.464629
3,1.0,0.0,1.0,1.0,756723.16,3622832000.0,45.872286,-0.000736,0.000177,13.175,...,2.818944,3.434586,0.0,3.38785,13.842438,10.034405,10.354981,8.872922,11.497783,0.471629
4,1.0,0.0,1.0,1.0,317934.1,407941200.0,10.449819,0.000205,0.000105,10.0,...,0.343271,0.409839,0.0,25.598598,79.276272,70.001092,71.41966,68.540246,75.580976,0.419714


In [13]:
# tsfresh extracted features description
extracted_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 0 to 499
Columns: 6264 entries, indicator1__variance_larger_than_standard_deviation to indicator8__mean_n_absolute_max__number_of_maxima_7
dtypes: float64(6264)
memory usage: 23.9 MB


In [16]:
# save TSFresh extracted features dataframe
extracted_features.to_pickle("tsfresh_extracted_features.pickle")

In [17]:
# save y variable
np.save("tsfresh_y.npy", y)

# XGBoost (Baseline)

In [18]:
# train/val/test split
X_train = extracted_features.iloc[:300, :]
X_val = extracted_features.iloc[300:400, :]
X_test = extracted_features.iloc[400:500, :]

y_train = y[:300]
y_val = y[300:400]
y_test = y[400:500]

In [19]:
from xgboost import XGBClassifier

# xgboost baseline
model = XGBClassifier()
model.fit(X_train, y_train)

In [20]:
from sklearn.metrics import classification_report

# predictions on validation set
y_pred = model.predict(X_val, ntree_limit = model.best_ntree_limit)

# validation set classification report
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.53      0.56      0.54        50
           1       0.53      0.50      0.52        50

    accuracy                           0.53       100
   macro avg       0.53      0.53      0.53       100
weighted avg       0.53      0.53      0.53       100





In [21]:
# predictions on test set
y_infer = model.predict(X_test, ntree_limit = model.best_ntree_limit)

# validation set classification report
print(classification_report(y_test, y_infer))

              precision    recall  f1-score   support

           0       0.53      0.66      0.59        50
           1       0.55      0.42      0.48        50

    accuracy                           0.54       100
   macro avg       0.54      0.54      0.53       100
weighted avg       0.54      0.54      0.53       100





In [22]:
# save model to JSON
model.save_model("tsfresh_xgboost_baseline.json")

# XGBoost + PCA + Min-Max Normalization

In [25]:
from sklearn.preprocessing import MinMaxScaler

# applys min-max normalization
scaler = MinMaxScaler()
data_rescaled = pd.DataFrame(scaler.fit_transform(extracted_features.values),
                             columns=extracted_features.columns,
                             index=extracted_features.index)

In [26]:
data_rescaled.head()

Unnamed: 0,indicator1__variance_larger_than_standard_deviation,indicator1__has_duplicate_max,indicator1__has_duplicate_min,indicator1__has_duplicate,indicator1__sum_values,indicator1__abs_energy,indicator1__mean_abs_change,indicator1__mean_change,indicator1__mean_second_derivative_central,indicator1__median,...,indicator8__permutation_entropy__dimension_6__tau_1,indicator8__permutation_entropy__dimension_7__tau_1,indicator8__query_similarity_count__query_None__threshold_0.0,"indicator8__matrix_profile__feature_""min""__threshold_0.98","indicator8__matrix_profile__feature_""max""__threshold_0.98","indicator8__matrix_profile__feature_""mean""__threshold_0.98","indicator8__matrix_profile__feature_""median""__threshold_0.98","indicator8__matrix_profile__feature_""25""__threshold_0.98","indicator8__matrix_profile__feature_""75""__threshold_0.98",indicator8__mean_n_absolute_max__number_of_maxima_7
0,0.0,0.0,0.0,0.0,0.001491,1e-06,0.001747,0.633104,0.601,0.964634,...,1.0,0.999872,0.0,0.020688,0.0,0.001627,0.002156,0.008936,0.0,0.963355
1,0.0,0.0,0.0,0.0,0.010295,0.000423,0.002496,0.636331,0.589862,0.674797,...,0.258607,0.259065,0.0,0.073231,0.191109,0.184404,0.192445,0.188252,0.188202,0.966398
2,0.0,0.0,0.0,0.0,0.011686,0.008594,0.012773,0.637088,0.589625,0.487805,...,0.451691,0.453338,0.0,0.0673,0.258933,0.253132,0.264543,0.249523,0.259796,0.946877
3,0.0,0.0,0.0,0.0,0.029316,0.001452,0.006126,0.635702,0.58984,0.535569,...,0.579421,0.586207,0.0,0.03964,0.11724,0.089723,0.092503,0.083696,0.098189,0.96194
4,0.0,0.0,0.0,0.0,0.011881,0.000163,0.0,0.636256,0.58978,0.406504,...,0.012551,0.012526,0.0,0.509916,1.0,1.0,1.0,1.0,1.0,0.850221


In [132]:
from sklearn.decomposition import PCA

# applys PCA
pca = PCA(n_components=0.95)
reduced_data = pd.DataFrame(pca.fit_transform(data_rescaled))

In [133]:
reduced_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,387,388,389,390,391,392,393,394,395,396
0,-1.925729,-1.630144,-0.905216,-0.303874,0.400261,0.860415,0.035264,0.963711,-0.580948,-0.478045,...,0.316132,0.191492,0.203838,-0.277691,-0.6957,0.142555,0.027108,0.09682,-0.229293,0.287987
1,2.500573,-0.451125,1.839606,0.634731,-0.117823,-0.56447,1.079137,0.481985,-0.417139,-0.246845,...,-0.033575,0.224289,0.178397,-0.123016,-0.572852,0.183357,-0.54144,0.147646,-0.449401,-0.164491
2,-2.62402,-1.801594,-1.33741,-1.168962,-0.795646,-0.355878,-0.309051,1.109821,1.759091,-2.403549,...,-0.288909,0.171892,-0.180467,-0.001054,-0.053769,-0.65921,-0.230125,-0.087473,0.024347,-0.169916
3,2.662198,1.696272,-0.962033,-0.54971,1.080219,-1.132137,-0.715902,1.092263,1.342117,0.455166,...,0.003626,0.105044,-0.112859,0.390398,-0.220451,-0.19644,0.168918,0.164115,0.304205,0.133395
4,-1.705261,-1.543804,1.003125,-0.162001,-2.038776,-1.90292,0.366085,-0.011662,0.391175,-0.493964,...,0.395367,-0.018489,-0.204482,-0.043257,-0.084022,-0.231142,-0.00777,-0.284879,-0.145712,0.41976


In [134]:
# train/val/test split
X_train = reduced_data.iloc[:300, :]
X_val = reduced_data.iloc[300:400, :]
X_test = reduced_data.iloc[400:500, :]

y_train = y[:300]
y_val = y[300:400]
y_test = y[400:500]

In [139]:
from xgboost import XGBClassifier

# xgboost baseline
model = XGBClassifier()
model.fit(X_train, y_train)

In [141]:
# predictions on testing set
y_infer = model.predict(X_test, ntree_limit = model.best_ntree_limit)

# validation set classification report
print(classification_report(y_test, y_infer))

              precision    recall  f1-score   support

           0       0.66      0.62      0.64        50
           1       0.64      0.68      0.66        50

    accuracy                           0.65       100
   macro avg       0.65      0.65      0.65       100
weighted avg       0.65      0.65      0.65       100





In [142]:
# save model to JSON
model.save_model("tsfresh_xgboost_baseline_pca.json")