In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import (
    classification_report
)
import lightgbm as lgb
import numpy as np

## Dataset Preparation

In [2]:
# Number of cross-validation folds; also used as the divisor when averaging metrics.
NUM_FOLD = 5

# Read the three pre-built splits in train / val / test order.
train_df, val_df, test_df = map(
    pd.read_parquet,
    (
        "train_df_link_lp.parquet",
        "val_df_link_lp.parquet",
        "test_df_link_lp.parquet",
    ),
)

In [3]:
# Per-cluster table read from CSV; presumably the source of the "GUASTO CAVO"
# target column used by the model cells — confirm against the file's schema.
cluster_with_guastocavo_labels = pd.read_csv(
    "20230101-20240101_real_time_clusters_filtered_guasto_cavo.csv"
)

# Left-join each split with the CSV table on cluster_id2, then use that key as
# the index. Splits are processed in train / val / test order.
merge_train_df, merge_val_df, merge_test_df = (
    split_df.merge(
        cluster_with_guastocavo_labels, on="cluster_id2", how="left"
    ).set_index("cluster_id2")
    for split_df in (train_df, val_df, test_df)
)

In [4]:
# Pool the three merged splits and shuffle the rows with a fixed seed so the
# fold assignment is reproducible.
merged_df = pd.concat([merge_train_df, merge_val_df, merge_test_df])
shuffled_df = merged_df.sample(frac=1, random_state=7)

# Cut the shuffled frame into five consecutive slices. Each (start, stop) pair
# is a positional bound; None means "open-ended" on that side.
fold1, fold2, fold3, fold4, fold5 = (
    shuffled_df.iloc[start:stop]
    for start, stop in (
        (None, len(shuffled_df) // NUM_FOLD),
        (len(shuffled_df) // NUM_FOLD, 2 * len(shuffled_df) // NUM_FOLD),
        (2 * len(shuffled_df) // NUM_FOLD, 3 * len(shuffled_df) // NUM_FOLD),
        (3 * len(shuffled_df) // NUM_FOLD, 4 * len(shuffled_df) // NUM_FOLD),
        (4 * len(shuffled_df) // NUM_FOLD, None),
    )
)

In [5]:
def print_readable_metrics(avg_metrics):
    """Print a fixed-width table of per-label metrics, then the accuracy.

    Parameters
    ----------
    avg_metrics : dict
        Maps each label to a dict with the keys ``'precision'``, ``'recall'``,
        ``'f1-score'`` and ``'support'``. Must also contain an ``"accuracy"``
        entry holding a number; it is multiplied by 100 and shown as a
        percentage.

    Returns
    -------
    None
        Everything is written to standard output.

    Notes
    -----
    ``support`` is truncated with ``int()`` before being printed.
    """
    header_template = "{:<10} {:<12} {:<12} {:<12} {:<12}"
    row_template = "{:<10} {:<12.2f} {:<12.2f} {:<12.2f} {:<12}"

    print("Metriche Medie per Fold:\n")
    print(header_template.format('Label', 'Precision', 'Recall', 'F1-Score', 'Support'))

    for label in avg_metrics:
        # The accuracy entry is not a per-label row; it is printed after the table.
        if label == "accuracy":
            continue
        scores = avg_metrics[label]
        precision = scores['precision']
        recall = scores['recall']
        f1_score = scores['f1-score']
        support = int(scores['support'])
        print(row_template.format(label, precision, recall, f1_score, support))

    print("\nAccuracy: {:.2f}%".format(avg_metrics["accuracy"] * 100))

## Random Forest


In [16]:
# Cross-validate a RandomForestClassifier over the five pre-built folds: each
# fold is held out once for validation while the other four are used for
# training. One classification report per fold is collected in `results`,
# then the per-class metrics and the accuracy are averaged over the folds.
cv_folds = [fold1, fold2, fold3, fold4, fold5]
# Keys the code reads from every classification report (one per class).
class_keys = ["False", "True"]
results = []

# The loop variable is `fold_idx`, not `iter`, so the builtin `iter()` is not
# shadowed by an int for the rest of the kernel session.
for fold_idx in range(NUM_FOLD):

    # Fold-specific names keep the `train_df` / `val_df` frames loaded at the
    # top of the notebook intact, so earlier cells can still be re-run.
    fold_val_df = cv_folds[fold_idx]
    fold_train_df = pd.concat(
        [fold for i, fold in enumerate(cv_folds) if i != fold_idx]
    )

    X_train = fold_train_df.drop(columns=["GUASTO CAVO"])
    y_train = fold_train_df["GUASTO CAVO"].values.ravel()

    X_val = fold_val_df.drop(columns=["GUASTO CAVO"])
    y_val = fold_val_df["GUASTO CAVO"]

    clf = RandomForestClassifier(n_estimators=400, random_state=42)
    clf.fit(X_train, y_train)
    y_val_pred = clf.predict(X_val)
    results.append(classification_report(y_val, y_val_pred, output_dict=True))

# Accumulate every per-class metric and the accuracy across the fold reports.
sum_metrics = {
    key: {"precision": 0, "recall": 0, "f1-score": 0, "support": 0}
    for key in class_keys
}
sum_metrics["accuracy"] = 0

for report in results:
    for key in class_keys:
        for metric in sum_metrics[key]:
            sum_metrics[key][metric] += report[key][metric]
    sum_metrics["accuracy"] += report["accuracy"]

# Turn the sums into per-fold means (note: "support" is averaged as well).
avg_metrics = {
    key: {metric: total / NUM_FOLD for metric, total in sum_metrics[key].items()}
    for key in class_keys
}
avg_metrics["accuracy"] = sum_metrics["accuracy"] / NUM_FOLD

print_readable_metrics(avg_metrics)

Metriche Medie per Fold:

Label      Precision    Recall       F1-Score     Support     
False      0.56         0.27         0.37         253         
True       0.85         0.95         0.90         1081        

Accuracy: 82.06%


## Logistic Regression


In [12]:
# Cross-validate a LogisticRegression model over the five pre-built folds:
# each fold is held out once for validation while the other four are used for
# training. One classification report per fold is collected in `results`,
# then the per-class metrics and the accuracy are averaged over the folds.
cv_folds = [fold1, fold2, fold3, fold4, fold5]
# Keys the code reads from every classification report (one per class).
class_keys = ["False", "True"]
results = []

# The loop variable is `fold_idx`, not `iter`, so the builtin `iter()` is not
# shadowed by an int for the rest of the kernel session.
for fold_idx in range(NUM_FOLD):

    # Fold-specific names keep the `train_df` / `val_df` frames loaded at the
    # top of the notebook intact, so earlier cells can still be re-run.
    fold_val_df = cv_folds[fold_idx]
    fold_train_df = pd.concat(
        [fold for i, fold in enumerate(cv_folds) if i != fold_idx]
    )

    X_train = fold_train_df.drop(columns=["GUASTO CAVO"])
    y_train = fold_train_df["GUASTO CAVO"].values.ravel()

    X_val = fold_val_df.drop(columns=["GUASTO CAVO"])
    y_val = fold_val_df["GUASTO CAVO"]

    clf = LogisticRegression(random_state=42, max_iter=200)
    clf.fit(X_train, y_train)
    y_val_pred = clf.predict(X_val)
    results.append(classification_report(y_val, y_val_pred, output_dict=True))

# Accumulate every per-class metric and the accuracy across the fold reports.
sum_metrics = {
    key: {"precision": 0, "recall": 0, "f1-score": 0, "support": 0}
    for key in class_keys
}
sum_metrics["accuracy"] = 0

for report in results:
    for key in class_keys:
        for metric in sum_metrics[key]:
            sum_metrics[key][metric] += report[key][metric]
    sum_metrics["accuracy"] += report["accuracy"]

# Turn the sums into per-fold means (note: "support" is averaged as well).
avg_metrics = {
    key: {metric: total / NUM_FOLD for metric, total in sum_metrics[key].items()}
    for key in class_keys
}
avg_metrics["accuracy"] = sum_metrics["accuracy"] / NUM_FOLD

print_readable_metrics(avg_metrics)

Metriche Medie per Fold:

Label      Precision    Recall       F1-Score     Support     
False      0.61         0.17         0.26         253         
True       0.83         0.97         0.90         1081        

Accuracy: 82.18%


## Support Vector Machines


In [13]:
# Cross-validate a linear-kernel SVC over the five pre-built folds: each fold
# is held out once for validation while the other four are used for training.
# One classification report per fold is collected in `results`, then the
# per-class metrics and the accuracy are averaged over the folds.
cv_folds = [fold1, fold2, fold3, fold4, fold5]
# Keys the code reads from every classification report (one per class).
class_keys = ["False", "True"]
results = []

# The loop variable is `fold_idx`, not `iter`, so the builtin `iter()` is not
# shadowed by an int for the rest of the kernel session.
for fold_idx in range(NUM_FOLD):

    # Fold-specific names keep the `train_df` / `val_df` frames loaded at the
    # top of the notebook intact, so earlier cells can still be re-run.
    fold_val_df = cv_folds[fold_idx]
    fold_train_df = pd.concat(
        [fold for i, fold in enumerate(cv_folds) if i != fold_idx]
    )

    X_train = fold_train_df.drop(columns=["GUASTO CAVO"])
    y_train = fold_train_df["GUASTO CAVO"].values.ravel()

    X_val = fold_val_df.drop(columns=["GUASTO CAVO"])
    y_val = fold_val_df["GUASTO CAVO"]

    clf = SVC(kernel="linear", random_state=42)
    clf.fit(X_train, y_train)
    y_val_pred = clf.predict(X_val)
    results.append(classification_report(y_val, y_val_pred, output_dict=True))

# Accumulate every per-class metric and the accuracy across the fold reports.
sum_metrics = {
    key: {"precision": 0, "recall": 0, "f1-score": 0, "support": 0}
    for key in class_keys
}
sum_metrics["accuracy"] = 0

for report in results:
    for key in class_keys:
        for metric in sum_metrics[key]:
            sum_metrics[key][metric] += report[key][metric]
    sum_metrics["accuracy"] += report["accuracy"]

# Turn the sums into per-fold means (note: "support" is averaged as well).
avg_metrics = {
    key: {metric: total / NUM_FOLD for metric, total in sum_metrics[key].items()}
    for key in class_keys
}
avg_metrics["accuracy"] = sum_metrics["accuracy"] / NUM_FOLD

print_readable_metrics(avg_metrics)

Metriche Medie per Fold:

Label      Precision    Recall       F1-Score     Support     
False      0.57         0.13         0.21         253         
True       0.83         0.98         0.90         1081        

Accuracy: 81.54%


## Gradient Boosting Machines


In [15]:
# Cross-validate a LightGBM binary classifier over the five pre-built folds:
# each fold is held out once for validation while the other four are used for
# training. One classification report per fold is collected in `results`,
# then the per-class metrics and the accuracy are averaged over the folds.
cv_folds = [fold1, fold2, fold3, fold4, fold5]
# Keys the code reads from every classification report (one per class).
class_keys = ["False", "True"]
results = []

# Parameter definitions passed to lgb.train
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "binary_logloss",
    "learning_rate": 0.05,
    "num_leaves": 31,
    "feature_fraction": 0.9,
    "force_row_wise": True,
}

# Number of boosting rounds handed to lgb.train for every fold.
num_round = 2000

# The loop variable is `fold_idx`, not `iter`, so the builtin `iter()` is not
# shadowed by an int for the rest of the kernel session.
for fold_idx in range(NUM_FOLD):

    # Fold-specific names keep the `train_df` / `val_df` frames loaded at the
    # top of the notebook intact, so earlier cells can still be re-run.
    fold_val_df = cv_folds[fold_idx]
    fold_train_df = pd.concat(
        [fold for i, fold in enumerate(cv_folds) if i != fold_idx]
    )

    X_train = fold_train_df.drop(columns=["GUASTO CAVO"])
    y_train = fold_train_df["GUASTO CAVO"].values.ravel()

    X_val = fold_val_df.drop(columns=["GUASTO CAVO"])
    y_val = fold_val_df["GUASTO CAVO"]

    train_data = lgb.Dataset(X_train, label=y_train)

    bst = lgb.train(params, train_data, num_round)
    # NOTE(review): no validation set or early-stopping callback is passed to
    # lgb.train above, so `best_iteration` is not chosen by early stopping;
    # presumably this falls back to using all trained rounds — confirm against
    # the LightGBM docs.
    y_pred = bst.predict(X_val, num_iteration=bst.best_iteration)
    # Same rounding as before, then cast to bool so the predictions have the
    # same label type the report keys below assume ("False" / "True") instead
    # of floats 0.0 / 1.0. Assumes the "GUASTO CAVO" column is boolean — TODO
    # confirm.
    y_val_pred = np.round(y_pred).astype(bool)
    results.append(classification_report(y_val, y_val_pred, output_dict=True))

# Accumulate every per-class metric and the accuracy across the fold reports.
sum_metrics = {
    key: {"precision": 0, "recall": 0, "f1-score": 0, "support": 0}
    for key in class_keys
}
sum_metrics["accuracy"] = 0

for report in results:
    for key in class_keys:
        for metric in sum_metrics[key]:
            sum_metrics[key][metric] += report[key][metric]
    sum_metrics["accuracy"] += report["accuracy"]

# Turn the sums into per-fold means (note: "support" is averaged as well).
avg_metrics = {
    key: {metric: total / NUM_FOLD for metric, total in sum_metrics[key].items()}
    for key in class_keys
}
avg_metrics["accuracy"] = sum_metrics["accuracy"] / NUM_FOLD

print_readable_metrics(avg_metrics)

[LightGBM] [Info] Number of positive: 4318, number of negative: 1021
[LightGBM] [Info] Total Bins 686
[LightGBM] [Info] Number of data points in the train set: 5339, number of used features: 343
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.808766 -> initscore=1.442010
[LightGBM] [Info] Start training from score 1.442010
[LightGBM] [Info] Number of positive: 4320, number of negative: 1018
[LightGBM] [Info] Total Bins 692
[LightGBM] [Info] Number of data points in the train set: 5338, number of used features: 346
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.809292 -> initscore=1.445415
[LightGBM] [Info] Start training from score 1.445415
[LightGBM] [Info] Number of positive: 4327, number of negative: 1012
[LightGBM] [Info] Total Bins 698
[LightGBM] [Info] Number of data points in the train set: 5339, number of used features: 349
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.810451 -> initscore=1.452946
[LightGBM] [Info] Start training from score 1.452946
[LightGBM] [Info] N