In [51]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import (
    classification_report
)
import lightgbm as lgb
import numpy as np

## Dataset Preparation

In [52]:
# Number of cross-validation folds used by the model cells below.
NUM_FOLD = 12

# Pre-split feature tables, read in train / val / test order.
train_df, val_df, test_df = [
    pd.read_parquet(split_path)
    for split_path in (
        "train_df_slogan_lp.parquet",
        "val_df_slogan_lp.parquet",
        "test_df_slogan_lp.parquet",
    )
]

In [53]:
# Load the full feature table from parquet; it is displayed in the next cell.
# None of the other cells visible in this part of the notebook reference it.
dataset = pd.read_parquet("result_df_gt_2_slogan_lp.parquet")

In [54]:
# Show the loaded feature table (pandas truncates the rows and columns in the rendered output).
dataset

Unnamed: 0_level_0,apparato isolato,fermo parziale sito tec,None,alimentazione,signal degraded,loss of frame lof,guasto matrice,fermo parziale gsm,fermo totale gsm,fermo totale 5g,...,cardinitcard initializing,mpls tunnel rdi mpls tunnel rdi,vc unequipped ho vc unequipped,guasto controllore,vlan megaco ko,serversignalfailure odu ssf,unidentified specific problem non inviato da agent,replaceablemodulemissing eqpt,grave disservizio onuc,crdpwroff card power off xexc xexc
cluster_id2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
202301090504_2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
202301090504_3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
202301090506_14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
202301090506_4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
202301090506_5,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202312200220_8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
202312200222_2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
202312200604_3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
202312210200_6,1,0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
# Per-cluster label table keyed by "cluster_id2"; presumably the source of the
# "GUASTO CAVO" target column used by the model cells — confirm against the CSV.
cluster_with_guastocavo_labels = pd.read_csv(
    "20230101-20240101_real_time_clusters_filtered_guasto_cavo.csv"
)

# Left-join the label table onto each split (every row of the split is kept),
# then index the joined frame by cluster id.
merge_train_df = train_df.merge(
    cluster_with_guastocavo_labels, on="cluster_id2", how="left"
).set_index("cluster_id2")

merge_val_df = val_df.merge(
    cluster_with_guastocavo_labels, on="cluster_id2", how="left"
).set_index("cluster_id2")

merge_test_df = test_df.merge(
    cluster_with_guastocavo_labels, on="cluster_id2", how="left"
).set_index("cluster_id2")

In [56]:
# Stack the three labelled splits (train, val, test order) into a single frame;
# the next cell shuffles it and re-partitions it into cross-validation folds.
merged_df = pd.concat([merge_train_df, merge_val_df, merge_test_df])

In [57]:
# Shuffle every row once with a fixed seed so the fold assignment is reproducible.
shuffled_df = merged_df.sample(frac=1, random_state=7)

# Fold k covers rows [k * n // NUM_FOLD, (k + 1) * n // NUM_FOLD); the last fold
# runs to the end of the frame, so no row is dropped when n is not a multiple
# of NUM_FOLD.
n_rows = len(shuffled_df)
fold_edges = [k * n_rows // NUM_FOLD for k in range(NUM_FOLD)] + [n_rows]
folds = [
    shuffled_df.iloc[start:stop]
    for start, stop in zip(fold_edges[:-1], fold_edges[1:])
]

# The model cells refer to the folds by name, so fold1..fold12 stay bound.
# This unpacking raises immediately if NUM_FOLD is changed away from 12,
# instead of silently producing folds that do not match NUM_FOLD.
(
    fold1, fold2, fold3, fold4, fold5, fold6,
    fold7, fold8, fold9, fold10, fold11, fold12,
) = folds

In [58]:
def print_readable_metrics(avg_metrics):
    """Print fold-averaged classification metrics as an aligned text table.

    Args:
        avg_metrics: mapping whose "accuracy" entry is a fraction (printed as a
            percentage) and whose every other entry maps a label to a dict with
            the keys 'precision', 'recall', 'f1-score' and 'support'.

    Raises:
        KeyError: if "accuracy" or one of the per-label metric keys is missing.
    """
    header_template = "{:<10} {:<12} {:<12} {:<12} {:<12}"
    row_template = "{:<10} {:<12.2f} {:<12.2f} {:<12.2f} {:<12}"

    print("Metriche Medie per Fold:\n")
    print(header_template.format('Label', 'Precision', 'Recall', 'F1-Score', 'Support'))

    for label, metrics in avg_metrics.items():
        # The overall accuracy is a scalar, reported on its own line below.
        if label == "accuracy":
            continue
        row_values = (
            label,
            metrics['precision'],
            metrics['recall'],
            metrics['f1-score'],
            int(metrics['support']),  # averaged support is truncated to a whole number
        )
        print(row_template.format(*row_values))

    accuracy_pct = avg_metrics["accuracy"] * 100
    print("\nAccuracy: {:.2f}%".format(accuracy_pct))

## Random Forest


In [59]:
# Cross-validate a random forest on the "GUASTO CAVO" label: one
# classification report per held-out fold, then fold-averaged metrics.
results = []

# Build the fold list once; it used to be rebuilt twice on every iteration.
folds = [fold1, fold2, fold3, fold4, fold5, fold6, fold7, fold8, fold9, fold10, fold11, fold12]

# The loop index is `fold_idx` rather than `iter`, which shadowed the builtin
# `iter` for every cell executed afterwards.
for fold_idx in range(NUM_FOLD):
    # Hold out one fold for validation and train on all the others.
    # NOTE: this rebinds the notebook-level `train_df` / `val_df` loaded in the
    # dataset-preparation cell; the names are kept because later cells may read them.
    val_df = folds[fold_idx]
    train_df = pd.concat([fold for i, fold in enumerate(folds) if i != fold_idx])

    X_train = train_df.drop(columns=["GUASTO CAVO"])
    y_train = train_df["GUASTO CAVO"].values.ravel()

    X_val = val_df.drop(columns=["GUASTO CAVO"])
    y_val = val_df["GUASTO CAVO"]

    clf = RandomForestClassifier(n_estimators=800, random_state=42)
    clf.fit(X_train, y_train)
    y_val_pred = clf.predict(X_val)
    # zero_division=0 reports the same value sklearn's default substitutes for
    # an undefined metric, without emitting a warning for that fold.
    results.append(
        classification_report(y_val, y_val_pred, output_dict=True, zero_division=0)
    )

# Report keys are the stringified class labels.
class_keys = ["False", "True"]
metric_names = ["precision", "recall", "f1-score", "support"]

# Sum every per-class metric and the accuracy over the fold reports.
sum_metrics = {
    key: {
        metric: sum(report[key][metric] for report in results)
        for metric in metric_names
    }
    for key in class_keys
}
sum_metrics["accuracy"] = sum(report["accuracy"] for report in results)

# Unweighted mean over folds (support is averaged as well).
avg_metrics = {
    key: {metric: total / NUM_FOLD for metric, total in sum_metrics[key].items()}
    for key in class_keys
}
avg_metrics["accuracy"] = sum_metrics["accuracy"] / NUM_FOLD

print_readable_metrics(avg_metrics)

Metriche Medie per Fold:

Label      Precision    Recall       F1-Score     Support     
False      0.42         0.15         0.22         55          
True       0.84         0.96         0.89         253         

Accuracy: 81.04%


## Logistic Regression


In [60]:
# Cross-validate a logistic regression on the "GUASTO CAVO" label: one
# classification report per held-out fold, then fold-averaged metrics.
results = []

# Build the fold list once; it used to be rebuilt twice on every iteration.
folds = [fold1, fold2, fold3, fold4, fold5, fold6, fold7, fold8, fold9, fold10, fold11, fold12]

# The loop index is `fold_idx` rather than `iter`, which shadowed the builtin
# `iter` for every cell executed afterwards.
for fold_idx in range(NUM_FOLD):
    # Hold out one fold for validation and train on all the others.
    # NOTE: this rebinds the notebook-level `train_df` / `val_df` loaded in the
    # dataset-preparation cell; the names are kept because later cells may read them.
    val_df = folds[fold_idx]
    train_df = pd.concat([fold for i, fold in enumerate(folds) if i != fold_idx])

    X_train = train_df.drop(columns=["GUASTO CAVO"])
    y_train = train_df["GUASTO CAVO"].values.ravel()

    X_val = val_df.drop(columns=["GUASTO CAVO"])
    y_val = val_df["GUASTO CAVO"]

    clf = LogisticRegression(random_state=42, max_iter=200)
    clf.fit(X_train, y_train)
    y_val_pred = clf.predict(X_val)
    # zero_division=0 reports the same value sklearn's default substitutes for
    # an undefined metric, without emitting a warning for that fold.
    results.append(
        classification_report(y_val, y_val_pred, output_dict=True, zero_division=0)
    )

# Report keys are the stringified class labels.
class_keys = ["False", "True"]
metric_names = ["precision", "recall", "f1-score", "support"]

# Sum every per-class metric and the accuracy over the fold reports.
sum_metrics = {
    key: {
        metric: sum(report[key][metric] for report in results)
        for metric in metric_names
    }
    for key in class_keys
}
sum_metrics["accuracy"] = sum(report["accuracy"] for report in results)

# Unweighted mean over folds (support is averaged as well).
avg_metrics = {
    key: {metric: total / NUM_FOLD for metric, total in sum_metrics[key].items()}
    for key in class_keys
}
avg_metrics["accuracy"] = sum_metrics["accuracy"] / NUM_FOLD

print_readable_metrics(avg_metrics)

Metriche Medie per Fold:

Label      Precision    Recall       F1-Score     Support     
False      0.63         0.06         0.10         55          
True       0.83         0.99         0.90         253         

Accuracy: 82.44%


## Support Vector Machines


In [61]:
# Cross-validate a linear-kernel SVM on the "GUASTO CAVO" label: one
# classification report per held-out fold, then fold-averaged metrics.
results = []

# Build the fold list once; it used to be rebuilt twice on every iteration.
folds = [fold1, fold2, fold3, fold4, fold5, fold6, fold7, fold8, fold9, fold10, fold11, fold12]

# The loop index is `fold_idx` rather than `iter`, which shadowed the builtin
# `iter` for every cell executed afterwards.
for fold_idx in range(NUM_FOLD):
    # Hold out one fold for validation and train on all the others.
    # NOTE: this rebinds the notebook-level `train_df` / `val_df` loaded in the
    # dataset-preparation cell; the names are kept because later cells may read them.
    val_df = folds[fold_idx]
    train_df = pd.concat([fold for i, fold in enumerate(folds) if i != fold_idx])

    X_train = train_df.drop(columns=["GUASTO CAVO"])
    y_train = train_df["GUASTO CAVO"].values.ravel()

    X_val = val_df.drop(columns=["GUASTO CAVO"])
    y_val = val_df["GUASTO CAVO"]

    clf = SVC(kernel="linear", random_state=42)
    clf.fit(X_train, y_train)
    y_val_pred = clf.predict(X_val)
    # The recorded output of this cell shows sklearn's undefined-metric warnings
    # (_warn_prf). zero_division=0 keeps the value the default substitutes in
    # that case and drops the warning.
    results.append(
        classification_report(y_val, y_val_pred, output_dict=True, zero_division=0)
    )

# Report keys are the stringified class labels.
class_keys = ["False", "True"]
metric_names = ["precision", "recall", "f1-score", "support"]

# Sum every per-class metric and the accuracy over the fold reports.
sum_metrics = {
    key: {
        metric: sum(report[key][metric] for report in results)
        for metric in metric_names
    }
    for key in class_keys
}
sum_metrics["accuracy"] = sum(report["accuracy"] for report in results)

# Unweighted mean over folds (support is averaged as well).
avg_metrics = {
    key: {metric: total / NUM_FOLD for metric, total in sum_metrics[key].items()}
    for key in class_keys
}
avg_metrics["accuracy"] = sum_metrics["accuracy"] / NUM_FOLD

print_readable_metrics(avg_metrics)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Metriche Medie per Fold:

Label      Precision    Recall       F1-Score     Support     
False      0.68         0.03         0.06         55          
True       0.82         1.00         0.90         253         

Accuracy: 82.45%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Gradient Boosting Machines


In [62]:
# Cross-validate a LightGBM binary classifier on the "GUASTO CAVO" label: one
# classification report per held-out fold, then fold-averaged metrics.
results = []

# LightGBM training parameters.
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "binary_logloss",
    "learning_rate": 0.05,
    "num_leaves": 31,
    "feature_fraction": 0.9,
    "force_row_wise": True,
}

# Number of boosting rounds per fold.
num_round = 2000

# Build the fold list once; it used to be rebuilt twice on every iteration.
folds = [fold1, fold2, fold3, fold4, fold5, fold6, fold7, fold8, fold9, fold10, fold11, fold12]

# The loop index is `fold_idx` rather than `iter`, which shadowed the builtin
# `iter` for every cell executed afterwards.
for fold_idx in range(NUM_FOLD):
    # Hold out one fold for validation and train on all the others.
    # NOTE: this rebinds the notebook-level `train_df` / `val_df` loaded in the
    # dataset-preparation cell; the names are kept because later cells may read them.
    val_df = folds[fold_idx]
    train_df = pd.concat([fold for i, fold in enumerate(folds) if i != fold_idx])

    X_train = train_df.drop(columns=["GUASTO CAVO"])
    y_train = train_df["GUASTO CAVO"].values.ravel()

    X_val = val_df.drop(columns=["GUASTO CAVO"])
    y_val = val_df["GUASTO CAVO"]

    train_data = lgb.Dataset(X_train, label=y_train)

    # NOTE(review): no validation set or early-stopping callback is passed to
    # lgb.train, so bst.best_iteration presumably does not restrict the number
    # of trees used for prediction — confirm against the LightGBM docs.
    bst = lgb.train(params, train_data, num_round)
    y_pred = bst.predict(X_val, num_iteration=bst.best_iteration)

    # Threshold the predicted scores into booleans so the predictions have the
    # same type as the labels (assumed boolean, as the "False"/"True" report
    # keys suggest). The previous np.round(y_pred) produced floats 0.0/1.0;
    # the decision is unchanged because np.round maps exactly 0.5 down to 0.0.
    y_val_pred = y_pred > 0.5

    # zero_division=0 reports the same value sklearn's default substitutes for
    # an undefined metric, without emitting a warning for that fold.
    results.append(
        classification_report(y_val, y_val_pred, output_dict=True, zero_division=0)
    )

# Report keys are the stringified class labels.
class_keys = ["False", "True"]
metric_names = ["precision", "recall", "f1-score", "support"]

# Sum every per-class metric and the accuracy over the fold reports.
sum_metrics = {
    key: {
        metric: sum(report[key][metric] for report in results)
        for metric in metric_names
    }
    for key in class_keys
}
sum_metrics["accuracy"] = sum(report["accuracy"] for report in results)

# Unweighted mean over folds (support is averaged as well).
avg_metrics = {
    key: {metric: total / NUM_FOLD for metric, total in sum_metrics[key].items()}
    for key in class_keys
}
avg_metrics["accuracy"] = sum_metrics["accuracy"] / NUM_FOLD

print_readable_metrics(avg_metrics)

[LightGBM] [Info] Number of positive: 2795, number of negative: 610
[LightGBM] [Info] Total Bins 138
[LightGBM] [Info] Number of data points in the train set: 3405, number of used features: 69
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.820852 -> initscore=1.522128
[LightGBM] [Info] Start training from score 1.522128
[LightGBM] [Info] Number of positive: 2802, number of negative: 602
[LightGBM] [Info] Total Bins 134
[LightGBM] [Info] Number of data points in the train set: 3404, number of used features: 67
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.823149 -> initscore=1.537831
[LightGBM] [Info] Start training from score 1.537831
[LightGBM] [Info] Number of positive: 2794, number of negative: 611
[LightGBM] [Info] Total Bins 136
[LightGBM] [Info] Number of data points in the train set: 3405, number of used features: 68
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.820558 -> initscore=1.520133
[LightGBM] [Info] Start training from score 1.520133
[LightGBM] [Info] Number 