In [1]:
from utils.processing import *
import numpy as np
import yaml
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# First Trials

In [2]:
with open("config/config.yaml", 'r') as ymlfile:
    config = yaml.safe_load(ymlfile)
    
LABEL_NAMES = config['LABEL_NAMES']
RANDOM_STATE = config['RANDOM_STATE']

In [3]:

# Load

OUTPUT_DIR = Path("model/")
print("\n[1] LOADING DATA")
df_train, df_eval = load_file("data/development.csv", "data/evaluation.csv",sep=",", dtype = {"id": str, "page_rank": int, "label": int})
print(f"  Train: {len(df_train):,} | Eval: {len(df_eval):,}")

# Preprocess
print("\n[2] PREPROCESSING")
df_train = preprocessing_data(df_train)
print(f"  Puliti: {len(df_train):,}")

# Features
print("\n[3] FEATURE ENGINEERING")
df_train = generate_features(df_train, config=config)
df_eval = generate_features(df_eval, config=config)

# Split
X = df_train.drop(columns=['label'])
y = df_train['label']
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)
print(f"\n  Train: {len(X_train):,} | Val: {len(X_val):,}")

# Pipeline
pipe = FullPipeline(config=config)
X_train_proc, y_train_proc = pipe.fit_transform(X_train, y_train)
X_val_proc = pipe.transform(X_val)
feature_names = pipe.feature_names_


import sys
sys.exit()


# TUNING
try:
    import pickle
    print("\n[4] LOADING TUNED MODELS AND RESULTS")
    for model in ['lgbm','svc']:
        with open(f"model/best_model_{model}.pkl", "rb") as f:
            if model == 'lgbm':
                best_lgbm = pickle.load(f)
            else:
                best_svc = pickle.load(f)
        with open(f"model/best_params_{model}.pkl", "rb") as f:
            if model == 'lgbm':
                lgbm_params = pickle.load(f)
            else:
                svc_params = pickle.load(f)
        if model == 'lgbm':
            lgbm_results = pd.read_csv(f"model/lgbm_tuning_results.csv")
        else:
            svc_results = pd.read_csv(f"model/svc_tuning_results.csv")
except:
    best_svc, svc_params, svc_results = tune_linear_svc(X_train_proc, y_train_proc, cv=5)
    best_lgbm, lgbm_params, lgbm_results = tune_lightgbm(X_train_proc, y_train_proc, cv=5)
    
    



# EVALUATION
print("\n" + "="*70)
print("FINAL EVALUATION ON VALIDATION SET")
print("="*70)

results = {}

y_pred_svc = best_svc.predict(X_val_proc)
results['LinearSVC'] = detailed_report(y_val.values, y_pred_svc, "LinearSVC")

y_pred_lgbm = best_lgbm.predict(X_val_proc)
results['LightGBM'] = detailed_report(y_val.values, y_pred_lgbm, "LightGBM")

# VISUALIZATIONS
print("\n" + "="*70)
print("GENERATING VISUALIZATIONS")
print("="*70)

plot_confusion_matrix(y_val.values, y_pred_svc, "LinearSVC", 
                        save_path=str(OUTPUT_DIR / "cm_svc.png"))
plot_confusion_matrix(y_val.values, y_pred_lgbm, "LightGBM",
                        save_path=str(OUTPUT_DIR / "cm_lgbm.png"))
plot_f1_comparison(results, save_path=str(OUTPUT_DIR / "f1_comparison.png"))
plot_feature_importance_svc(best_svc, feature_names, top_n=15,
                                save_path=str(OUTPUT_DIR / "feat_imp_svc.png"))
plot_feature_importance_lgbm(best_lgbm, feature_names, top_n=30,
                                save_path=str(OUTPUT_DIR / "feat_imp_lgbm.png"))
#plot_cv_tuning(svc_results, 'C', 'LinearSVC', save_path=str(OUTPUT_DIR / "tuning_C.png"))

# SUMMARY
print("\n" + "="*70)
print("FINAL SUMMARY")
print("="*70)

print(f"\n{'Model':<15} {'Macro F1':>12} {'Weighted F1':>12} {'Accuracy':>12}")
print("-"*55)
for name, m in results.items():
    print(f"{name:<15} {m['macro_f1']:>12.4f} {m['weighted_f1']:>12.4f} {m['accuracy']:>12.4f}")

best_name = max(results, key=lambda x: results[x]['macro_f1'])
print(f"\n✅ BEST MODEL: {best_name} (Macro F1: {results[best_name]['macro_f1']:.4f})")

print("\n--- BEST HYPERPARAMETERS ---")
print(f"LinearSVC: {svc_params}")
print(f"LightGBM: {lgbm_params}")


print("\n[1] LOADING DATA")
df_train, df_eval = load_file("data/development.csv", "data/evaluation.csv",sep=",", dtype = {"id": str, "page_rank": int, "label": int})
print(f"  Train: {len(df_train):,} | Eval: {len(df_eval):,}")

# Preprocess
print("\n[2] PREPROCESSING")
df_train = preprocessing_data(df_train)
print(f"  Puliti: {len(df_train):,}")

# Features
print("\n[3] FEATURE ENGINEERING")
df_train = generate_features(df_train, config=config)
df_eval = generate_features(df_eval, config=config)

X = df_train.drop(columns=['label'])
y = df_train['label']

pipe_new = FullPipeline(config=config)

X_tot, y_tot = pipe_new.fit_transform(X, y)

# Submission
print("\n[GENERATING SUBMISSION]")
X_eval_proc = pipe_new.transform(df_eval)
print(f"  Eval Processed: {X_eval_proc.shape}")
print(f"  Using Best Model: {best_name}")
if best_name == 'LinearSVC':
    from sklearn.svm import LinearSVC
    svc = LinearSVC(**svc_params, random_state=RANDOM_STATE)
    svc.fit(X_tot, y_tot)   
    y_eval_pred = svc.predict(X_eval_proc)
else:
    from lightgbm import LGBMClassifier
    lgbm = LGBMClassifier(**lgbm_params)
    lgbm.fit(X_tot, y_tot)
    y_eval_pred = lgbm.predict(X_eval_proc)
    
    

submission = pd.DataFrame({
    'Id': df_eval['Id'],
    'Predicted': y_eval_pred.astype(int)
})
submission.to_csv(OUTPUT_DIR / 'submission_tuned.csv', index=False)
print(f"  Submission: {OUTPUT_DIR / 'submission_tuned.csv'}")

print("\n" + "="*70)
print("TUNING COMPLETATO!")
print("="*70)




[1] LOADING DATA
  Development set: 79,997 samples, 7 features
Evaluation data shape: (20000, 6)
  Train: 79,997 | Eval: 20,000

[2] PREPROCESSING
Converting timestamp to datetime...
Dropping duplicates based on source, title, article, label keeping the most recent one...
 1,368 samples removed
Dropping duplicates that have the same source, title, article but different label...
 2,971 samples removed
Dropping id column...
  Preprocessed Data: 75,658 samples, 6 features
  Puliti: 75,658

[3] FEATURE ENGINEERING

  Train: 60,526 | Val: 15,132
  TF-IDF: 15000 features
  Numerical: 8 features
  CatBoost: 21 features
  Cyclic timestamp: 7 features
  TOTAL: 15036 features


SystemExit: 

In [4]:
import pickle
print("\n[4] LOADING TUNED MODELS AND RESULTS")
for model in ['lgbm','svc']:
    if model == 'lgbm':
        lgbm_results = pd.read_csv(f"model/lgbm_tuning_results.csv")
    else:
        svc_results = pd.read_csv(f"model/svc_tuning_results.csv")


[4] LOADING TUNED MODELS AND RESULTS


In [5]:
lgbm_results.iloc[0]['params']

"{'verbose': -1, 'subsample': 0.8, 'random_state': 42, 'num_leaves': 70, 'n_estimators': 300, 'min_child_samples': 20, 'max_depth': -1, 'learning_rate': 0.1, 'colsample_bytree': 0.7, 'class_weight': 'balanced'}"

In [6]:
lightgbm_params = lgbm_results.iloc[0]['params']

import lightgbm as lgb

model_lightgb = lgb.LGBMClassifier(**eval(lightgbm_params))

In [7]:
model_lightgb

0,1,2
,boosting_type,'gbdt'
,num_leaves,70
,max_depth,-1
,learning_rate,0.1
,n_estimators,300
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [8]:
model_lightgb.fit(X_train_proc, y_train_proc)

0,1,2
,boosting_type,'gbdt'
,num_leaves,70
,max_depth,-1
,learning_rate,0.1
,n_estimators,300
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [9]:
X_train_with_cols = pd.DataFrame(
    X_train_proc.toarray(),
    columns=feature_names)

In [10]:
X_val_with_cols = pd.DataFrame(
    X_val_proc.toarray(),
    columns=feature_names)

In [20]:
def predict_hybrid(df, ml_model):
    # 1. Ottieni tutte le predizioni dal modello ML
    ml_predictions = ml_model.predict(df)  # df[:-1] per eliminare l'ultima colonna

    # 2. Crea l'array di predizioni RSS
    df = pd.concat([df, X_val['article'].reset_index(drop=True)], axis=1)
    rss_labels = df['article'].apply(extract_rss_label)

    # 3. Sostituisci le predizioni con RSS dove rss_labels non è None
    hybrid_predictions = np.where(rss_labels.notna(), rss_labels, ml_predictions)
    
    return hybrid_predictions.tolist(), ml_predictions.tolist(), rss_labels.tolist()

In [21]:
predictions, ml_predictions, rss_labels = predict_hybrid(X_val_with_cols, model_lightgb)

In [22]:
predictions == ml_predictions

True

In [23]:
rss_labels

[nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 4.0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 0.0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 2.0,
 nan,
 nan,
 nan,
 0.0,
 4.0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 0.0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 5.0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 0.0,
 nan,
 nan,
 nan,
 4.0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 0.0,
 nan,
 nan,
 nan,
 3.0,
 nan,
 nan,
 nan,
 0.0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan

In [19]:
macro_f1 = f1_score(y_val, predictions, average='macro')
weighted_f1 = f1_score(y_val, predictions, average='weighted')
accuracy = (y_val == predictions).mean()

print(f"\n{'GLOBAL METRICS':^40}")
print("-"*40)
print(f"{'Accuracy:':<25} {accuracy:.4f}")
print(f"{'Macro F1:':<25} {macro_f1:.4f}")
print(f"{'Weighted F1:':<25} {weighted_f1:.4f}")


             GLOBAL METRICS             
----------------------------------------
Accuracy:                 0.7345
Macro F1:                 0.7278
Weighted F1:              0.7343


# Retrain

In [25]:
print("\n[1] LOADING DATA")
df_train, df_eval = load_file("data/development.csv", "data/evaluation.csv",sep=",", dtype = {"id": str, "page_rank": int, "label": int})
print(f"  Train: {len(df_train):,} | Eval: {len(df_eval):,}")

# Preprocess
print("\n[2] PREPROCESSING")
df_train = preprocessing_data(df_train)
print(f"  Puliti: {len(df_train):,}")

# Features
print("\n[3] FEATURE ENGINEERING")
df_train = generate_features(df_train, config=config)
df_eval = generate_features(df_eval, config=config)

X = df_train.drop(columns=['label'])
y = df_train['label']

pipe_new = FullPipeline(config=config)

X_tot, y_tot = pipe_new.fit_transform(X, y)

# Submission
print("\n[GENERATING SUBMISSION]")
X_eval_proc = pipe_new.transform(df_eval)
model_lightgb.fit(X_tot, y_tot)

X_eval_proc_with_columns = pd.DataFrame(
    X_eval_proc.toarray(),
    columns=feature_names)

predictions,_,_ = predict_hybrid(X_eval_proc_with_columns, model_lightgb)


submission = pd.DataFrame({
    'Id': df_eval['Id'],
    'Predicted': predictions.astype(int)
})
submission.to_csv(OUTPUT_DIR / 'submission_tuned.csv', index=False)
print(f"  Submission: {OUTPUT_DIR / 'submission_tuned.csv'}")

print("\n" + "="*70)
print("TUNING COMPLETATO!")
print("="*70)






[1] LOADING DATA
  Development set: 79,997 samples, 7 features
Evaluation data shape: (20000, 6)
  Train: 79,997 | Eval: 20,000

[2] PREPROCESSING
Converting timestamp to datetime...
Dropping duplicates based on source, title, article, label keeping the most recent one...
 1,368 samples removed
Dropping duplicates that have the same source, title, article but different label...
 2,971 samples removed
Dropping id column...
  Preprocessed Data: 75,658 samples, 6 features
  Puliti: 75,658

[3] FEATURE ENGINEERING
  TF-IDF: 15000 features
  Numerical: 8 features
  CatBoost: 21 features
  Cyclic timestamp: 7 features
  TOTAL: 15036 features

[GENERATING SUBMISSION]


AttributeError: 'tuple' object has no attribute 'astype'

In [26]:
predictions,_,_ = predict_hybrid(X_eval_proc_with_columns, model_lightgb)


submission = pd.DataFrame({
    'Id': df_eval['Id'],
    'Predicted': predictions.astype(int)
})
submission.to_csv(OUTPUT_DIR / 'submission_tuned.csv', index=False)
print(f"  Submission: {OUTPUT_DIR / 'submission_tuned.csv'}")

print("\n" + "="*70)
print("TUNING COMPLETATO!")
print("="*70)

AttributeError: 'list' object has no attribute 'astype'

In [30]:
submission = pd.DataFrame({
    'Id': df_eval['Id'],
    'Predicted': map(int, predictions)
})
submission.to_csv(OUTPUT_DIR / 'submission_tuned.csv', index=False)
print(f"  Submission: {OUTPUT_DIR / 'submission_tuned.csv'}")

print("\n" + "="*70)
print("TUNING COMPLETATO!")
print("="*70)

  Submission: model/submission_tuned.csv

TUNING COMPLETATO!


In [33]:
results = {}

predictions,_,_ = predict_hybrid(X_val_with_cols, model_lightgb)

results['LightGBM'] = detailed_report(y_val.values, predictions, "LightGBM")

# VISUALIZATIONS
print("\n" + "="*70)
print("GENERATING VISUALIZATIONS")
print("="*70)


plot_confusion_matrix(y_val.values, predictions, "LightGBM",
                        save_path=str(OUTPUT_DIR / "cm_lgbm.png"))
plot_feature_importance_lgbm(model_lightgb, feature_names, top_n=30,
                                save_path=str(OUTPUT_DIR / "feat_imp_lgbm.png"))
#plot_cv_tuning(svc_results, 'C', 'LinearSVC', save_path=str(OUTPUT_DIR / "tuning_C.png"))


DETAILED ANALYSIS - LightGBM

             GLOBAL METRICS             
----------------------------------------
Accuracy:                 0.5104
Macro F1:                 0.4568
Weighted F1:              0.5181

                          METRICS PER CLASS                           
----------------------------------------------------------------------
Classe           Precision     Recall   F1-Score    Support        %
----------------------------------------------------------------------
International News     0.6546     0.5594     0.6033       4507    29.8%
Business            0.5327     0.4259     0.4734       2045    13.5%
Technology          0.7741     0.6684     0.7174       2159    14.3%
Entertainment       0.2640     0.5190     0.3500       1819    12.0%
Sports              0.5789     0.4309     0.4940       1678    11.1%
General News        0.4225     0.5043     0.4598       2352    15.5%
Health              0.2400     0.0629     0.0997        572     3.8%
-------------------

In [31]:
submission

Unnamed: 0,Id,Predicted
0,0,5
1,1,2
2,2,0
3,3,0
4,4,5
...,...,...
19995,19995,2
19996,19996,4
19997,19997,3
19998,19998,0


In [34]:
X_val_with_cols = pd.concat([X_val_with_cols, X_val['article'].reset_index(drop=True)], axis=1)