In [1]:
from utils.processing import *
import numpy as np
import yaml
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# First Trials

In [2]:
with open("config/config.yaml", 'r') as ymlfile:
    config = yaml.safe_load(ymlfile)
    
LABEL_NAMES = config['LABEL_NAMES']
RANDOM_STATE = config['RANDOM_STATE']

In [3]:
# Load

OUTPUT_DIR = Path("model/")
print("\n[1] LOADING DATA")
df_train, df_eval = load_file("data/development.csv", "data/evaluation.csv",sep=",", dtype = {"id": str, "page_rank": int, "label": int})
print(f"  Train: {len(df_train):,} | Eval: {len(df_eval):,}")

# Preprocess
print("\n[2] PREPROCESSING")
df_train = preprocessing_data(df_train)
print(f"  Puliti: {len(df_train):,}")

# Features
print("\n[3] FEATURE ENGINEERING")
df_train = generate_features(df_train, config=config)
df_eval = generate_features(df_eval, config=config)

# Split
X = df_train.drop(columns=['label'])
y = df_train['label']
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)
print(f"\n  Train: {len(X_train):,} | Val: {len(X_val):,}")

# Pipeline
pipe = FullPipeline(config=config)
X_train_proc, y_train_proc = pipe.fit_transform(X_train, y_train)
X_val_proc = pipe.transform(X_val)
feature_names = pipe.feature_names_

# TUNING
try:
    import pickle
    print("\n[4] LOADING TUNED MODELS AND RESULTS")
    for model in ['lgbm','svc']:
        with open(f"model/best_model_{model}.pkl", "rb") as f:
            if model == 'lgbm':
                best_lgbm = pickle.load(f)
            else:
                best_svc = pickle.load(f)
        with open(f"model/best_params_{model}.pkl", "rb") as f:
            if model == 'lgbm':
                lgbm_params = pickle.load(f)
            else:
                svc_params = pickle.load(f)
        if model == 'lgbm':
            lgbm_results = pd.read_csv(f"model/lgbm_tuning_results.csv")
        else:
            svc_results = pd.read_csv(f"model/svc_tuning_results.csv")
except:
    best_svc, svc_params, svc_results = tune_linear_svc(X_train_proc, y_train_proc, cv=5)
    best_lgbm, lgbm_params, lgbm_results = tune_lightgbm(X_train_proc, y_train_proc, cv=5)

# EVALUATION
print("\n" + "="*70)
print("FINAL EVALUATION ON VALIDATION SET")
print("="*70)

results = {}

y_pred_svc = best_svc.predict(X_val_proc)
results['LinearSVC'] = detailed_report(y_val.values, y_pred_svc, "LinearSVC")

y_pred_lgbm = best_lgbm.predict(X_val_proc)
results['LightGBM'] = detailed_report(y_val.values, y_pred_lgbm, "LightGBM")

# VISUALIZATIONS
print("\n" + "="*70)
print("GENERATING VISUALIZATIONS")
print("="*70)

plot_confusion_matrix(y_val.values, y_pred_svc, "LinearSVC", 
                        save_path=str(OUTPUT_DIR / "cm_svc.png"))
plot_confusion_matrix(y_val.values, y_pred_lgbm, "LightGBM",
                        save_path=str(OUTPUT_DIR / "cm_lgbm.png"))
plot_f1_comparison(results, save_path=str(OUTPUT_DIR / "f1_comparison.png"))
plot_feature_importance_svc(best_svc, feature_names, top_n=15,
                                save_path=str(OUTPUT_DIR / "feat_imp_svc.png"))
plot_feature_importance_lgbm(best_lgbm, feature_names, top_n=30,
                                save_path=str(OUTPUT_DIR / "feat_imp_lgbm.png"))
#plot_cv_tuning(svc_results, 'C', 'LinearSVC', save_path=str(OUTPUT_DIR / "tuning_C.png"))

# SUMMARY
print("\n" + "="*70)
print("FINAL SUMMARY")
print("="*70)

print(f"\n{'Model':<15} {'Macro F1':>12} {'Weighted F1':>12} {'Accuracy':>12}")
print("-"*55)
for name, m in results.items():
    print(f"{name:<15} {m['macro_f1']:>12.4f} {m['weighted_f1']:>12.4f} {m['accuracy']:>12.4f}")

best_name = max(results, key=lambda x: results[x]['macro_f1'])
print(f"\n✅ BEST MODEL: {best_name} (Macro F1: {results[best_name]['macro_f1']:.4f})")

print("\n--- BEST HYPERPARAMETERS ---")
print(f"LinearSVC: {svc_params}")
print(f"LightGBM: {lgbm_params}")


print("\n[1] LOADING DATA")
df_train, df_eval = load_file("data/development.csv", "data/evaluation.csv",sep=",", dtype = {"id": str, "page_rank": int, "label": int})
print(f"  Train: {len(df_train):,} | Eval: {len(df_eval):,}")

# Preprocess
print("\n[2] PREPROCESSING")
df_train = preprocessing_data(df_train)
print(f"  Puliti: {len(df_train):,}")

# Features
print("\n[3] FEATURE ENGINEERING")
df_train = generate_features(df_train, config=config)
df_eval = generate_features(df_eval, config=config)

X = df_train.drop(columns=['label'])
y = df_train['label']

pipe_new = FullPipeline(config=config)

X_tot, y_tot = pipe_new.fit_transform(X, y)

# Submission
print("\n[GENERATING SUBMISSION]")
X_eval_proc = pipe_new.transform(df_eval)
print(f"  Eval Processed: {X_eval_proc.shape}")
print(f"  Using Best Model: {best_name}")
if best_name == 'LinearSVC':
    from sklearn.svm import LinearSVC
    svc = LinearSVC(**svc_params, random_state=RANDOM_STATE)
    svc.fit(X_tot, y_tot)   
    y_eval_pred = svc.predict(X_eval_proc)
else:
    from lightgbm import LGBMClassifier
    lgbm = LGBMClassifier(**lgbm_params)
    lgbm.fit(X_tot, y_tot)
    y_eval_pred = lgbm.predict(X_eval_proc)
    
    

submission = pd.DataFrame({
    'Id': df_eval['Id'],
    'Predicted': y_eval_pred.astype(int)
})
submission.to_csv(OUTPUT_DIR / 'submission_tuned.csv', index=False)
print(f"  Submission: {OUTPUT_DIR / 'submission_tuned.csv'}")

print("\n" + "="*70)
print("TUNING COMPLETATO!")
print("="*70)




[1] LOADING DATA
  Development set: 79,997 samples, 7 features
Evaluation data shape: (20000, 6)
  Train: 79,997 | Eval: 20,000

[2] PREPROCESSING
Converting timestamp to datetime...
Dropping duplicates based on source, title, article, label keeping the most recent one...
 1,368 samples removed
Dropping duplicates that have the same source, title, article but different label...
 2,971 samples removed
Dropping id column...
  Preprocessed Data: 75,658 samples, 6 features
  Puliti: 75,658

[3] FEATURE ENGINEERING

  Train: 60,526 | Val: 15,132
  TF-IDF: 15000 features
  Numerical: 6 features
  CatBoost: 21 features
  Cyclic timestamp: 7 features
  TOTAL: 15034 features

[4] LOADING TUNED MODELS AND RESULTS

FINAL EVALUATION ON VALIDATION SET

DETAILED ANALYSIS - LinearSVC

             GLOBAL METRICS             
----------------------------------------
Accuracy:                 0.7205
Macro F1:                 0.7080
Weighted F1:              0.7150

                          METRICS PE

In [4]:
X

Unnamed: 0,source,title,article,page_rank,timestamp,title_suffix,first_link_domain,n_links,n_images,n_ads,n_feeds,article_length,combined_text
0,Yahoo,Bancrofts to discuss Dow Jones bid with News C...,"<p><a href=""http://us.rd.yahoo.com/dailynews/r...",5,2007-06-01 02:29:04,,us.rd.yahoo.com,2,1,0,0,1.0,bancrofts to discuss dow jones bid with news c...
1,BBC,Pollution turns Yellow River red,A stretch of China's Yellow River runs red for...,5,2006-11-22 14:23:53,,,0,0,0,0,1.0,pollution turns yellow river red pollution tur...
2,RedNova,"Space Station Not Used Sufficiently, Glenn Says","COLUMBUS, Ohio (AP) -- The country is not gett...",3,2007-02-22 01:54:25,,,0,0,0,0,1.0,space station not used sufficiently glenn spa...
3,Arizona,"Manly men: Crowe, bodyguard hug, make up",Russell Crowe is taking the blame for a fight ...,5,2004-08-31 19:00:43,,,0,0,0,0,1.0,manly men crowe bodyguard hug make up manly me...
4,Yahoo,Microsoft Looks to Expand Windows at Home (AP),AP - Efforts by Microsoft Corp. and the PC ind...,5,NaT,AP,,0,0,0,0,1.0,microsoft looks to expand windows at home ap m...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
75653,Yahoo,BMW's 2nd-quarter net profit up \\n (AP)\\n,"<p><a href=""http://us.rd.yahoo.com/dailynews/r...",5,2006-08-02 16:32:20,,us.rd.yahoo.com,2,1,0,0,1.0,bmw s 2nd quarter net profit up ap bmw s 2nd q...
75654,Yahoo,Missing Ga. hiker believed dead \\n (AP)\\n,"<p><a href=""http://us.rd.yahoo.com/dailynews/r...",5,2008-01-06 03:12:45,,us.rd.yahoo.com,2,1,0,0,1.0,missing ga hiker believed dead ap missing ga h...
75655,Yahoo,George Michael Hits Back in War of Words with ...,Reuters - Singer George Michael has issued a\\...,5,NaT,Reuters,,0,0,0,0,1.0,george michael hits back in war of words with ...
75656,BBC,MI6 'Diana-style' plot dismissed,An ex-MI6 man who told Mohammed Al Fayed of pl...,5,2008-02-13 19:54:06,,,0,0,0,0,1.0,mi6 diana style plot dismissed mi6 diana style...


In [5]:
submission.iloc[28]

Id           28
Predicted     4
Name: 28, dtype: int64

In [6]:
import pickle

for model in ['lgbm','svc']:
    with open(f'model/best_model_{model}.pkl', 'wb') as f:
        if model=='lgbm':
            pickle.dump(best_lgbm, f)
        else:
            pickle.dump(best_svc, f)
    with open(f'model/best_params_{model}.pkl', 'wb') as f:
        if model=='lgbm':
            pickle.dump(lgbm_params, f)
        else:
            pickle.dump(svc_params, f)
    
    if model == 'lgbm':
        lgbm_results.to_csv(f'model/lgbm_tuning_results.csv', index=False)
    else:
        svc_results.to_csv(f'model/svc_tuning_results.csv', index=False)
    
    



In [7]:
lgbm_params

{'verbose': -1,
 'subsample': 0.8,
 'random_state': 42,
 'num_leaves': 70,
 'n_estimators': 300,
 'min_child_samples': 20,
 'max_depth': -1,
 'learning_rate': 0.1,
 'colsample_bytree': 0.7,
 'class_weight': 'balanced'}

In [8]:
svc_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_loss,param_max_iter,params,split0_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,41.808015,1.648096,0.017114,0.005129,0.1,balanced,squared_hinge,2000,"{'C': 0.1, 'class_weight': 'balanced', 'loss':...",0.705411,...,0.701534,0.002141,1,0.782346,0.781826,0.783877,0.783951,0.78158,0.782716,0.001009
1,39.589021,0.673211,0.026485,0.018371,0.05,balanced,squared_hinge,2000,"{'C': 0.05, 'class_weight': 'balanced', 'loss'...",0.699209,...,0.697552,0.001265,2,0.755464,0.754766,0.756628,0.756354,0.754816,0.755606,0.000768
2,48.722066,2.048436,0.016326,0.005676,0.5,balanced,squared_hinge,2000,"{'C': 0.5, 'class_weight': 'balanced', 'loss':...",0.698107,...,0.692018,0.003184,3,0.854954,0.856083,0.85731,0.857909,0.857273,0.856706,0.001057
3,52.177038,4.152274,0.018768,0.007443,1.0,balanced,squared_hinge,2000,"{'C': 1.0, 'class_weight': 'balanced', 'loss':...",0.685705,...,0.679576,0.00347,4,0.8876,0.889561,0.889471,0.889816,0.890082,0.889306,0.000879
4,25.428463,0.770374,0.021528,0.004896,0.01,balanced,squared_hinge,2000,"{'C': 0.01, 'class_weight': 'balanced', 'loss'...",0.667689,...,0.665714,0.002689,5,0.691349,0.689833,0.68871,0.691444,0.689408,0.690149,0.00108
5,50.898593,4.30339,0.011583,0.000667,2.0,balanced,squared_hinge,2000,"{'C': 2.0, 'class_weight': 'balanced', 'loss':...",0.668627,...,0.665022,0.002547,6,0.91824,0.920269,0.920916,0.920634,0.92045,0.920102,0.000955
6,0.258786,0.006921,0.0,0.0,0.01,balanced,hinge,2000,"{'C': 0.01, 'class_weight': 'balanced', 'loss'...",,...,,,7,,,,,,,
7,0.044191,0.00587,0.0,0.0,0.05,balanced,hinge,2000,"{'C': 0.05, 'class_weight': 'balanced', 'loss'...",,...,,,7,,,,,,,
8,0.033654,0.006759,0.0,0.0,0.1,balanced,hinge,2000,"{'C': 0.1, 'class_weight': 'balanced', 'loss':...",,...,,,7,,,,,,,
9,0.027746,0.007649,0.0,0.0,0.5,balanced,hinge,2000,"{'C': 0.5, 'class_weight': 'balanced', 'loss':...",,...,,,7,,,,,,,


In [9]:
best_svc.save(OUTPUT_DIR / "best_svc_model.pkl")
svc_params
svc_results

AttributeError: 'LinearSVC' object has no attribute 'save'

In [14]:
import pickle

# save
with open('model/best_svc_model.pkl','wb') as f:
    pickle.dump(best_svc,f)

In [15]:
import pickle

# save
with open('model/best_lgbm_model.pkl','wb') as f:
    pickle.dump(best_lgbm,f)

In [16]:
best_lgbm

0,1,2
,boosting_type,'gbdt'
,num_leaves,70
,max_depth,-1
,learning_rate,0.1
,n_estimators,300
,subsample_for_bin,200000
,objective,'multiclass'
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [None]:
import sys
sys.exit()

In [21]:
dev_df, _ = load_file("data/development.csv", sep=",", dtype = {"id": str, "page_rank": int, "label": int})
dev_df_original = dev_df.copy()

  Development set: 79,997 samples, 7 features


# Build pipeline

In [28]:
from sklearn.model_selection import train_test_split


df_clean = preprocessing_data(dev_df_original)

df_clean = generate_features(df_clean, config=config)

X = df_clean.drop('label', axis=1)
y = df_clean['label']



X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y, shuffle=True)

Converting timestamp to datetime...
Dropping duplicates based on source, title, article, label keeping the most recent one...
 1,368 samples removed
Dropping duplicates that have the same source, title, article but different label...
 2,971 samples removed
Dropping id column...
  Preprocessed Data: 75,658 samples, 6 features


In [29]:
pipe = FullPipeline(config=config)

In [30]:
pipe.fit_transform(X_train, y_train)

InvalidParameterError: The 'ngram_range' parameter of TfidfVectorizer must be an instance of 'tuple'. Got '(1, 2)' instead.

In [None]:
import sys
sys.exit()

In [5]:
tfidf_args = {
    'max_features': 10000,
    'ngram_range': (1, 2),
    'min_df': 5,
    'max_df': 0.8,
    'token_pattern': r'(?u)\b[a-zA-Z]{3,}\b',
    'stop_words': 'english',
    'sublinear_tf': True
}

import warnings
warnings.filterwarnings("ignore")

preproc_pipe = pipeline(tfidf_args)

preproc_pipe.fit_transform(X_train, y_train)
feature_names = preproc_pipe.named_steps['processor'].get_feature_names_out()

X_train_processed = preproc_pipe.fit_transform(X_train, y_train)


X_val_processed = preproc_pipe.transform(X_val)

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, classification_report
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils.class_weight import compute_class_weight

from sklearn.naive_bayes import MultinomialNB
from sklearn.utils.class_weight import compute_class_weight

from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings("ignore")

# Calcolo dei pesi per Naive Bayes (per gestire lo sbilanciamento)
weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
priors = weights / weights.sum()

# Modelli aggiornati
models = {
    "Logistic Regression": LogisticRegression(class_weight='balanced', max_iter=1000, C = 0.05, random_state=42),
    "Linear SVM": LinearSVC(class_weight='balanced', max_iter=2000, dual=False, C = 0.1, random_state=42),
   # "Multinomial NB": MultinomialNB(class_prior=priors),
    "Random Forest": RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=100, random_state=42)
}

models["LightGBM"] = LGBMClassifier(
    class_weight='balanced', 
    n_estimators=100, 
    random_state=42, 
    n_jobs=-1,
    importance_type='gain' # 'gain' è meglio di 'split' per capire l'impatto reale
)

results = {}

for name, clf in models.items():
    full_pipe = Pipeline([
        ('preprocessor', preproc_pipe),
        ('classifier', clf)
    ])
    
    # Cross-Validation
    cv_f1 = cross_val_score(full_pipe, X_train, y_train, cv=3, scoring='f1_macro').mean()
    
    # Training finale
    full_pipe.fit(X_train, y_train)
    
    # Test
    y_pred = full_pipe.predict(X_val)
    test_f1 = f1_score(y_val, y_pred, average='macro')
    
    results[name] = {"cv_f1": cv_f1, "test_f1": test_f1, "pipe": full_pipe}
    print(f"{name:.<25} CV: {cv_f1:.4f} | Test: {test_f1:.4f}")
    
    
def get_top_features_summary(results, top_n=10):
    # Recuperiamo i nomi dal preprocessor della Logistic Regression (o qualsiasi altro)
    feature_names = results["Logistic Regression"]["pipe"].named_steps['preprocessor'].get_feature_names_out()
    
    for name, data in results.items():
        print(f"\n{'='*20} {name.upper()} {'='*20}")
        clf = data["pipe"].named_steps['classifier']
        
        # CASO 1: Modelli Lineari e NB (Coefficienti per classe)
        if hasattr(clf, 'coef_') or hasattr(clf, 'feature_log_prob_'):
            weights = clf.feature_log_prob_ if name == "Multinomial NB" else clf.coef_
            
            # Se il modello è binario, coef_ ha una sola riga, se multiclasse ne ha N
            n_classes = weights.shape[0] if len(weights.shape) > 1 else 1
            
            for i in range(n_classes):
                w = weights[i] if n_classes > 1 else weights
                top_indices = np.argsort(w)[-top_n:]
                top_words = [feature_names[idx].split("__")[-1] for idx in top_indices]
                label = LABEL_NAMES[i] if i < len(LABEL_NAMES) else i
                print(f"Classe {label}: {', '.join(top_words[::-1])}")
        
        # CASO 2: Modelli ad Albero (Importanza Globale: RF, XGBoost, LightGBM)
        elif hasattr(clf, 'feature_importances_'):
            importances = clf.feature_importances_
            top_indices = np.argsort(importances)[-top_n:]
            # Puliamo i nomi rimuovendo i prefissi della pipeline per leggibilità
            top_features = [feature_names[idx].split("__")[-1] for idx in top_indices]
            print(f"Top Feature Globali: {', '.join(top_features[::-1])}")

# Eseguiamo il riepilogo
get_top_features_summary(results)

Logistic Regression...... CV: 0.6429 | Test: 0.6647
Linear SVM............... CV: 0.6965 | Test: 0.7054
Random Forest............ CV: 0.6612 | Test: 0.6743
XGBoost.................. CV: 0.6844 | Test: 0.6951
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.975764 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 231520
[LightGBM] [Info] Number of data points in the train set: 40350, number of used features: 7412
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Auto-choosing col-wise multi-threading,

In [7]:
X_train

Unnamed: 0,source,title,article,page_rank,timestamp
69842,CNET,Everex hits the books with $298 open-source de...,Blog: Everex announces Wal-Mart will carry its...,3,2007-07-19 21:50:09
52331,BBC,Pakistan buries Red Mosque dead,The bodies of dozens of people killed after Pa...,5,2007-07-12 14:18:01
11157,Slashdot,Mozilla Thunderbird Reaches 1.0,An anonymous reader writes &quot;Mozilla Thund...,5,NaT
8112,Yahoo,Sudan promises to let sick Darfur rebel travel...,"<p><a href=""http://us.rd.yahoo.com/dailynews/r...",5,2007-09-04 09:48:01
29337,Radio,"Russian Tax Authorities Give Yukos Another, La...",3 September 2004 -- Russia&#39;s tax authoriti...,5,2004-09-08 09:37:18
...,...,...,...,...,...
64737,Yahoo,Alstom wins 130 million Euro contract for subw...,AFP - French industrial giant Alstom Group has...,5,NaT
66409,New,City Room: Baseball Officials Miss Gang Hearing,Neither Major League Baseball nor its official...,5,2007-12-14 00:03:06
26111,Yahoo,Swedes beam poetry into outer space (Reuters),Reuters - Swedish poets have broadcast their w...,5,NaT
13549,Voice,Powell Cancels Athens Trip,The US State Department says Secretary Colin P...,5,NaT


In [8]:
def plot_results(results_df):
    plt.figure(figsize=(10, 6))
    df_melt = results_df.melt(id_vars="Model", var_name="Metric", value_name="Score")
    sns.barplot(data=df_melt, x="Model", y="Score", hue="Metric", palette="viridis")
    plt.title("Comparazione Modelli - Macro F1 Score")
    plt.ylim(0, 1)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()

def show_top_words(trained_pipelines, label_names, top_n=10):
    # Recuperiamo i nomi delle feature dall'ultimo modello addestrato
    any_pipe = list(trained_pipelines.values())[0]
    feat_names = any_pipe.named_steps['proc'].get_feature_names_out()

    for name, pipe in trained_pipelines.items():
        clf = pipe.named_steps['model']
        print(f"\n--- Top Words per Classe ({name}) ---")
        
        if hasattr(clf, 'coef_') or hasattr(clf, 'feature_log_prob_'):
            weights = clf.feature_log_prob_ if name == "Naive Bayes" else clf.coef_
            for i, label in enumerate(label_names):
                top_ids = np.argsort(weights[i])[-top_n:]
                words = [feat_names[idx].split('__')[-1] for idx in top_ids] # Pulizia prefissi
                print(f"{label}: {', '.join(words[::-1])}")

In [9]:
plot_results(results)

AttributeError: 'dict' object has no attribute 'melt'

<Figure size 1000x600 with 0 Axes>