In [1]:
import pandas as pd

# 1) Load your cleaned table
df = pd.read_csv("cleaned_dataset_final.csv")

# 2) Separate targets immediately
y_binary  = df['risk_class_binary']            # primary: High(1) vs Low(0)
y_multi   = df['risk_class_multiclass']        # secondary: {1,2,3,Unclassified}

# 3) Remove targets from the feature frame
X0 = df.drop(columns=['risk_class_binary', 'risk_class_multiclass'])

# 4) Quick sanity checks
print("X0 shape:", X0.shape)
print("y_binary distribution:\n", y_binary.value_counts(normalize=True).rename("proportion"))
print("\ny_multi distribution:\n", y_multi.value_counts(normalize=True).rename("proportion"))


X0 shape: (68943, 46)
y_binary distribution:
 risk_class_binary
0    0.531918
1    0.468082
Name: proportion, dtype: float64

y_multi distribution:
 risk_class_multiclass
Unclassified    0.496932
2               0.381765
1               0.086318
3               0.034985
Name: proportion, dtype: float64


In [2]:
id_like_cols = [
    'id_event', 'id_device', 'manufacturer_id',
    'uid', 'uid_hash', 'slug_event', 'slug_device',
    'url', 'number_event', 'number_device'
]

X1 = X0.drop(columns=[col for col in id_like_cols if col in X0.columns])

print("✅ ID-like columns dropped")
print("New shape:", X1.shape)


✅ ID-like columns dropped
New shape: (68943, 36)


In [3]:
import numpy as np

X2 = X1.copy()

# Convert date columns safely
date_cols = ['date_initiated_by_firm', 'date_posted', 'date_terminated']
for col in date_cols:
    if col in X2.columns:
        X2[col] = pd.to_datetime(X2[col], errors='coerce')

# Year features
if 'date_initiated_by_firm' in X2.columns:
    X2['year_initiated'] = X2['date_initiated_by_firm'].dt.year

if 'date_posted' in X2.columns:
    X2['year_posted'] = X2['date_posted'].dt.year

# Recall duration
if 'date_initiated_by_firm' in X2.columns and 'date_terminated' in X2.columns:
    X2['recall_duration'] = (X2['date_terminated'] - X2['date_initiated_by_firm']).dt.days
    X2['recall_duration'] = X2['recall_duration'].fillna(-1)

# Termination flag
X2['is_terminated'] = np.where(X2['date_terminated'].notna(), 1, 0)

# Drop raw date columns
X2 = X2.drop(columns=[col for col in date_cols if col in X2.columns])

print("✅ Temporal features engineered")
print("New shape:", X2.shape)
print(X2[['year_initiated','year_posted','recall_duration','is_terminated']].head())


✅ Temporal features engineered
New shape: (68943, 37)
   year_initiated  year_posted  recall_duration  is_terminated
0          2012.0          NaN             -1.0              0
1          2013.0          NaN             -1.0              0
2          2013.0          NaN             -1.0              0
3          2012.0          NaN             -1.0              0
4          2015.0          NaN             -1.0              0


  X2[col] = pd.to_datetime(X2[col], errors='coerce')
  X2[col] = pd.to_datetime(X2[col], errors='coerce')


In [4]:
text_cols = ['description','comment','action_summary']
X3 = X2.drop(columns=[col for col in text_cols if col in X2.columns])

print("✅ Text-heavy columns dropped")
print("New shape:", X3.shape)


✅ Text-heavy columns dropped
New shape: (68943, 36)


In [5]:
high_card_cols = [
    'reason','action','quantity_in_commerce','authorities_link',
    'distributed_to','slug','address','data_notes'
]

X4 = X3.drop(columns=[col for col in high_card_cols if col in X3.columns])

print("✅ High-cardinality categoricals dropped")
print("New shape:", X4.shape)


✅ High-cardinality categoricals dropped
New shape: (68943, 28)


In [6]:
from sklearn.preprocessing import OneHotEncoder

X5 = X4.copy()

# Frequency encode 'status'
if 'status' in X5.columns:
    X5['status_freq'] = X5['status'].map(X5['status'].value_counts())
    X5 = X5.drop(columns=['status'])

# One-hot encode low-cardinality categoricals
one_hot_cols = [
    'implanted','action_classification','type',
    'source','source_manufacturer','country_event','country_device',
    'determined_cause','classification'
]

X5 = pd.get_dummies(X5, columns=[col for col in one_hot_cols if col in X5.columns], drop_first=True)

print("✅ Categorical encoding done")
print("New shape:", X5.shape)


✅ Categorical encoding done
New shape: (68943, 123)


In [7]:
from sklearn.preprocessing import StandardScaler

X6 = X5.copy()

# Numeric columns to scale
numeric_cols = ['year_initiated','year_posted','recall_duration','status_freq']
numeric_cols = [col for col in numeric_cols if col in X6.columns]

scaler = StandardScaler()
X6[numeric_cols] = scaler.fit_transform(X6[numeric_cols])

print("✅ Numeric features scaled")
print("Shape:", X6.shape)
print("Scaled columns:", numeric_cols)


✅ Numeric features scaled
Shape: (68943, 123)
Scaled columns: ['year_initiated', 'year_posted', 'recall_duration', 'status_freq']


In [8]:
from sklearn.model_selection import train_test_split

# Binary target split
X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(
    X6, y_binary, test_size=0.2, random_state=42, stratify=y_binary
)

print("✅ Binary target split")
print("Train shape:", X_train_bin.shape, "Test shape:", X_test_bin.shape)
print("Train distribution:\n", y_train_bin.value_counts(normalize=True))
print("Test distribution:\n", y_test_bin.value_counts(normalize=True))

# Multiclass target split
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X6, y_multi, test_size=0.2, random_state=42, stratify=y_multi
)

print("\n✅ Multiclass target split")
print("Train shape:", X_train_multi.shape, "Test shape:", X_test_multi.shape)
print("Train distribution:\n", y_train_multi.value_counts(normalize=True))
print("Test distribution:\n", y_test_multi.value_counts(normalize=True))


✅ Binary target split
Train shape: (55154, 123) Test shape: (13789, 123)
Train distribution:
 risk_class_binary
0    0.531911
1    0.468089
Name: proportion, dtype: float64
Test distribution:
 risk_class_binary
0    0.531946
1    0.468054
Name: proportion, dtype: float64

✅ Multiclass target split
Train shape: (55154, 123) Test shape: (13789, 123)
Train distribution:
 risk_class_multiclass
Unclassified    0.496936
2               0.381767
1               0.086322
3               0.034975
Name: proportion, dtype: float64
Test distribution:
 risk_class_multiclass
Unclassified    0.496918
2               0.381754
1               0.086301
3               0.035028
Name: proportion, dtype: float64


In [11]:
# 1) Find any non-numeric columns that slipped through
obj_cols = X_train_bin.select_dtypes(include='object').columns.tolist()
print("Object columns still present:", obj_cols)

# 2) Drop them from BOTH train and test to keep columns aligned
if obj_cols:
    X_train_bin = X_train_bin.drop(columns=obj_cols)
    X_test_bin  = X_test_bin.drop(columns=obj_cols)

print("Shapes after dropping object cols:", X_train_bin.shape, X_test_bin.shape)


Object columns still present: ['icij_notes', 'created_at_event', 'updated_at_event', 'code', 'name', 'risk_class', 'created_at_device', 'updated_at_device', 'name_manufacturer', 'parent_company', 'created_at', 'updated_at']
Shapes after dropping object cols: (55154, 111) (13789, 111)


In [12]:
# --- Logistic Regression ---
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, confusion_matrix
import numpy as np
import pandas as pd

def eval_binary(y_true, y_pred, y_proba):
    acc  = accuracy_score(y_true, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary', zero_division=0)
    auc  = roc_auc_score(y_true, y_proba)
    cm   = confusion_matrix(y_true, y_pred)
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1-score:  {f1:.4f}")
    print(f"ROC-AUC:   {auc:.4f}")
    print("\nConfusion Matrix:\n", cm)

logreg = LogisticRegression(max_iter=500, class_weight='balanced', n_jobs=-1, random_state=42)
logreg.fit(X_train_bin, y_train_bin)

log_pred  = logreg.predict(X_test_bin)
log_proba = logreg.predict_proba(X_test_bin)[:, 1]

print("=== Logistic Regression (binary) ===")
eval_binary(y_test_bin, log_pred, log_proba)

# Top weighted features
feature_names = np.array(X_train_bin.columns)
coefs = logreg.coef_[0]
top_pos_idx = np.argsort(coefs)[-15:][::-1]
top_neg_idx = np.argsort(coefs)[:15]
top_features = pd.DataFrame({
    "feature": np.r_[feature_names[top_pos_idx], feature_names[top_neg_idx]],
    "coef":    np.r_[coefs[top_pos_idx],        coefs[top_neg_idx]]
})
print("\nTop weighted features (LogReg):\n", top_features)


ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values