# Import

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# RF + Features Eng

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve
from imblearn.over_sampling import SMOTE

# ==========================================
# 1. LOAD DATA & MERGE SECTOR
# ==========================================
path_data = "/content/drive/MyDrive/KLTN/FDP_VN_2010_2022_Train_Set.csv"
path_sector = "/content/drive/MyDrive/KLTN/unique_company_with_sector (1).csv"

df = pd.read_csv(path_data)
df_sector = pd.read_csv(path_sector)

# Merge Sector
df = df.merge(df_sector[['ticker', 'sector']], left_on='Code', right_on='ticker', how='left')
df['sector'] = df['sector'].fillna('Unknown')

# ==========================================
# 2. FEATURE ENGINEERING
# ==========================================
raw_features = [f'X{i}' for i in range(1, 20)] + ['SEN']

# A. Altman Z-score Proxy
def calculate_altman(row):
    return 1.2*row['X2'] + 1.4*row['X8'] + 3.3*row['X4'] + 0.6*row['X18'] + 1.0*row['X9']
df['Altman_Z'] = df.apply(calculate_altman, axis=1)

# B. Sector Relative Features
for col in raw_features:
    sector_medians = df.groupby(['Year', 'sector'])[col].transform('median')
    df[f'{col}_rel'] = df[col] - sector_medians

# C. Trend & SEN Interaction
df = df.sort_values(['Code', 'Year'])
df['SEN_delta'] = df.groupby('Code')['SEN'].diff().fillna(0)
df['SEN_Altman'] = df['SEN'] * df['Altman_Z']

# D. Sector Risk (Target Encoding)
train_indices = df[df['Year'] <= 2021].index
overall_mean = df.loc[train_indices, 'Next_year_binary_distress_label'].mean()
sector_stats = df.loc[train_indices].groupby('sector')['Next_year_binary_distress_label'].agg(['count', 'mean'])
smoothing = 10
risk_map = (sector_stats['count'] * sector_stats['mean'] + smoothing * overall_mean) / (sector_stats['count'] + smoothing)

df['Sector_Risk'] = df['sector'].map(risk_map)
df['Sector_Risk'] = df['Sector_Risk'].fillna(overall_mean)

df.fillna(0, inplace=True)

# ==========================================
# 3. SPLIT TRAIN (2010-2021) / TEST (2022)
# ==========================================
df_train = df[df['Year'] <= 2021].copy()
df_test = df[df['Year'] == 2022].copy()

# List Feature (44 features)
x_cols = raw_features
x_rel_cols = [f'{c}_rel' for c in raw_features]
extra_cols = ['Altman_Z', 'Sector_Risk', 'SEN_delta', 'SEN_Altman']
feature_cols = x_cols + x_rel_cols + extra_cols

print(f"--> Tổng số Features: {len(feature_cols)}")

# Scale dữ liệu
scaler = StandardScaler()
X_train_raw = scaler.fit_transform(df_train[feature_cols].values)
y_train_raw = df_train['Next_year_binary_distress_label'].values.astype(int)

X_test = scaler.transform(df_test[feature_cols].values)
y_test = df_test['Next_year_binary_distress_label'].values.astype(int)

# ==========================================
# 4. SMOTE OVERSAMPLING (Thay thế Manual)
# ==========================================
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train_raw, y_train_raw)

print(f"    Train shape gốc: {X_train_raw.shape}")
print(f"    Train shape sau SMOTE: {X_train_res.shape}")
print(f"    Label distribution sau SMOTE: {np.bincount(y_train_res)}")

# ==========================================
# 5. HYPERPARAMETER TUNING (RandomizedSearchCV)
# ==========================================

param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [10, 15, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 6],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False],
    'class_weight': [None, 'balanced']
}

rf_base = RandomForestClassifier(random_state=42, n_jobs=-1)

rf_search = RandomizedSearchCV(
    estimator=rf_base,
    param_distributions=param_dist,
    n_iter=50,
    scoring='f1',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

rf_search.fit(X_train_res, y_train_res)

best_rf = rf_search.best_estimator_
print(f"\n--> Best Parameters: {rf_search.best_params_}")

# ==========================================
# 6. DỰ BÁO & TỐI ƯU THRESHOLD
# ==========================================
y_prob = best_rf.predict_proba(X_test)[:, 1]

# Tìm Threshold tối ưu
precisions, recalls, thresholds = precision_recall_curve(y_test, y_prob)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-10)
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]

print(f"\n===== KẾT QUẢ CUỐI CÙNG (Best Threshold: {best_threshold:.4f}) =====")
y_pred_opt = (y_prob >= best_threshold).astype(int)
print(confusion_matrix(y_test, y_pred_opt))
print(classification_report(y_test, y_pred_opt, digits=4))

# ==========================================
# 7. FEATURE IMPORTANCE
# ==========================================
importances = best_rf.feature_importances_
indices = np.argsort(importances)[::-1]

print("\n=== TOP 20 BIẾN QUAN TRỌNG NHẤT ===")
for i in range(20):
    print(f"{i+1}. {feature_cols[indices[i]]}: {importances[indices[i]]:.4f}")

--> Tổng số Features: 44
    Train shape gốc: (11634, 44)
    Train shape sau SMOTE: (17554, 44)
    Label distribution sau SMOTE: [8777 8777]
Fitting 3 folds for each of 50 candidates, totalling 150 fits

--> Best Parameters: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 30, 'class_weight': None, 'bootstrap': False}

===== KẾT QUẢ CUỐI CÙNG (Best Threshold: 0.5024) =====
[[728  22]
 [ 83 211]]
              precision    recall  f1-score   support

           0     0.8977    0.9707    0.9327       750
           1     0.9056    0.7177    0.8008       294

    accuracy                         0.8994      1044
   macro avg     0.9016    0.8442    0.8667      1044
weighted avg     0.8999    0.8994    0.8956      1044


=== TOP 20 BIẾN QUAN TRỌNG NHẤT ===
1. X8: 0.1592
2. X8_rel: 0.0665
3. X7: 0.0662
4. X4: 0.0650
5. Altman_Z: 0.0429
6. X4_rel: 0.0357
7. Sector_Risk: 0.0342
8. SEN_Altman: 0.0335
9. X7_rel: 0.0324
10. X6: 0.0220
11

In [3]:
import joblib
import os

save_folder = "/content/drive/MyDrive/KLTN/"

model_package = {
    'model': best_rf,
    'scaler': scaler,
    'threshold': best_threshold,
    'risk_map': risk_map,
    'overall_mean': overall_mean,
    'features': feature_cols
}
save_path = os.path.join(save_folder, 'fdp_rf_model_full.pkl')

joblib.dump(model_package, save_path)

print(f"Lưu file tại: {save_path}")
print("File bao gồm: Model, Scaler, Threshold, Risk Map và Feature List.")

Lưu file tại: /content/drive/MyDrive/KLTN/fdp_rf_model_full.pkl
File bao gồm: Model, Scaler, Threshold, Risk Map và Feature List.
