# Import

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install torch_geometric

Collecting torch_geometric
  Using cached torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
Using cached torch_geometric-2.7.0-py3-none-any.whl (1.3 MB)
Installing collected packages: torch_geometric
Successfully installed torch_geometric-2.7.0


# COMPARISION

In [57]:
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=15,
    min_samples_leaf=4,
    min_samples_split=10,
    max_features='sqrt',
    bootstrap=True,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

In [58]:
# ===============================
# BASELINE: 20 FEATURES
# ===============================
base_features = [f'X{i}' for i in range(1, 20)] + ['SEN']

X_train = df_train[base_features].values
y_train = y_train_raw
X_test  = df_test[base_features].values
y_test  = y_test

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)

sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

rf.fit(X_train_sm, y_train_sm)

y_pred = rf.predict(X_test)

print("\n===== BASELINE (20 FEATURES) =====")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=4))


===== BASELINE (20 FEATURES) =====
[[707  43]
 [ 80 214]]
              precision    recall  f1-score   support

           0     0.8983    0.9427    0.9200       750
           1     0.8327    0.7279    0.7768       294

    accuracy                         0.8822      1044
   macro avg     0.8655    0.8353    0.8484      1044
weighted avg     0.8799    0.8822    0.8796      1044



In [59]:
# ===============================
# FEATURE ENGINEERING: 44 FEATURES
# ===============================
fe_features = (
    raw_features +
    [f'{c}_rel' for c in raw_features] +
    ['Altman_Z', 'Sector_Risk', 'SEN_delta', 'SEN_Altman']
)

X_train = df_train[fe_features].values
X_test  = df_test[fe_features].values

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)

sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train_raw)

rf.fit(X_train_sm, y_train_sm)

y_pred = rf.predict(X_test)

print("\n===== FEATURE ENGINEERING (44 FEATURES) =====")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=4))


===== FEATURE ENGINEERING (44 FEATURES) =====
[[715  35]
 [ 83 211]]
              precision    recall  f1-score   support

           0     0.8960    0.9533    0.9238       750
           1     0.8577    0.7177    0.7815       294

    accuracy                         0.8870      1044
   macro avg     0.8769    0.8355    0.8526      1044
weighted avg     0.8852    0.8870    0.8837      1044



# RF + Features Eng

In [None]:
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import TimeSeriesSplit

# ==========================================
# 1. LOAD DATA & MERGE SECTOR
# ==========================================
path_data = "/content/drive/MyDrive/KLTN/FDP_VN_2010_2022_Train_Set.csv"
path_sector = "/content/drive/MyDrive/KLTN/unique_company_with_sector (1).csv"

df = pd.read_csv(path_data)
df_sector = pd.read_csv(path_sector)

# Merge Sector
df = df.merge(df_sector[['ticker', 'sector']], left_on='Code', right_on='ticker', how='left')
df['sector'] = df['sector'].fillna('Unknown')

# ==========================================
# 2. FEATURE ENGINEERING
# ==========================================
raw_features = [f'X{i}' for i in range(1, 20)] + ['SEN']

def calculate_altman(row):
    return 1.2*row['X2'] + 1.4*row['X8'] + 3.3*row['X4'] + 0.6*row['X18'] + 1.0*row['X9']
df['Altman_Z'] = df.apply(calculate_altman, axis=1)

for col in raw_features:
    df[col] = df[col].fillna(0)
    sector_medians = df.groupby(['Year', 'sector'])[col].transform('median')
    df[f'{col}_rel'] = df[col] - sector_medians

# C. Trend & SEN Interaction
df = df.sort_values(['Code', 'Year'])
df['SEN_delta'] = df.groupby('Code')['SEN'].diff().fillna(0)
df['SEN_Altman'] = df['SEN'] * df['Altman_Z']

# ------------------------------------------------------------------------------
# D. SECTOR RISK
# ------------------------------------------------------------------------------
df = df.sort_values(['sector', 'Year'])

def calculate_expanding_risk(x):
    return x.shift(1).expanding().mean()

df['Sector_Risk'] = df.groupby('sector')['Next_year_binary_distress_label'].transform(calculate_expanding_risk)

overall_train_mean = df.loc[df['Year'] <= 2021, 'Next_year_binary_distress_label'].mean()
df['Sector_Risk'] = df['Sector_Risk'].fillna(overall_train_mean)

df.fillna(0, inplace=True)
df = df.sort_values(['Code', 'Year'])

# ==========================================
# 3. SPLIT TRAIN (2010-2021) / TEST (2022)
# ==========================================
df_train = df[df['Year'] <= 2021].copy()
df_test = df[df['Year'] == 2022].copy()

x_cols = raw_features
x_rel_cols = [f'{c}_rel' for c in raw_features]
extra_cols = ['Altman_Z', 'Sector_Risk', 'SEN_delta', 'SEN_Altman']
feature_cols = x_cols + x_rel_cols + extra_cols

print(f"    Tổng số Features: {len(feature_cols)}")

scaler = StandardScaler()
X_train_raw = scaler.fit_transform(df_train[feature_cols].values)
y_train_raw = df_train['Next_year_binary_distress_label'].values.astype(int)

X_test = scaler.transform(df_test[feature_cols].values)
y_test = df_test['Next_year_binary_distress_label'].values.astype(int)

# ==========================================
# 4. SMOTE OVERSAMPLING
# ==========================================
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train_raw, y_train_raw)

print(f"    Train shape gốc: {X_train_raw.shape}")
print(f"    Train shape sau SMOTE: {X_train_res.shape}")

# ==========================================
# 5. HYPERPARAMETER TUNING
# ==========================================

param_dist = {
    'n_estimators': [200, 300, 400],
    'max_depth': [10, 15, 20],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [2, 4, 8],
    'max_features': ['sqrt'],
    'bootstrap': [True],
    'class_weight': [None, 'balanced']
}

rf_base = RandomForestClassifier(random_state=42, n_jobs=-1, max_samples=0.9)
tscv = TimeSeriesSplit(n_splits=3)

rf_search = RandomizedSearchCV(
    estimator=rf_base,
    param_distributions=param_dist,
    n_iter=50,
    scoring='f1',
    cv=tscv,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

rf_search.fit(X_train_res, y_train_res)
best_rf = rf_search.best_estimator_
print(f"\n--> Best Parameters: {rf_search.best_params_}")

# ==========================================
# 6. DỰ BÁO & TỐI ƯU THRESHOLD
# ==========================================
y_train_prob = best_rf.predict_proba(X_train_raw)[:, 1]

precisions, recalls, thresholds = precision_recall_curve(
    y_train_raw, y_train_prob
)

f1_scores = 2 * precisions * recalls / (precisions + recalls + 1e-10)
best_threshold = thresholds[np.argmax(f1_scores)]

print(f"\nBest Threshold: {best_threshold:.4f}")

print(f"\n===== KẾT QUẢ TEST 2022 (Best Threshold: {best_threshold:.4f}) =====")
y_pred_opt = (y_prob >= best_threshold).astype(int)

print(confusion_matrix(y_test, y_pred_opt))
print(classification_report(y_test, y_pred_opt, digits=4))

    Tổng số Features: 44
    Train shape gốc: (11634, 44)
    Train shape sau SMOTE: (17554, 44)
Fitting 3 folds for each of 50 candidates, totalling 150 fits

--> Best Parameters: {'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 20, 'class_weight': None, 'bootstrap': True}

===== KẾT QUẢ TEST 2022 (Best Threshold: 0.5218) =====
[[727  23]
 [ 83 211]]
              precision    recall  f1-score   support

           0     0.8975    0.9693    0.9321       750
           1     0.9017    0.7177    0.7992       294

    accuracy                         0.8985      1044
   macro avg     0.8996    0.8435    0.8656      1044
weighted avg     0.8987    0.8985    0.8947      1044



In [48]:
y_train_prob = best_rf.predict_proba(X_train_raw)[:, 1]
y_train_pred = (y_train_prob >= best_threshold).astype(int)

print("\n===== TRAIN PERFORMANCE =====")
print(confusion_matrix(y_train_raw, y_train_pred))
print(classification_report(y_train_raw, y_train_pred, digits=4))


===== TRAIN PERFORMANCE =====
[[8644  133]
 [ 163 2694]]
              precision    recall  f1-score   support

           0     0.9815    0.9848    0.9832      8777
           1     0.9530    0.9429    0.9479      2857

    accuracy                         0.9746     11634
   macro avg     0.9672    0.9639    0.9655     11634
weighted avg     0.9745    0.9746    0.9745     11634



In [None]:
importances = best_rf.feature_importances_
indices = np.argsort(importances)[::-1]
print("\n=== TOP 20 BIẾN QUAN TRỌNG NHẤT ===")
for i in range(20):
    print(f"{i+1}. {feature_cols[indices[i]]}: {importances[indices[i]]:.4f}")


=== TOP 20 BIẾN QUAN TRỌNG NHẤT ===
1. X8: 0.1471
2. X8_rel: 0.0904
3. X4: 0.0639
4. X7: 0.0593
5. Altman_Z: 0.0490
6. X4_rel: 0.0405
7. SEN_Altman: 0.0348
8. Sector_Risk: 0.0331
9. X7_rel: 0.0304
10. X6: 0.0234
11. X3: 0.0192
12. X6_rel: 0.0185
13. X15: 0.0162
14. X16: 0.0153
15. SEN: 0.0150
16. X2: 0.0149
17. X9: 0.0148
18. X17: 0.0145
19. X15_rel: 0.0143
20. X16_rel: 0.0134


In [None]:
save_folder = "/content/drive/MyDrive/KLTN/"
model_package = {
    'model': best_rf,
    'scaler': scaler,
    'threshold': best_threshold,
    'risk_map': risk_map,
    'overall_mean': overall_mean,
    'features': feature_cols
}
save_path = os.path.join(save_folder, 'fdp_rf_model_final.pkl')
joblib.dump(model_package, save_path)
print(f"--> Lưu model tại: {save_path}")

--> Lưu model tại: /content/drive/MyDrive/KLTN/fdp_rf_model_final.pkl
