In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import pickle

# Load your dataset
df = pd.read_csv('cleaned_data.csv')  

# Define the selected features and target
selected_features = [
    ' ROA(C) before interest and depreciation before interest',
       ' Net Value Per Share (B)', ' Operating Profit Per Share (Yuan ¥)',
       ' Per Share Net profit before tax (Yuan ¥)', ' Net worth/Assets',
       ' Retained Earnings to Total Assets', ' Total expense/Assets',
       ' Current Liability to Current Assets', ' Liability-Assets Flag',
       " Net Income to Stockholder's Equity"
]
X = df[selected_features]
y = df['Bankrupt?']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train the model
model = lgb.LGBMClassifier()
model.fit(X_train, y_train)

# Save the scaler and model
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)


[LightGBM] [Info] Number of positive: 5293, number of negative: 5265
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001049 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2298
[LightGBM] [Info] Number of data points in the train set: 10558, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501326 -> initscore=0.005304
[LightGBM] [Info] Start training from score 0.005304


In [4]:
X

Unnamed: 0,ROA(C) before interest and depreciation before interest,Net Value Per Share (B),Operating Profit Per Share (Yuan ¥),Per Share Net profit before tax (Yuan ¥),Net worth/Assets,Retained Earnings to Total Assets,Total expense/Assets,Current Liability to Current Assets,Liability-Assets Flag,Net Income to Stockholder's Equity
0,0.370594,0.147950,0.095921,0.138736,0.792424,0.903225,0.064856,0.118250,0,0.827890
1,0.464291,0.182251,0.093722,0.169918,0.828824,0.931065,0.025516,0.047775,0,0.839969
2,0.426071,0.177911,0.092338,0.142803,0.792484,0.909903,0.021387,0.025346,0,0.836774
3,0.399844,0.154187,0.077762,0.148603,0.848535,0.906902,0.024161,0.067250,0,0.834697
4,0.465022,0.167502,0.096898,0.168412,0.893491,0.913850,0.026385,0.047725,0,0.839973
...,...,...,...,...,...,...,...,...,...,...
6814,0.493687,0.175045,0.098200,0.172102,0.875382,0.925611,0.019060,0.027951,0,0.840359
6815,0.475162,0.181324,0.098608,0.172780,0.900747,0.932629,0.011118,0.031470,0,0.840306
6816,0.472725,0.269521,0.100073,0.173232,0.961061,0.932000,0.035446,0.007542,0,0.840138
6817,0.506264,0.213392,0.111799,0.185584,0.913021,0.939613,0.016443,0.022916,0,0.841084


In [5]:
y

0       1
1       1
2       1
3       1
4       1
       ..
6814    0
6815    0
6816    0
6817    0
6818    0
Name: Bankrupt?, Length: 6819, dtype: int64