In [2]:
import pandas as pd
from scipy.stats.mstats import winsorize
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

# 데이터셋 B (코스닥 전체 기업) 로드
kosdaq_companies_file_path = '111.csv'
kosdaq_companies_data = pd.read_csv(kosdaq_companies_file_path, encoding='CP949')

# 데이터셋 A (상장폐지 기업) 로드 및 전처리
kosdaq_delisted_file_path = '상장폐지기업사유(피흡수합병제외).xlsx'
kosdaq_delisted_data = pd.read_excel(kosdaq_delisted_file_path)

# 데이터셋 A의 결측치 제거 및 이상치 처리
numeric_columns = kosdaq_delisted_data.select_dtypes(include=['float64', 'int64']).columns
for col in numeric_columns:
    kosdaq_delisted_data[col] = winsorize(kosdaq_delisted_data[col], limits=[0.01, 0.01])  # 이상치 처리

# 데이터셋 B의 재무비율 칼럼 식별 및 회계년도 형식 변환
financial_ratios_columns = kosdaq_companies_data.columns.drop(['회사명', '거래소코드', '회계년도'])
kosdaq_companies_data['회계년도'] = kosdaq_companies_data['회계년도'].str.split('/').str[0].astype(int)

# 각 재무비율에 대해 파생변수 생성
for col in financial_ratios_columns:
    # 시차 차이 계산
    kosdaq_companies_data[f'{col}_시차차이'] = kosdaq_companies_data.groupby('회사명')[col].diff()
    # 시차 비율 계산
    kosdaq_companies_data[f'{col}_시차비율'] = kosdaq_companies_data.groupby('회사명')[col].pct_change()
    # 연도별 상위 10개 기업 평균과의 차이
    top10_avg = kosdaq_companies_data.groupby('회계년도')[col].nlargest(10).mean(level=0)
    kosdaq_companies_data = kosdaq_companies_data.join(top10_avg, on='회계년도', rsuffix='_상위10평균')
    kosdaq_companies_data[f'{col}_상위10차이'] = kosdaq_companies_data[f'{col}_상위10평균'] - kosdaq_companies_data[col]
    # 연도별 전체 평균과의 차이
    overall_avg = kosdaq_companies_data.groupby('회계년도')[col].transform('mean')
    kosdaq_companies_data[f'{col}_전체평균차이'] = overall_avg - kosdaq_companies_data[col]

# 상위 10개 기업 평균 관련 열 제거
top10_avg_columns = [col for col in kosdaq_companies_data.columns if '_상위10평균' in col]
kosdaq_companies_data_cleaned = kosdaq_companies_data.drop(columns=top10_avg_columns)

# 데이터셋 A와 B를 레이블링
kosdaq_delisted_data_cleaned = kosdaq_delisted_data.copy()
kosdaq_delisted_data_cleaned['부도여부'] = 1
kosdaq_companies_data_cleaned['부도여부'] = 0

# 데이터 병합
combined_data = pd.concat([kosdaq_companies_data_cleaned, kosdaq_delisted_data_cleaned], ignore_index=True)

financial_ratios_columns = ['매출액증가율(IFRS)', '총자본증가율(IFRS)', '매출액순이익률(IFRS)', '총자본정상영업이익률(IFRS)',
                            'CASH FLOW 대 부채비율(IFRS)', 'CASH FLOW 대 총자본비율(IFRS)', '총자본회전률(IFRS)',
                            '자기자본회전률(IFRS)', '총자본투자효율(IFRS)', '설비투자효율(IFRS)']

X = kosdaq_companies_data_cleaned[financial_ratios_columns]

# 결측치를 0으로 대체
# X = X.fillna(0)

y = combined_data['부도여부']

# SMOTE 적용
smote = SMOTE(random_state=0)
X_resampled, y_resampled = smote.fit_resample(X, y)

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=0)

# RandomForest 모델 생성 및 학습
rf_classifier = RandomForestClassifier(random_state=0)
rf_classifier.fit(X_train, y_train)

# 모델 성능 평가
y_pred = rf_classifier.predict(X_test)
classification_report_result = classification_report(y_test, y_pred)
confusion_matrix_result = confusion_matrix(y_test, y_pred)

# 특성 중요도 분석 및 시각화
feature_importances = rf_classifier.feature_importances_
features = X_train.columns
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
sorted_idx = feature_importance_df['Importance'].argsort()
plt.figure(figsize=(10, 8))
plt.title('Feature Importances')
plt.barh(range(len(feature_importances)), feature_importance_df['Importance'][sorted_idx], align='center')
plt.yticks(range(len(feature_importances)), [features[i] for i in sorted_idx])
plt.xlabel('Relative Importance')
plt.show()

# 모델 성능 보고서 출력
print("Classification Report:\n", classification_report_result)
print("Confusion Matrix:\n", confusion_matrix_result)


  top10_avg = kosdaq_companies_data.groupby('회계년도')[col].nlargest(10).mean(level=0)


ValueError: Input X contains NaN.
SMOTE does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [6]:
import pandas as pd
from scipy.stats.mstats import winsorize
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

# 데이터셋 B (코스닥 전체 기업) 로드
kosdaq_companies_file_path = '111.csv'
kosdaq_companies_data = pd.read_csv(kosdaq_companies_file_path, encoding='CP949')

# 데이터셋 A (상장폐지 기업) 로드 및 전처리
kosdaq_delisted_file_path = '상장폐지기업사유(피흡수합병제외).xlsx'
kosdaq_delisted_data = pd.read_excel(kosdaq_delisted_file_path)

# 데이터셋 A의 결측치 제거 및 이상치 처리
numeric_columns = kosdaq_delisted_data.select_dtypes(include=['float64', 'int64']).columns
for col in numeric_columns:
    kosdaq_delisted_data[col] = winsorize(kosdaq_delisted_data[col], limits=[0.01, 0.01])  # 이상치 처리

# 데이터셋 B의 재무비율 칼럼 식별 및 회계년도 형식 변환
financial_ratios_columns = kosdaq_companies_data.columns.drop(['회사명', '거래소코드', '회계년도'])
kosdaq_companies_data['회계년도'] = kosdaq_companies_data['회계년도'].str.split('/').str[0].astype(int)

# 각 재무비율에 대해 파생변수 생성
for col in financial_ratios_columns:
    # 시차 차이 계산
    kosdaq_companies_data[f'{col}_시차차이'] = kosdaq_companies_data.groupby('회사명')[col].diff()
    # 시차 비율 계산
    kosdaq_companies_data[f'{col}_시차비율'] = kosdaq_companies_data.groupby('회사명')[col].pct_change()
    # 연도별 상위 10개 기업 평균과의 차이
    top10_avg = kosdaq_companies_data.groupby('회계년도')[col].nlargest(10).mean(level=0)
    kosdaq_companies_data = kosdaq_companies_data.join(top10_avg, on='회계년도', rsuffix='_상위10평균')
    kosdaq_companies_data[f'{col}_상위10차이'] = kosdaq_companies_data[f'{col}_상위10평균'] - kosdaq_companies_data[col]
    # 연도별 전체 평균과의 차이
    overall_avg = kosdaq_companies_data.groupby('회계년도')[col].transform('mean')
    kosdaq_companies_data[f'{col}_전체평균차이'] = overall_avg - kosdaq_companies_data[col]

# 상위 10개 기업 평균 관련 열 제거
top10_avg_columns = [col for col in kosdaq_companies_data.columns if '_상위10평균' in col]
kosdaq_companies_data_cleaned = kosdaq_companies_data.drop(columns=top10_avg_columns)

# 데이터셋 A와 B를 레이블링
kosdaq_delisted_data_cleaned = kosdaq_delisted_data.copy()
kosdaq_delisted_data_cleaned['부도여부'] = 1
kosdaq_companies_data_cleaned['부도여부'] = 0

# 데이터 병합
combined_data = pd.concat([kosdaq_companies_data_cleaned, kosdaq_delisted_data_cleaned], ignore_index=True)

financial_ratios_columns = ['매출액증가율(IFRS)', '총자본증가율(IFRS)', '매출액순이익률(IFRS)', '총자본정상영업이익률(IFRS)',
                            'CASH FLOW 대 부채비율(IFRS)', 'CASH FLOW 대 총자본비율(IFRS)', '총자본회전률(IFRS)',
                            '자기자본회전률(IFRS)', '총자본투자효율(IFRS)', '설비투자효율(IFRS)']

X = kosdaq_companies_data_cleaned[financial_ratios_columns]

# 결측치를 0으로 대체
X = X.fillna(0)

y = combined_data['부도여부']

# y의 개수를 일치시키기 위해 자르기
y = y[:len(X)]

# SMOTE 적용
smote = SMOTE(random_state=0, sampling_strategy='minority')  # 'minority' 클래스 쪽만 조정
X_resampled, y_resampled = smote.fit_resample(X, y)

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=0)

# RandomForest 모델 생성 및 학습
rf_classifier = RandomForestClassifier(random_state=0)
rf_classifier.fit(X_train, y_train)

# 모델 성능 평가
y_pred = rf_classifier.predict(X_test)
classification_report_result = classification_report(y_test, y_pred)
confusion_matrix_result = confusion_matrix(y_test, y_pred)

# 특성 중요도 분석 및 시각화
feature_importances = rf_classifier.feature_importances_
features = X_train.columns
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
sorted_idx = feature_importance_df['Importance'].argsort()
plt.figure(figsize=(10, 8))
plt.title('Feature Importances')
plt.barh(range(len(feature_importances)), feature_importance_df['Importance'][sorted_idx], align='center')
plt.yticks(range(len(feature_importances)), [features[i] for i in sorted_idx])
plt.xlabel('Relative Importance')
plt.show()

# 모델 성능 보고서 출력
print("Classification Report:\n", classification_report_result)
print("Confusion Matrix:\n", confusion_matrix_result)


  top10_avg = kosdaq_companies_data.groupby('회계년도')[col].nlargest(10).mean(level=0)


ValueError: The target 'y' needs to have more than 1 class. Got 1 class instead

In [8]:
X.columns

Index(['매출액증가율(IFRS)', '총자본증가율(IFRS)', '매출액순이익률(IFRS)', '총자본정상영업이익률(IFRS)',
       'CASH FLOW 대 부채비율(IFRS)', 'CASH FLOW 대 총자본비율(IFRS)', '총자본회전률(IFRS)',
       '자기자본회전률(IFRS)', '총자본투자효율(IFRS)', '설비투자효율(IFRS)'],
      dtype='object')