In [7]:
import pandas as pd

# 엑셀 파일 로드
kosdaq_delisted_file_path = '상장폐지기업사유(피흡수합병제외).xlsx'
kosdaq_delisted_data = pd.read_excel(kosdaq_delisted_file_path)

# CSV 파일 로드 (CP949 인코딩 사용)
kosdaq_companies_file_path = '111.csv'
kosdaq_companies_data = pd.read_csv(kosdaq_companies_file_path, encoding='CP949')

# 데이터셋 A(상장폐지 기업)의 모든 종목코드가 데이터셋 B(전체 코스닥 기업)에 있는지 확인
delisted_companies_codes = set(kosdaq_delisted_data['종목코드'])
all_kosdaq_companies_codes = set(kosdaq_companies_data['거래소코드'])
missing_companies_codes = delisted_companies_codes - all_kosdaq_companies_codes

# 누락된 회사명 추출
missing_companies_names = kosdaq_delisted_data[kosdaq_delisted_data['종목코드'].isin(missing_companies_codes)]['회사명']
missing_companies_names_list = missing_companies_names.tolist()

# 데이터셋 A에서 누락된 회사들 제거
kosdaq_delisted_data_cleaned = kosdaq_delisted_data[~kosdaq_delisted_data['회사명'].isin(missing_companies_names_list)]


In [10]:
import pandas as pd

# 엑셀 파일 로드
kosdaq_delisted_file_path = '상장폐지기업사유(피흡수합병제외).xlsx'
kosdaq_delisted_data = pd.read_excel(kosdaq_delisted_file_path)

# CSV 파일 로드 (CP949 인코딩 사용)
kosdaq_companies_file_path = '111.csv'
kosdaq_companies_data = pd.read_csv(kosdaq_companies_file_path, encoding='CP949')

# 데이터셋 A(상장폐지 기업)의 모든 종목코드가 데이터셋 B(전체 코스닥 기업)에 있는지 확인
delisted_companies_codes = set(kosdaq_delisted_data['종목코드'])
all_kosdaq_companies_codes = set(kosdaq_companies_data['거래소코드'])
missing_companies_codes = delisted_companies_codes - all_kosdaq_companies_codes

# 누락된 회사명 추출
missing_companies_names = kosdaq_delisted_data[kosdaq_delisted_data['종목코드'].isin(missing_companies_codes)]['회사명']
missing_companies_names_list = missing_companies_names.tolist()

# 데이터셋 A에서 누락된 회사들 제거
kosdaq_delisted_data_cleaned = kosdaq_delisted_data[~kosdaq_delisted_data['회사명'].isin(missing_companies_names_list)]

# kosdaq_companies_data에 있는 회사명을 0으로 레이블링
kosdaq_companies_data['부도여부'] = 0

# kosdaq_delisted_data_cleaned에 있는 회사명을 1로 레이블링
kosdaq_delisted_data_cleaned['부도여부'] = 1

# 두 데이터프레임을 병합
combined_data = pd.concat([kosdaq_companies_data, kosdaq_delisted_data_cleaned], ignore_index=True)

# 결과 확인
print(combined_data['부도여부'].value_counts())


0    5395
1     111
Name: 부도여부, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kosdaq_delisted_data_cleaned['부도여부'] = 1


In [9]:
kosdaq_delisted_data_cleaned['회사명']

0       실리콘화일
1        CU전자
2       아라온테크
3        쌍용건설
4      나노트로닉스
        ...  
128       제이콤
129     엠엔에프씨
130      대선조선
131    스톰이앤에프
132     중앙디자인
Name: 회사명, Length: 111, dtype: object

In [15]:
import pandas as pd
from scipy.stats.mstats import winsorize
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

# 데이터셋 B (코스닥 전체 기업) 로드
kosdaq_companies_file_path = '111.csv'
kosdaq_companies_data = pd.read_csv(kosdaq_companies_file_path, encoding='CP949')

# 데이터셋 A (상장폐지 기업) 로드 및 전처리
kosdaq_delisted_file_path = '상장폐지기업사유(피흡수합병제외).xlsx'
kosdaq_delisted_data = pd.read_excel(kosdaq_delisted_file_path)

# 데이터셋 A의 결측치 제거 및 이상치 처리
numeric_columns = kosdaq_delisted_data.select_dtypes(include=['float64', 'int64']).columns
for col in numeric_columns:
    kosdaq_delisted_data[col] = winsorize(kosdaq_delisted_data[col], limits=[0.01, 0.01])  # 이상치 처리

# 데이터셋 B의 재무비율 칼럼 식별 및 회계년도 형식 변환
financial_ratios_columns = kosdaq_companies_data.columns.drop(['회사명', '거래소코드', '회계년도'])
kosdaq_companies_data['회계년도'] = kosdaq_companies_data['회계년도'].str.split('/').str[0].astype(int)

# 각 재무비율에 대해 파생변수 생성
for col in financial_ratios_columns:
    # 시차 차이 계산
    kosdaq_companies_data[f'{col}_시차차이'] = kosdaq_companies_data.groupby('회사명')[col].diff()
    # 시차 비율 계산
    kosdaq_companies_data[f'{col}_시차비율'] = kosdaq_companies_data.groupby('회사명')[col].pct_change()
    # 연도별 상위 10개 기업 평균과의 차이
    top10_avg = kosdaq_companies_data.groupby('회계년도')[col].nlargest(10).mean(level=0)
    kosdaq_companies_data = kosdaq_companies_data.join(top10_avg, on='회계년도', rsuffix='_상위10평균')
    kosdaq_companies_data[f'{col}_상위10차이'] = kosdaq_companies_data[f'{col}_상위10평균'] - kosdaq_companies_data[col]
    # 연도별 전체 평균과의 차이
    overall_avg = kosdaq_companies_data.groupby('회계년도')[col].transform('mean')
    kosdaq_companies_data[f'{col}_전체평균차이'] = overall_avg - kosdaq_companies_data[col]

# 상위 10개 기업 평균 관련 열 제거
top10_avg_columns = [col for col in kosdaq_companies_data.columns if '_상위10평균' in col]
kosdaq_companies_data_cleaned = kosdaq_companies_data.drop(columns=top10_avg_columns)

# 데이터셋 A와 B를 레이블링
kosdaq_delisted_data_cleaned = kosdaq_delisted_data.copy()
kosdaq_delisted_data_cleaned['부도여부'] = 1
kosdaq_companies_data_cleaned['부도여부'] = 0

# 데이터 병합
combined_data = pd.concat([kosdaq_companies_data_cleaned, kosdaq_delisted_data_cleaned], ignore_index=True)

# 독립변수(X)와 종속변수(y) 분리
X = combined_data.drop(columns=['부도여부', '회사명', '거래소코드', '종목코드', '회계년도'])
y = combined_data['부도여부']

# 결측치를 0으로 대체
X = X.fillna(0)


  top10_avg = kosdaq_companies_data.groupby('회계년도')[col].nlargest(10).mean(level=0)


In [20]:
import pandas as pd
from scipy.stats.mstats import winsorize
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

# 데이터셋 B (코스닥 전체 기업) 로드
kosdaq_companies_file_path = '111.csv'
kosdaq_companies_data = pd.read_csv(kosdaq_companies_file_path, encoding='CP949')

# 데이터셋 A (상장폐지 기업) 로드 및 전처리
kosdaq_delisted_file_path = '상장폐지기업사유(피흡수합병제외).xlsx'
kosdaq_delisted_data = pd.read_excel(kosdaq_delisted_file_path)

# 데이터셋 A의 결측치 제거 및 이상치 처리
numeric_columns = kosdaq_delisted_data.select_dtypes(include=['float64', 'int64']).columns
for col in numeric_columns:
    kosdaq_delisted_data[col] = winsorize(kosdaq_delisted_data[col], limits=[0.01, 0.01])  # 이상치 처리

# 데이터셋 B의 재무비율 칼럼 식별 및 회계년도 형식 변환
financial_ratios_columns = kosdaq_companies_data.columns.drop(['회사명', '거래소코드', '회계년도'])
kosdaq_companies_data['회계년도'] = kosdaq_companies_data['회계년도'].str.split('/').str[0].astype(int)

# 각 재무비율에 대해 파생변수 생성
for col in financial_ratios_columns:
    # 시차 차이 계산
    kosdaq_companies_data[f'{col}_시차차이'] = kosdaq_companies_data.groupby('회사명')[col].diff()
    # 시차 비율 계산
    kosdaq_companies_data[f'{col}_시차비율'] = kosdaq_companies_data.groupby('회사명')[col].pct_change()
    # 연도별 상위 10개 기업 평균과의 차이
    top10_avg = kosdaq_companies_data.groupby('회계년도')[col].nlargest(10).mean(level=0)
    kosdaq_companies_data = kosdaq_companies_data.join(top10_avg, on='회계년도', rsuffix='_상위10평균')
    kosdaq_companies_data[f'{col}_상위10차이'] = kosdaq_companies_data[f'{col}_상위10평균'] - kosdaq_companies_data[col]
    # 연도별 전체 평균과의 차이
    overall_avg = kosdaq_companies_data.groupby('회계년도')[col].transform('mean')
    kosdaq_companies_data[f'{col}_전체평균차이'] = overall_avg - kosdaq_companies_data[col]

# 상위 10개 기업 평균 관련 열 제거
top10_avg_columns = [col for col in kosdaq_companies_data.columns if '_상위10평균' in col]
kosdaq_companies_data_cleaned = kosdaq_companies_data.drop(columns=top10_avg_columns)

# 데이터셋 A와 B를 레이블링
kosdaq_delisted_data_cleaned = kosdaq_delisted_data.copy()
kosdaq_delisted_data_cleaned['부도여부'] = 1
kosdaq_companies_data_cleaned['부도여부'] = 0

# 데이터 병합
combined_data = pd.concat([kosdaq_companies_data_cleaned, kosdaq_delisted_data_cleaned], ignore_index=True)

financial_ratios_columns = ['매출액증가율(IFRS)', '총자본증가율(IFRS)', '매출액순이익률(IFRS)', '총자본정상영업이익률(IFRS)',
                            'CASH FLOW 대 부채비율(IFRS)', 'CASH FLOW 대 총자본비율(IFRS)', '총자본회전률(IFRS)',
                            '자기자본회전률(IFRS)', '총자본투자효율(IFRS)', '설비투자효율(IFRS)']
X = kosdaq_companies_data_cleaned[financial_ratios_columns]

y = combined_data['부도여부']


# X 데이터프레임과 y 시리즈의 행 수를 맞추기 위해 y에 해당하는 행 삭제
combined_data = pd.concat([kosdaq_companies_data_cleaned, kosdaq_delisted_data_cleaned], ignore_index=True)
X = kosdaq_companies_data_cleaned[financial_ratios_columns]
y = combined_data['부도여부']

# 결측치를 0으로 대체
# X = X.fillna(0)

# SMOTE 적용
smote = SMOTE(random_state=0)
X_resampled, y_resampled = smote.fit_resample(X, y)
# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=0)

# RandomForest 모델 생성 및 학습
rf_classifier = RandomForestClassifier(random_state=0)
rf_classifier.fit(X_train, y_train)

# 모델 성능 평가
y_pred = rf_classifier.predict(X_test)
classification_report_result = classification_report(y_test, y_pred)
confusion_matrix_result = confusion_matrix(y_test, y_pred)

# 특성 중요도 분석 및 시각화
feature_importances = rf_classifier.feature_importances_
features = X_train.columns
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
sorted_idx = feature_importance_df['Importance'].argsort()
plt.figure(figsize=(10, 8))
plt.title('Feature Importances')
plt.barh(range(len(feature_importances)), feature_importance_df['Importance'][sorted_idx], align='center')
plt.yticks(range(len(feature_importances)), [features[i] for i in sorted_idx])
plt.xlabel('Relative Importance')
plt.show()

# 모델 성능 보고서 출력
print("Classification Report:\n", classification_report_result)
print("Confusion Matrix:\n", confusion_matrix_result)


  top10_avg = kosdaq_companies_data.groupby('회계년도')[col].nlargest(10).mean(level=0)


ValueError: Found input variables with inconsistent numbers of samples: [5395, 5528]

In [2]:
import pandas as pd
from scipy.stats.mstats import winsorize

# 데이터셋 B (코스닥 전체 기업) 로드
kosdaq_companies_file_path = '111.csv'
kosdaq_companies_data = pd.read_csv(kosdaq_companies_file_path, encoding='CP949')

# 데이터셋 A (상장폐지 기업) 로드 및 전처리
kosdaq_delisted_file_path = '상장폐지기업사유(피흡수합병제외).xlsx'
kosdaq_delisted_data = pd.read_excel(kosdaq_delisted_file_path)

# 데이터셋 A의 결측치 제거 및 이상치 처리

numeric_columns = kosdaq_delisted_data.select_dtypes(include=['float64', 'int64']).columns
for col in numeric_columns:
    kosdaq_delisted_data[col] = winsorize(kosdaq_delisted_data[col], limits=[0.01, 0.01])  # 이상치 처리

# 데이터셋 B의 재무비율 칼럼 식별 및 회계년도 형식 변환
financial_ratios_columns = kosdaq_companies_data.columns.drop(['회사명', '거래소코드', '회계년도'])
kosdaq_companies_data['회계년도'] = kosdaq_companies_data['회계년도'].str.split('/').str[0].astype(int)

# 각 재무비율에 대해 파생변수 생성
for col in financial_ratios_columns:
    # 시차 차이 계산
    kosdaq_companies_data[f'{col}_시차차이'] = kosdaq_companies_data.groupby('회사명')[col].diff()
    # 시차 비율 계산
    kosdaq_companies_data[f'{col}_시차비율'] = kosdaq_companies_data.groupby('회사명')[col].pct_change()
    # 연도별 상위 10개 기업 평균과의 차이
    top10_avg = kosdaq_companies_data.groupby('회계년도')[col].nlargest(10).mean(level=0)
    kosdaq_companies_data = kosdaq_companies_data.join(top10_avg, on='회계년도', rsuffix='_상위10평균')
    kosdaq_companies_data[f'{col}_상위10차이'] = kosdaq_companies_data[f'{col}_상위10평균'] - kosdaq_companies_data[col]
    # 연도별 전체 평균과의 차이
    overall_avg = kosdaq_companies_data.groupby('회계년도')[col].transform('mean')
    kosdaq_companies_data[f'{col}_전체평균차이'] = overall_avg - kosdaq_companies_data[col]

    # 상위 10개 기업 평균과 관련된 열의 이름을 찾기
top10_avg_columns = [col for col in kosdaq_companies_data.columns if '_상위10평균' in col]

# 상위 10개 기업 평균 관련 열 제거
kosdaq_companies_data_cleaned = kosdaq_companies_data.drop(columns=top10_avg_columns)


  top10_avg = kosdaq_companies_data.groupby('회계년도')[col].nlargest(10).mean(level=0)


In [3]:
#GPT 결측치 처리 코드

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

# 재무비율만 선택 (수치형 데이터)
financial_ratios = kosdaq_companies_data_cleaned.select_dtypes(include=['float64', 'int64'])

# 부도기업 레이블링
kosdaq_companies_data_cleaned['부도여부'] = kosdaq_companies_data_cleaned['회사명'].isin(kosdaq_delisted_data['회사명']).astype(int)

# 독립변수(X)와 종속변수(y) 분리
X = financial_ratios
y = kosdaq_companies_data_cleaned['부도여부']

# SMOTE 적용
smote = SMOTE(random_state=0)
# 결측값이 있는 행 삭제
X_resampled, y_resampled = smote.fit_resample(X.dropna(), y[y.index.isin(X.dropna().index)])

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=0)

# RandomForest 모델 생성 및 학습
rf_classifier = RandomForestClassifier(random_state=0)
rf_classifier.fit(X_train, y_train)

# 모델 성능 평가
y_pred = rf_classifier.predict(X_test)
classification_report_result = classification_report(y_test, y_pred)
confusion_matrix_result = confusion_matrix(y_test, y_pred)

# 특성 중요도 분석 및 시각화
feature_importances = rf_classifier.feature_importances_
features = X_train.columns
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
sorted_idx = feature_importance_df['Importance'].argsort()
plt.figure(figsize=(10, 8))
plt.title('Feature Importances')
plt.barh(range(len(feature_importances)), feature_importance_df['Importance'][sorted_idx], align='center')
plt.yticks(range(len(feature_importances)), [features[i] for i in sorted_idx])
plt.xlabel('Relative Importance')
plt.show()


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


ValueError: Input X contains infinity or a value too large for dtype('float64').

In [6]:
import pandas as pd

# 엑셀 파일 로드
kosdaq_delisted_file_path = '상장폐지기업사유(피흡수합병제외).xlsx'
kosdaq_delisted_data = pd.read_excel(kosdaq_delisted_file_path)

# 데이터셋 B (코스닥 전체 기업) 로드
kosdaq_companies_file_path = '111.csv'
kosdaq_companies_data = pd.read_csv(kosdaq_companies_file_path, encoding='CP949')

# 부도 여부 레이블 생성 (회사명이 존재하면 1, 존재하지 않으면 0)
kosdaq_companies_data['부도여부'] = kosdaq_companies_data['회사명'].isin(kosdaq_delisted_data['회사명']).astype(int)

# 부도 여부 레이블 확인
print(kosdaq_companies_data['부도여부'].value_counts())


0    5395
Name: 부도여부, dtype: int64


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from scipy.stats.mstats import winsorize

# 데이터셋 B (코스닥 전체 기업) 로드
kosdaq_companies_file_path = '111.csv'
kosdaq_companies_data = pd.read_csv(kosdaq_companies_file_path, encoding='CP949')

# 데이터셋 A (상장폐지 기업) 로드 및 전처리
kosdaq_delisted_file_path = '상장폐지기업사유(피흡수합병제외).xlsx'
kosdaq_delisted_data = pd.read_excel(kosdaq_delisted_file_path)

# 데이터셋 A의 결측치 제거 및 이상치 처리
numeric_columns = kosdaq_delisted_data.select_dtypes(include=['float64', 'int64']).columns
for col in numeric_columns:
    # 이상치 처리 (1% 범위 Winsorizing)
    kosdaq_delisted_data[col] = winsorize(kosdaq_delisted_data[col], limits=[0.01, 0.01])

# 데이터셋 B의 재무비율 칼럼 식별 및 회계년도 형식 변환
financial_ratios_columns = kosdaq_companies_data.columns.drop(['회사명', '거래소코드', '회계년도'])
kosdaq_companies_data['회계년도'] = kosdaq_companies_data['회계년도'].str.split('/').str[0].astype(int)

# 각 재무비율에 대해 파생변수 생성
for col in financial_ratios_columns:
    # 시차 차이 계산
    kosdaq_companies_data[f'{col}_시차차이'] = kosdaq_companies_data.groupby('회사명')[col].diff()
    # 시차 비율 계산
    kosdaq_companies_data[f'{col}_시차비율'] = kosdaq_companies_data.groupby('회사명')[col].pct_change()
    # 연도별 상위 10개 기업 평균과의 차이
    top10_avg = kosdaq_companies_data.groupby('회계년도')[col].nlargest(10).mean(level=0)
    kosdaq_companies_data = kosdaq_companies_data.join(top10_avg, on='회계년도', rsuffix='_상위10평균')
    kosdaq_companies_data[f'{col}_상위10차이'] = kosdaq_companies_data[f'{col}_상위10평균'] - kosdaq_companies_data[col]
    # 연도별 전체 평균과의 차이
    overall_avg = kosdaq_companies_data.groupby('회계년도')[col].transform('mean')
    kosdaq_companies_data[f'{col}_전체평균차이'] = overall_avg - kosdaq_companies_data[col]

# 상위 10개 기업 평균 관련 열 제거
top10_avg_columns = [col for col in kosdaq_companies_data.columns if '_상위10평균' in col]
kosdaq_companies_data_cleaned = kosdaq_companies_data.drop(columns=top10_avg_columns)

# 재무비율만 선택 (수치형 데이터)
financial_ratios = kosdaq_companies_data_cleaned.select_dtypes(include=['float64', 'int64'])

# 부도기업 레이블링
kosdaq_companies_data_cleaned['부도여부'] = kosdaq_companies_data_cleaned['회사명'].isin(kosdaq_delisted_data['회사명']).astype(int)

# 독립변수(X)와 종속변수(y) 분리
X = financial_ratios
y = kosdaq_companies_data_cleaned['부도여부']

# Winsorizing 적용 (하위 1%와 상위 1% 범위에서)
lower_percentile = 0.01  # 변경
upper_percentile = 0.99  # 변경

for column in X.columns:
    X[column] = winsorize(X[column], limits=(lower_percentile, upper_percentile))

# SMOTE 적용
smote = SMOTE(random_state=0)
X_resampled, y_resampled = smote.fit_resample(X.dropna(), y[y.index.isin(X.dropna().index)])

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=0)

# RandomForest 모델 생성 및 학습
rf_classifier = RandomForestClassifier(random_state=0)
rf_classifier.fit(X_train, y_train)

# 모델 성능 평가
y_pred = rf_classifier.predict(X_test)
classification_report_result = classification_report(y_test, y_pred)
confusion_matrix_result = confusion_matrix(y_test, y_pred)

# 특성 중요도 분석 및 시각화
feature_importances = rf_classifier.feature_importances_
features = X_train.columns
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
sorted_idx = feature_importance_df['Importance'].argsort()
plt.figure(figsize=(10, 8))
plt.title('Feature Importances')
plt.barh(range(len(feature_importances)), feature_importance_df['Importance'][sorted_idx], align='center')
plt.yticks(range(len(feature_importances)), [features[i] for i in sorted_idx])
plt.xlabel('Relative Importance')
plt.show()


  top10_avg = kosdaq_companies_data.groupby('회계년도')[col].nlargest(10).mean(level=0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = winsorize(X[column], limits=(lower_percentile, upper_percentile))


ValueError: The target 'y' needs to have more than 1 class. Got 1 class instead

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

# 재무비율만 선택 (수치형 데이터)
financial_ratios = kosdaq_companies_data_cleaned.select_dtypes(include=['float64', 'int64'])

# 부도기업 레이블링
kosdaq_companies_data_cleaned['부도여부'] = kosdaq_companies_data_cleaned['회사명'].isin(kosdaq_delisted_data['회사명']).astype(int)


# 독립변수(X)와 종속변수(y) 분리
X = financial_ratios
y = kosdaq_companies_data_cleaned['부도여부']

# SMOTE 적용
smote = SMOTE(random_state=0)
X_resampled, y_resampled = smote.fit_resample(X, y)

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=0)

# RandomForest 모델 생성 및 학습
rf_classifier = RandomForestClassifier(random_state=0)
rf_classifier.fit(X_train, y_train)

# 모델 성능 평가
y_pred = rf_classifier.predict(X_test)
classification_report_result = classification_report(y_test, y_pred)
confusion_matrix_result = confusion_matrix(y_test, y_pred)

# 특성 중요도 분석 및 시각화
feature_importances = rf_classifier.feature_importances_
features = X_train.columns
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
sorted_idx = feature_importance_df['Importance'].argsort()
plt.figure(figsize=(10, 8))
plt.title('Feature Importances')
plt.barh(range(len(feature_importances)), feature_importance_df['Importance'][sorted_idx], align='center')
plt.yticks(range(len(feature_importances)), [features[i] for i in sorted_idx])
plt.xlabel('Relative Importance')
plt.show()

ValueError: Input X contains NaN.
SMOTE does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [28]:
kosdaq_companies_data_cleaned.head()


Unnamed: 0,회사명,거래소코드,회계년도,매출액증가율(IFRS),총자본증가율(IFRS),매출액순이익률(IFRS),총자본정상영업이익률(IFRS),CASH FLOW 대 부채비율(IFRS),CASH FLOW 대 총자본비율(IFRS),총자본회전률(IFRS),...,자기자본회전률(IFRS)_상위10차이,자기자본회전률(IFRS)_전체평균차이,총자본투자효율(IFRS)_시차차이,총자본투자효율(IFRS)_시차비율,총자본투자효율(IFRS)_상위10차이,총자본투자효율(IFRS)_전체평균차이,설비투자효율(IFRS)_시차차이,설비투자효율(IFRS)_시차비율,설비투자효율(IFRS)_상위10차이,설비투자효율(IFRS)_전체평균차이
0,(주)CMG제약,58820,2010,11.86,17.25,-61.19,-30.38,-69.69,-21.79,0.46,...,12.579,1.047123,,,70.521,6.615046,,,6390.425,-594.196439
1,(주)CMG제약,58820,2011,56.4,2.89,-7.12,2.63,-45.76,-15.38,0.65,...,13.754,0.906655,10.34,3.469799,71.3,-1.978467,30.37,3.923773,10378.596,-560.48626
2,(주)CMG제약,58820,2012,-8.43,106.87,-42.4,-9.57,-7.76,-3.46,0.38,...,14.06,1.165377,-13.47,-1.011261,85.764,15.054752,-39.06,-1.024928,25175.129,374.511887
3,(주)CMG제약,58820,2013,25.02,-1.8,1.65,1.46,-38.65,-4.78,0.36,...,11.718,1.234492,12.57,-83.8,59.064,2.728453,78.39,-82.515789,121329.169,1425.77664
4,(주)CMG제약,58820,2014,11.96,3.87,0.75,0.78,-17.44,-2.72,0.4,...,10.518,1.158322,-0.27,-0.021739,79.753,3.850304,-39.62,-0.511622,3357420.681,37878.647669


In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

# 데이터셋 B (코스닥 전체 기업) 및 데이터셋 A (상장폐지 기업) 로드
kosdaq_companies_file_path = '111.csv'
kosdaq_companies_data = pd.read_csv(kosdaq_companies_file_path, encoding='CP949')
kosdaq_delisted_file_path = '상장폐지기업사유(피흡수합병제외).xlsx'
kosdaq_delisted_data = pd.read_excel(kosdaq_delisted_file_path)

# 결측치 제거
kosdaq_companies_data_clean = kosdaq_companies_data.dropna()

# 재무비율만 선택 (수치형 데이터)
financial_ratios = kosdaq_companies_data_clean.select_dtypes(include=['float64', 'int64'])

# 부도기업 레이블링
kosdaq_companies_data_clean['부도여부'] = kosdaq_companies_data_clean['회사명'].isin(kosdaq_delisted_data['회사명']).astype(int)

# 독립변수(X)와 종속변수(y) 분리
X = financial_ratios
y = kosdaq_companies_data_clean['부도여부']

# SMOTE 적용
smote = SMOTE(random_state=0)
X_resampled, y_resampled = smote.fit_resample(X, y)

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=0)

# RandomForest 모델 생성 및 학습
rf_classifier = RandomForestClassifier(random_state=0)
rf_classifier.fit(X_train, y_train)

# 모델 성능 평가
y_pred = rf_classifier.predict(X_test)
classification_report_result = classification_report(y_test, y_pred)
confusion_matrix_result = confusion_matrix(y_test, y_pred)

# 특성 중요도 분석 및 시각화
feature_importances = rf_classifier.feature_importances_
features = X_train.columns
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
sorted_idx = feature_importance_df['Importance'].argsort()
plt.figure(figsize=(10, 8))
plt.title('Feature Importances')
plt.barh(range(len(feature_importances)), feature_importance_df['Importance'][sorted_idx], align='center')
plt.yticks(range(len(feature_importances)), [features[i] for i in sorted_idx])
plt.xlabel('Relative Importance')
plt.show()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kosdaq_companies_data_clean['부도여부'] = kosdaq_companies_data_clean['회사명'].isin(kosdaq_delisted_data['회사명']).astype(int)


ValueError: The target 'y' needs to have more than 1 class. Got 1 class instead

In [None]:
kos