In [2]:
import pandas as pd

# CSV 파일 로드 (한국어 인코딩 'cp949' 사용)
file_path = '111.csv'
df = pd.read_csv(file_path, encoding='cp949')

# 결측치가 포함된 행 삭제
df_clean = df.dropna()

# 결측치가 제거된 데이터프레임 저장
clean_file_path = 'cleaned_111.csv'
df_clean.to_csv(clean_file_path, index=False)

# 2011, 2012, 2013, 2014년 회계년도 데이터를 모두 가지고 있는 회사 확인
required_years = ['2011/12', '2012/12', '2013/12', '2014/12']
company_year_coverage = df_clean.groupby('회사명')['회계년도'].apply(lambda x: all(year in x.values for year in required_years))

# 필요한 회계년도를 모두 갖고 있는 회사만 필터링
companies_with_all_years = company_year_coverage[company_year_coverage].index
df_filtered_companies = df_clean[df_clean['회사명'].isin(companies_with_all_years)]

# 필요한 회계년도만 남김
df_final = df_filtered_companies[df_filtered_companies['회계년도'].isin(required_years)]

# 최종 필터링된 데이터프레임 저장
final_file_path = 'final_111.csv'
df_final.to_csv(final_file_path, index=False)


In [3]:
import pandas as pd

# 엑셀 파일 로드
kosdaq_delisted_file_path = '상장폐지기업사유(피흡수합병제외).xlsx'
kosdaq_delisted_data = pd.read_excel(kosdaq_delisted_file_path)

# CSV 파일 로드 (CP949 인코딩 사용)
kosdaq_companies_file_path = '111.csv'
kosdaq_companies_data = pd.read_csv(kosdaq_companies_file_path, encoding='CP949')

# 데이터셋 A(상장폐지 기업)의 모든 종목코드가 데이터셋 B(전체 코스닥 기업)에 있는지 확인
delisted_companies_codes = set(kosdaq_delisted_data['종목코드'])
all_kosdaq_companies_codes = set(kosdaq_companies_data['거래소코드'])
missing_companies_codes = delisted_companies_codes - all_kosdaq_companies_codes

# 누락된 회사명 추출
missing_companies_names = kosdaq_delisted_data[kosdaq_delisted_data['종목코드'].isin(missing_companies_codes)]['회사명']
missing_companies_names_list = missing_companies_names.tolist()

# 데이터셋 A에서 누락된 회사들 제거
kosdaq_delisted_data_cleaned = kosdaq_delisted_data[~kosdaq_delisted_data['회사명'].isin(missing_companies_names_list)]


In [4]:
import pandas as pd
from scipy.stats.mstats import winsorize

# 데이터셋 B (코스닥 전체 기업) 로드
kosdaq_companies_file_path = '111.csv'
kosdaq_companies_data = pd.read_csv(kosdaq_companies_file_path, encoding='CP949')

# 데이터셋 A (상장폐지 기업) 로드 및 전처리
kosdaq_delisted_file_path = '상장폐지기업사유(피흡수합병제외).xlsx'
kosdaq_delisted_data = pd.read_excel(kosdaq_delisted_file_path)

# 데이터셋 A의 결측치 제거 및 이상치 처리
kosdaq_delisted_data_cleaned = kosdaq_delisted_data.dropna()  # 결측치 제거
numeric_columns = kosdaq_delisted_data_cleaned.select_dtypes(include=['float64', 'int64']).columns
for col in numeric_columns:
    kosdaq_delisted_data_cleaned[col] = winsorize(kosdaq_delisted_data_cleaned[col], limits=[0.01, 0.01])  # 이상치 처리

# 데이터셋 B의 재무비율 칼럼 식별 및 회계년도 형식 변환
financial_ratios_columns = kosdaq_companies_data.columns.drop(['회사명', '거래소코드', '회계년도'])
kosdaq_companies_data['회계년도'] = kosdaq_companies_data['회계년도'].str.split('/').str[0].astype(int)

# 각 재무비율에 대해 파생변수 생성
for col in financial_ratios_columns:
    # 시차 차이 계산
    kosdaq_companies_data[f'{col}_시차차이'] = kosdaq_companies_data.groupby('회사명')[col].diff()
    # 시차 비율 계산
    kosdaq_companies_data[f'{col}_시차비율'] = kosdaq_companies_data.groupby('회사명')[col].pct_change()
    # 연도별 상위 10개 기업 평균과의 차이
    top10_avg = kosdaq_companies_data.groupby('회계년도')[col].nlargest(10).mean(level=0)
    kosdaq_companies_data = kosdaq_companies_data.join(top10_avg, on='회계년도', rsuffix='_상위10평균')
    kosdaq_companies_data[f'{col}_상위10차이'] = kosdaq_companies_data[f'{col}_상위10평균'] - kosdaq_companies_data[col]
    # 연도별 전체 평균과의 차이
    overall_avg = kosdaq_companies_data.groupby('회계년도')[col].transform('mean')
    kosdaq_companies_data[f'{col}_전체평균차이'] = overall_avg - kosdaq_companies_data[col]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kosdaq_delisted_data_cleaned[col] = winsorize(kosdaq_delisted_data_cleaned[col], limits=[0.01, 0.01])  # 이상치 처리
  top10_avg = kosdaq_companies_data.groupby('회계년도')[col].nlargest(10).mean(level=0)


In [5]:
kosdaq_companies_data.head()


Unnamed: 0,회사명,거래소코드,회계년도,매출액증가율(IFRS),총자본증가율(IFRS),매출액순이익률(IFRS),총자본정상영업이익률(IFRS),CASH FLOW 대 부채비율(IFRS),CASH FLOW 대 총자본비율(IFRS),총자본회전률(IFRS),...,총자본투자효율(IFRS)_시차차이,총자본투자효율(IFRS)_시차비율,총자본투자효율(IFRS)_상위10평균,총자본투자효율(IFRS)_상위10차이,총자본투자효율(IFRS)_전체평균차이,설비투자효율(IFRS)_시차차이,설비투자효율(IFRS)_시차비율,설비투자효율(IFRS)_상위10평균,설비투자효율(IFRS)_상위10차이,설비투자효율(IFRS)_전체평균차이
0,(주)CMG제약,58820,2010,11.86,17.25,-61.19,-30.38,-69.69,-21.79,0.46,...,,,73.501,70.521,6.615046,,,6398.165,6390.425,-594.196439
1,(주)CMG제약,58820,2011,56.4,2.89,-7.12,2.63,-45.76,-15.38,0.65,...,10.34,3.469799,84.62,71.3,-1.978467,30.37,3.923773,10416.706,10378.596,-560.48626
2,(주)CMG제약,58820,2012,-8.43,106.87,-42.4,-9.57,-7.76,-3.46,0.38,...,-13.47,-1.011261,85.614,85.764,15.054752,-39.06,-1.024928,25174.179,25175.129,374.511887
3,(주)CMG제약,58820,2013,25.02,-1.8,1.65,1.46,-38.65,-4.78,0.36,...,12.57,-83.8,71.484,59.064,2.728453,78.39,-82.515789,121406.609,121329.169,1425.77664
4,(주)CMG제약,58820,2014,11.96,3.87,0.75,0.78,-17.44,-2.72,0.4,...,-0.27,-0.021739,91.903,79.753,3.850304,-39.62,-0.511622,3357458.501,3357420.681,37878.647669


In [10]:
# 상위 10개 기업 평균과 관련된 열의 이름을 찾기
top10_avg_columns = [col for col in kosdaq_companies_data.columns if '_상위10평균' in col]

# 상위 10개 기업 평균 관련 열 제거
kosdaq_companies_data_cleaned = kosdaq_companies_data.drop(columns=top10_avg_columns)


In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

# 데이터셋 B (코스닥 전체 기업) 및 데이터셋 A (상장폐지 기업) 로드
kosdaq_companies_file_path = '111.csv'
kosdaq_companies_data = pd.read_csv(kosdaq_companies_file_path, encoding='CP949')
kosdaq_delisted_file_path = '상장폐지기업사유(피흡수합병제외).xlsx'
kosdaq_delisted_data = pd.read_excel(kosdaq_delisted_file_path)

# 결측치 제거
kosdaq_companies_data_clean = kosdaq_companies_data.dropna()

# 재무비율만 선택 (수치형 데이터)
financial_ratios = kosdaq_companies_data_clean.select_dtypes(include=['float64', 'int64'])

# 부도기업 레이블링
kosdaq_companies_data_clean['부도여부'] = kosdaq_companies_data_clean['회사명'].isin(kosdaq_delisted_data['회사명']).astype(int)

# 독립변수(X)와 종속변수(y) 분리
X = financial_ratios
y = kosdaq_companies_data_clean['부도여부']

# SMOTE 적용
smote = SMOTE(random_state=0)
X_resampled, y_resampled = smote.fit_resample(X, y)

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=0)

# RandomForest 모델 생성 및 학습
rf_classifier = RandomForestClassifier(random_state=0)
rf_classifier.fit(X_train, y_train)

# 모델 성능 평가
y_pred = rf_classifier.predict(X_test)
classification_report_result = classification_report(y_test, y_pred)
confusion_matrix_result = confusion_matrix(y_test, y_pred)

# 특성 중요도 분석 및 시각화
feature_importances = rf_classifier.feature_importances_
features = X_train.columns
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
sorted_idx = feature_importance_df['Importance'].argsort()
plt.figure(figsize=(10, 8))
plt.title('Feature Importances')
plt.barh(range(len(feature_importances)), feature_importance_df['Importance'][sorted_idx], align='center')
plt.yticks(range(len(feature_importances)), [features[i] for i in sorted_idx])
plt.xlabel('Relative Importance')
plt.show()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kosdaq_companies_data_clean['부도여부'] = kosdaq_companies_data_clean['회사명'].isin(kosdaq_delisted_data['회사명']).astype(int)


ValueError: The target 'y' needs to have more than 1 class. Got 1 class instead

In [None]:
kos