In [3]:
import pandas as pd
from scipy import stats

# Load the CSV and take a first look at its structure.
file_path = 'cleaned_111.csv'
df = pd.read_csv(file_path)
print(df.head())  # first few rows of the data

# Basic metadata and missing-value counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
df.info()
print(df.isnull().sum())  # number of missing values per column

# Financial-state changes per company:
# average every remaining column within each (company name, fiscal year) pair.
group_keys = ['회사명', '회계년도']
df_grouped = df.groupby(group_keys).mean()
print(df_grouped.head())  # inspect the result

# t-test
# Split out the rows whose fiscal year is 2010/12 and 2019/12 respectively.
is_2010 = df['회계년도'] == '2010/12'
is_2019 = df['회계년도'] == '2019/12'
df_2010 = df[is_2010]
df_2019 = df[is_2019]

# Compare '매출액증가율(IFRS)' between the two years, ignoring missing values.
# equal_var=False selects Welch's t-test (no equal-variance assumption);
# ttest_ind treats the two groups as independent samples.
growth_col = '매출액증가율(IFRS)'
growth_2010 = df_2010[growth_col].dropna()
growth_2019 = df_2019[growth_col].dropna()
t_test_result = stats.ttest_ind(growth_2010, growth_2019, equal_var=False)
print(t_test_result)  # show the t-test result

# Derived variables
# Order rows by company name and then by fiscal year, so the per-company
# diff / pct_change below compare each row with that company's previous row.
df_sorted = df.sort_values(['회사명', '회계년도'])

# Row-to-row change of '매출액순이익률(IFRS)' within each company:
#   매출액순이익률_diff  - absolute difference from the company's previous row
#   매출액순이익률_ratio - relative change from the company's previous row
margin_by_company = df_sorted.groupby('회사명')['매출액순이익률(IFRS)']
margin_diff = margin_by_company.diff()
# fill_method=None: do not forward-fill missing margins before computing the
# ratio. The implicit default forward-fill is deprecated in recent pandas and
# would report a missing value as a 0% change instead of NaN.
# NOTE(review): pct_change divides by the previous value, so when that value is
# negative the sign of the ratio is opposite to the direction of the change
# (e.g. -61.19 -> -7.12 gives about -0.88). Confirm this is the intended metric.
margin_ratio = margin_by_company.pct_change(fill_method=None)

df_sorted['매출액순이익률_diff'] = margin_diff
df_sorted['매출액순이익률_ratio'] = margin_ratio
print(df_sorted[['회사명', '회계년도', '매출액순이익률_diff', '매출액순이익률_ratio']].head())  # inspect the result

# Top-10 companies and Gap Model variables
margin_col = '매출액순이익률(IFRS)'
margin = df_sorted[margin_col]

# Rank companies by their mean '매출액순이익률(IFRS)' and keep the 10 highest.
company_mean_margin = df_sorted.groupby('회사명')[margin_col].mean()
top_10_companies = company_mean_margin.sort_values(ascending=False).head(10)

# Benchmark 1: mean of those ten per-company means.
top_10_average = top_10_companies.mean()

# Benchmark 2: mean of the margin over every row of the data set.
industry_average = margin.mean()

# Gap Model variables: each row's margin expressed as a difference from,
# and as a ratio to, each of the two benchmarks.
df_sorted['Top_10_Gap_Diff'] = margin - top_10_average
df_sorted['Top_10_Gap_Ratio'] = margin / top_10_average
df_sorted['Average_Gap_Diff'] = margin - industry_average
df_sorted['Average_Gap_Ratio'] = margin / industry_average

preview_columns = [
    '회사명',
    '회계년도',
    '매출액순이익률(IFRS)',
    'Top_10_Gap_Diff',
    'Top_10_Gap_Ratio',
    'Average_Gap_Diff',
    'Average_Gap_Ratio',
]
print(df_sorted[preview_columns].head())  # inspect the result


        회사명  거래소코드     회계년도  매출액증가율(IFRS)  총자본증가율(IFRS)  매출액순이익률(IFRS)  \
0  (주)CMG제약  58820  2010/12         11.86         17.25         -61.19   
1  (주)CMG제약  58820  2011/12         56.40          2.89          -7.12   
2  (주)CMG제약  58820  2012/12         -8.43        106.87         -42.40   
3  (주)CMG제약  58820  2013/12         25.02         -1.80           1.65   
4  (주)CMG제약  58820  2014/12         11.96          3.87           0.75   

   총자본정상영업이익률(IFRS)  CASH FLOW 대 부채비율(IFRS)  CASH FLOW 대 총자본비율(IFRS)  \
0            -30.38                  -69.69                   -21.79   
1              2.63                  -45.76                   -15.38   
2             -9.57                   -7.76                    -3.46   
3              1.46                  -38.65                    -4.78   
4              0.78                  -17.44                    -2.72   

   총자본회전률(IFRS)  자기자본회전률(IFRS)  총자본투자효율(IFRS)  설비투자효율(IFRS)  
0          0.46           0.82           2.98          7.74 

In [5]:
# Preview the first 10 rows of the sorted frame, including the derived and gap columns.
df_sorted.head(10)

Unnamed: 0,회사명,거래소코드,회계년도,매출액증가율(IFRS),총자본증가율(IFRS),매출액순이익률(IFRS),총자본정상영업이익률(IFRS),CASH FLOW 대 부채비율(IFRS),CASH FLOW 대 총자본비율(IFRS),총자본회전률(IFRS),자기자본회전률(IFRS),총자본투자효율(IFRS),설비투자효율(IFRS),매출액순이익률_diff,매출액순이익률_ratio,Top_10_Gap_Diff,Top_10_Gap_Ratio,Average_Gap_Diff,Average_Gap_Ratio
0,(주)CMG제약,58820,2010/12,11.86,17.25,-61.19,-30.38,-69.69,-21.79,0.46,0.82,2.98,7.74,,,-774.493933,-0.085784,-59.355076,33.347429
1,(주)CMG제약,58820,2011/12,56.4,2.89,-7.12,2.63,-45.76,-15.38,0.65,0.97,13.32,38.11,54.07,-0.883641,-720.423933,-0.009982,-5.285076,3.88027
2,(주)CMG제약,58820,2012/12,-8.43,106.87,-42.4,-9.57,-7.76,-3.46,0.38,0.65,-0.15,-0.95,-35.28,4.955056,-755.703933,-0.059442,-40.565076,23.107224
3,(주)CMG제약,58820,2013/12,25.02,-1.8,1.65,1.46,-38.65,-4.78,0.36,0.5,12.42,77.44,44.05,-1.038915,-711.653933,0.002313,3.484924,-0.89922
4,(주)CMG제약,58820,2014/12,11.96,3.87,0.75,0.78,-17.44,-2.72,0.4,0.46,12.15,37.82,-0.9,-0.545455,-712.553933,0.001051,2.584924,-0.408736
5,(주)CMG제약,58820,2015/12,18.85,-2.64,-13.49,-4.48,16.16,2.78,0.47,0.56,7.63,24.22,-14.24,-18.986667,-726.793933,-0.018912,-11.655076,7.351803
6,(주)ES큐브,50120,2010/12,22.59,1.32,-6.25,1.22,78.83,9.46,0.73,0.84,12.09,362.31,,,-719.553933,-0.008762,-4.415076,3.406136
7,(주)ES큐브,50120,2011/12,20.38,-5.43,-7.03,-4.47,49.77,6.7,0.9,1.03,17.6,565.75,-0.78,0.1248,-720.333933,-0.009856,-5.195076,3.831221
8,(주)ES큐브,50120,2012/12,5.94,-1.23,1.17,1.44,65.12,6.99,0.98,1.12,26.0,832.22,8.2,-1.16643,-712.133933,0.00164,3.004924,-0.637629
9,(주)ES큐브,50120,2013/12,3.72,6.34,-32.19,0.47,120.2,9.61,0.99,1.1,-12.05,-625.54,-33.36,-28.512821,-745.493933,-0.045128,-30.355076,17.542961


In [None]:
# Save the sorted frame together with its derived columns.
# to_csv is a DataFrame method; pandas has no module-level pd.to_csv, so the
# previous call raised AttributeError. index=False leaves out the row labels.
# NOTE(review): the output file name was chosen here; rename it as needed.
df_sorted.to_csv('cleaned_111_with_gap.csv', index=False)

In [2]:
# Reload both files; each is read with the cp949 encoding.
csv_encoding = 'cp949'
df1_updated = pd.read_csv('kosdaq_2012_1parm.csv', encoding=csv_encoding)
df2_updated = pd.read_csv('kosdaq_2012_1parm_live.csv', encoding=csv_encoding)

# Collect the distinct company names found in each file.
companies_1_updated = set(df1_updated['회사명'])
companies_2_updated = set(df2_updated['회사명'])

# Company names that occur in the first file but not in the second.
unique_companies_in_1_not_in_2_updated = companies_1_updated.difference(companies_2_updated)

# Display the result.
unique_companies_in_1_not_in_2_updated

set()

In [6]:
# Look up the rows whose company name is '(주)맘스터치앤컴퍼니' in each frame.
target_company = '(주)맘스터치앤컴퍼니'
moms_touch_and_company_df1 = df1_updated.loc[df1_updated['회사명'] == target_company]
moms_touch_and_company_df2 = df2_updated.loc[df2_updated['회사명'] == target_company]

# Display both selections.
moms_touch_and_company_df1, moms_touch_and_company_df2

(Empty DataFrame
 Columns: [회사명, 거래소코드, 회계년도, 매출액증가율(IFRS), 매출액순이익률(IFRS), 총자본투자효율(IFRS), CASH FLOW 대 부채비율(IFRS), 총자본회전률(IFRS)]
 Index: [],
 Empty DataFrame
 Columns: [회사명, 거래소코드, 회계년도, 매출액증가율(IFRS), 매출액순이익률(IFRS), 총자본투자효율(IFRS), CASH FLOW 대 부채비율(IFRS), 총자본회전률(IFRS)]
 Index: [])

In [3]:
# Summarize the first frame: column names, non-null counts and dtypes.
df1_updated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 804 entries, 0 to 803
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   회사명                     804 non-null    object 
 1   거래소코드                   804 non-null    int64  
 2   회계년도                    804 non-null    object 
 3   매출액증가율(IFRS)            797 non-null    float64
 4   매출액순이익률(IFRS)           797 non-null    float64
 5   총자본투자효율(IFRS)           797 non-null    float64
 6   CASH FLOW 대 부채비율(IFRS)  797 non-null    float64
 7   총자본회전률(IFRS)            797 non-null    float64
dtypes: float64(5), int64(1), object(2)
memory usage: 50.4+ KB


In [4]:
# Summarize the second frame: column names, non-null counts and dtypes.
df2_updated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 804 entries, 0 to 803
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   회사명                     804 non-null    object 
 1   거래소코드                   804 non-null    int64  
 2   회계년도                    804 non-null    object 
 3   매출액증가율(IFRS)            797 non-null    float64
 4   매출액순이익률(IFRS)           797 non-null    float64
 5   총자본투자효율(IFRS)           797 non-null    float64
 6   CASH FLOW 대 부채비율(IFRS)  797 non-null    float64
 7   총자본회전률(IFRS)            797 non-null    float64
dtypes: float64(5), int64(1), object(2)
memory usage: 50.4+ KB
