In [22]:
import pandas as pd 
import os
from typing import List

def merge_func_list(file_path: str, file_name: str) -> List[pd.DataFrame]:
    """
    지정된 경로에서 특정 파일 이름이 포함된 모든 Excel 파일을 찾아서 하나의 DataFrame으로 합칩니다.
    
    Parameters:
    file_path (str): 검색할 디렉토리 경로
    file_name (str): 찾을 파일 이름의 일부
    
    Returns:
    pd.DataFrame: 합쳐진 DataFrame
    """
    df_total = []
    
    try:
        # 디렉토리 내의 모든 파일 검색
        for file in os.listdir(file_path):
            # 파일이 Excel 파일이고 지정된 이름을 포함하는지 확인
            if (file.endswith('.xlsx') or file.endswith('.xls')) and file_name.lower() in file.lower():
                # 전체 파일 경로 생성
                full_path = os.path.join(file_path, file)
                
                try:
                    # Excel 파일 읽기
                    df_xlsx = pd.read_excel(io=full_path, header=2)
                    
                    # DataFrame 합치기
                    # df_total = pd.concat([df_total, df_xlsx], axis=0, ignore_index=True)
                    df_total.append(df_xlsx)

                except Exception as e:
                    print(f"파일 '{file}' 읽기 중 오류 발생: {str(e)}")
                    continue
        

        return df_total
    
    except Exception as e:
        print(f"오류 발생: {str(e)}")
        return pd.DataFrame()

In [23]:
df_data_list = merge_func_list(f"../datasets/", f"_지수")
print(len(df_data_list))


4


  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


In [29]:
df_data_list[2].head()

Unnamed: 0.1,Unnamed: 0,202001월,202002월,202003월,202004월,202005월,202006월,202007월,202008월,202009월,...,202306월,202307월,202308월,202309월,202310월,202311월,202312월,202401월,202402월,202403월
0,소비자물가,1.2,0.9,0.8,0.0,-0.2,0.2,0.4,0.8,0.9,...,2.7,2.4,3.4,3.7,3.8,3.3,3.2,2.8,3.1,3.1
1,- 농축수산물,1.3,-0.7,3.3,1.0,3.6,6.1,8.8,12.3,12.8,...,0.6,0.1,3.2,4.4,8.0,7.2,7.7,8.0,11.4,11.7
2,- 공업제품,2.3,2.2,1.3,-0.5,-1.8,-1.2,-0.5,-0.5,-0.8,...,0.3,0.1,2.6,3.4,3.6,2.5,2.1,1.8,2.1,2.2
3,- 집세,-0.1,-0.1,0.1,0.0,0.1,0.2,0.2,0.3,0.4,...,0.5,0.3,0.2,0.1,0.0,0.0,-0.1,-0.2,-0.1,0.0
4,- 공공서비스,-1.6,-1.6,-0.5,-1.6,-1.8,-1.8,-1.8,-1.7,-0.7,...,0.8,1.0,1.5,1.6,2.0,2.1,1.9,2.2,2.0,2.0


In [33]:
for df in df_data_list:    
    # '출처'가 포함된 행의 인덱스 찾기
    source_indices = df.apply(lambda x: x.astype(str).str.contains('출처', na=False)).any(axis=1)
    
    if source_indices.any():
        # '출처'가 처음 나타나는 행의 인덱스
        first_source_idx = source_indices[source_indices].index[0]
        # 해당 행 이전까지의 데이터만 선택
        df = df.iloc[:first_source_idx]

In [35]:
df_data_list[0].head()

Unnamed: 0.1,Unnamed: 0,20201/4,20202/4,20203/4,20204/4,20211/4,20212/4,20213/4,20214/4,20221/4,20222/4,20223/4,20224/4,20231/4,20232/4,20233/4,20234/4,20241/4,20242/4,20243/4
0,국내총생산(명목GDP),489061.0,502861.3,524939.5,541604.8,515116.3,548739.9,568371.7,589685.1,553664.8,581793.9,588594.0,599728.8,565099.2,592222.8,611097.6,632769.8,606213.9,634718.4,641917.7
1,경제성장률(실질GDP성장률),1.2,-2.6,-0.8,-0.5,2.5,7.0,4.6,4.3,3.5,3.0,3.4,1.1,1.1,1.0,1.4,2.1,3.3,2.3,1.5
2,출처:,한국은행「국민소득」,,,,,,,,,,,,,,,,,,
3,주석:,"* 국민총생산(명목, 시장가격)",,,,,,,,,,,,,,,,,,
4,,"* 실질GDP, 실질성장률은 발표시기(한국은행, GDP속보치 발표)와 명목GDP, ...",,,,,,,,,,,,,,,,,,


In [31]:
df_data_transe_list = []
for df in df_data_list:
    df_data_transe_list.append(df.transpose())

for df in df_data_transe_list:
     print(df.head())

                       0                1           2                  3  \
Unnamed: 0  국내총생산(명목GDP)  경제성장률(실질GDP성장률)         출처:                주석:   
20201/4        489,061.0              1.2  한국은행「국민소득」  * 국민총생산(명목, 시장가격)   
20202/4        502,861.3             -2.6         NaN                NaN   
20203/4        524,939.5             -0.8         NaN                NaN   
20204/4        541,604.8             -0.5         NaN                NaN   

                                                            4  
Unnamed: 0                                                NaN  
20201/4     * 실질GDP, 실질성장률은 발표시기(한국은행, GDP속보치 발표)와 명목GDP, ...  
20202/4                                                   NaN  
20203/4                                                   NaN  
20204/4                                                   NaN  
                      0       1       2  \
Unnamed: 0  건설투자(국내총생산)  국내건설수주  건축허가면적   
20201/4             5.5    -1.0    -5.1   
20202/4             2.2    26.

In [32]:
df_data_transe_list[2].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Unnamed: 0,소비자물가,- 농축수산물,- 공업제품,- 집세,- 공공서비스,- 개인서비스,근원물가,생활물가,출처:,주석:
202001월,1.2,1.3,2.3,-0.1,-1.6,1.7,0.7,1.7,통계청「소비자물가지수」,2020년을 100으로 458개 조사항목을 가중평균하여 매월 지수를 산출 발표
202002월,0.9,-0.7,2.2,-0.1,-1.6,1.1,0.5,1.2,,
202003월,0.8,3.3,1.3,0.1,-0.5,0.5,0.5,1.4,,
202004월,0.0,1.0,-0.5,0.0,-1.6,1.0,0.3,0.2,,


In [13]:
import matplotlib.pyplot as plt
import matplotlib as mpl # 한글 폰트 설정 (NanumGothic) 
mpl.rcParams['font.family'] = 'NanumGothic'
mpl.rcParams['axes.unicode_minus'] = False  # 마이너스 기호 깨짐 방지

In [None]:
fig, axes = plt.subplots(len(df_data_transe_list),1,figsize=(10,20), dpi=100) # 외곽 사이즈

fig.suptitle('국가별 kospi와 종가 비교 (1990~)')

#kospi_close = serise_global_close_list[0]
for num, (global_index, global_name) in enumerate(zip(serise_global_close_list[1:], global_name_list[1:])):
    axes[num].plot(kospi_close.index, kospi_close, label='kospi' )
    axes[num].plot(global_index.index, global_index, label=global_name)
    # axes[num].set_title('axes first')
    axes[num].set_xlabel('년도')
    axes[num].set_ylabel('지수')
    axes[num].legend()


# plt.tight_layout()
plt.show()