In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import LSTM, Dense
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

import os

In [19]:
# 읽고자 하는 폴더의 경로
folder_path = './makeDataset/dataset/processedData/'

# 폴더 내의 모든 파일과 폴더 목록을 가져옴
file_names = os.listdir(folder_path)

result = dict()

for file_name in file_names:
    df = pd.read_csv(folder_path + file_name)
    
    df = df.drop(['행정구역(시도)','행정구역(시군구)','연도', '행정구역', 'Unnamed: 0', '총인구수-학령인구(명)'], axis=1)
    
    corrResult = df.corr()['학령인구(명)']
    
    result.update({file_name:corrResult})


In [35]:
# result dictionary를 dataframe으로 변환

result_df = pd.DataFrame(result)

In [36]:
result_df = result_df.T

In [37]:
result_df.head()

Unnamed: 0,총인구수(명),유치원 수,초등학교 수,출생건수,사망건수,혼인건수,이혼건수,학령인구(명)
76_0.3.csv,0.784826,0.80594,0.731358,0.763676,0.726729,0.755446,0.781019,1.0
66_0.3.csv,0.785396,0.80299,0.725554,0.765101,0.727829,0.757295,0.782574,1.0
6_0.6.csv,0.925205,0.851072,0.805365,0.911414,0.837669,0.913185,0.905101,1.0
41_0.4.csv,0.832888,0.811339,0.745193,0.813984,0.771053,0.808338,0.828386,1.0
51_0.4.csv,0.838864,0.832437,0.764632,0.819265,0.778887,0.811638,0.832565,1.0


In [41]:
result_df['corr_avg'] = result_df.drop('학령인구(명)', axis=1).mean(axis=1)

In [42]:
result_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 288 entries, 76_0.3.csv to 11_0.3.csv
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   총인구수(명)   288 non-null    float64
 1   유치원 수     288 non-null    float64
 2   초등학교 수    288 non-null    float64
 3   출생건수      288 non-null    float64
 4   사망건수      288 non-null    float64
 5   혼인건수      288 non-null    float64
 6   이혼건수      288 non-null    float64
 7   학령인구(명)   288 non-null    float64
 8   corr_avg  288 non-null    float64
dtypes: float64(9)
memory usage: 30.6+ KB


In [44]:
result_df.head()

Unnamed: 0,총인구수(명),유치원 수,초등학교 수,출생건수,사망건수,혼인건수,이혼건수,학령인구(명),corr_avg
76_0.3.csv,0.784826,0.80594,0.731358,0.763676,0.726729,0.755446,0.781019,1.0,0.764142
66_0.3.csv,0.785396,0.80299,0.725554,0.765101,0.727829,0.757295,0.782574,1.0,0.76382
6_0.6.csv,0.925205,0.851072,0.805365,0.911414,0.837669,0.913185,0.905101,1.0,0.87843
41_0.4.csv,0.832888,0.811339,0.745193,0.813984,0.771053,0.808338,0.828386,1.0,0.801597
51_0.4.csv,0.838864,0.832437,0.764632,0.819265,0.778887,0.811638,0.832565,1.0,0.811184


In [45]:
# 최대 corr_avg와 최소 corr_avg가 나오는 행을 구함

result_df['corr_avg'].idxmax()

'1_0.8.csv'

In [46]:
result_df['corr_avg'].idxmin()

'31_0.1.csv'

In [47]:
result_df.loc['1_0.8.csv']

총인구수(명)     0.954494
유치원 수       0.885702
초등학교 수      0.847189
출생건수        0.942204
사망건수        0.867748
혼인건수        0.941170
이혼건수        0.937451
학령인구(명)     1.000000
corr_avg    0.910851
Name: 1_0.8.csv, dtype: float64

In [48]:
rawDF = pd.read_csv('./makeDataset/dataset/no_NaN_dataset_final.csv')

In [51]:
rawDF.drop(['행정구역(시도)','행정구역(시군구)','연도', '행정구역', '총인구수-학령인구(명)'], axis=1).corr()['학령인구(명)']

총인구수(명)    0.979178
유치원 수      0.905859
초등학교 수     0.864143
출생건수       0.963695
사망건수       0.890953
혼인건수       0.958059
이혼건수       0.966135
학령인구(명)    1.000000
Name: 학령인구(명), dtype: float64