In [11]:
import pandas as pd

# Load the base fine-dust dataset and print its text preview.
df = pd.read_csv(filepath_or_buffer='finedust_basic_data.csv')
print(df)

               lat         lon  year  month  day  hour  week    no2     o3  \
0        37.571625  127.042142  2013      1    1    22     1  0.044  0.001   
1        37.599969  126.931242  2013      1    1    22     1  0.052  0.002   
2        37.449108  126.904197  2013      1    1    22     1  0.035  0.012   
3        37.560611  127.039000  2013      1    1    22     1  0.046  0.026   
4        37.527367  127.125864  2013      1    1    22     1  0.043  0.001   
...            ...         ...   ...    ...  ...   ...   ...    ...    ...   
2638896  37.603806  127.094778  2014      1    1     0     2  0.007  0.043   
2638897  37.509656  126.941575  2014      1    1     0     2  0.007  0.044   
2638898  37.523611  126.898342  2014      1    1     0     2  0.006  0.038   
2638899  37.492650  126.889597  2014      1    1     0     2  0.005  0.030   
2638900  37.560706  126.910531  2014      1    1     0     2  0.007  0.040   

          co  ...    hm    rn  sd_tot    ca_tot    ca_mid      

In [9]:
import pandas as pd

# Load the 2012 yearly weather file.
df = pd.read_csv(filepath_or_buffer='filtered_weather_yearly_2012.csv')

# Total number of rows: 8633
# Count, per column, how many cells equal -9.0 (the value the preprocessing
# cell replaces as "missing").
# NOTE(review): TA/TD/TS look like temperature columns, where -9.0 could be a
# genuine reading rather than a missing-value marker -- confirm before
# treating these counts as missing data.
df_null = df.eq(-9.0).sum()
print(df_null)

YYMMDDHHMI       0
WD               0
WS               0
TA              13
TD              14
HM               0
RN            7762
SD_TOT        7965
CA_TOT        1894
CA_MID        1894
VS            1894
TS              13
SI            3910
PS               0
PA               0
dtype: int64


In [7]:
# KMA (Korea Meteorological Administration) weather data preprocessing - creates kma_weather_ready.csv

import pandas as pd
import glob
import numpy as np

# Glob pattern matching one weather CSV per year.
file_path = 'filtered_weather_yearly_*.csv'

# glob.glob returns matches in a file-system dependent order; sorting the paths
# makes the concatenation order (and therefore the row labels produced by
# ignore_index=True) reproducible from run to run.
csv_files = sorted(glob.glob(file_path))
if not csv_files:
    # Without this guard pd.concat([]) fails with the opaque
    # "No objects to concatenate"; name the pattern that matched nothing instead.
    raise FileNotFoundError(f"No weather CSV files match pattern: {file_path!r}")

df_list = [pd.read_csv(file) for file in csv_files]
df_all = pd.concat(df_list, ignore_index=True)

# Parse the YYYYMMDDHHMM timestamp column and order the rows chronologically.
df_all['YYMMDDHHMI'] = pd.to_datetime(df_all['YYMMDDHHMI'], format='%Y%m%d%H%M')
df_all = df_all.sort_values(by='YYMMDDHHMI', ascending=True)

# Add year / month / day / hour / weekday columns derived from the timestamp.
# weekday: Monday=0 ... Sunday=6.
timestamp_parts = df_all['YYMMDDHHMI'].dt
df_all = df_all.assign(
    year=timestamp_parts.year,
    month=timestamp_parts.month,
    day=timestamp_parts.day,
    hour=timestamp_parts.hour,
    weekday=timestamp_parts.weekday,
)

# Final column layout: calendar fields first, then the weather measurements.
# The raw YYMMDDHHMI column is not listed, so it is dropped by the selection.
column_order = [
    'year', 'month', 'day', 'hour', 'weekday',
    'WD', 'WS', 'TA', 'TD', 'HM',
    'RN', 'SD_TOT', 'CA_TOT', 'CA_MID',
    'VS', 'TS', 'SI', 'PS', 'PA',
]

df_all = df_all[column_order]

# df_all.to_csv('weather_data_all.csv', index=False)  # save the combined df_all if needed

# Columns whose -9.0 entries are replaced by the column mean. Each mean is taken
# over the values that are not -9.0, and every mean is computed before any
# column is modified.
weather_mean = ['WD', 'WS', 'HM', 'CA_MID', 'VS', 'PS', 'PA']
mean_values = pd.Series(
    {col: df_all.loc[df_all[col] != -9.0, col].mean() for col in weather_mean}
)

# Columns whose -9.0 entries are replaced by 0.
weather_zero = ['RN', 'SD_TOT']

# One pass over "column -> replacement value": the means first, then the zeros.
fill_plan = mean_values.to_dict()
fill_plan.update(dict.fromkeys(weather_zero, 0))
for col, fill_value in fill_plan.items():
    df_all[col] = df_all[col].replace(-9.0, fill_value)

# Fill missing CA_TOT values (marked -9.0) with the mean for the same month.
# Per-month means are memoised so the full frame is scanned at most once per
# month instead of once per missing row. Re-running this cell resets the cache.
_ca_tot_month_means = {}


def fill_ca_tot(row):
    """Return the CA_TOT value to store for ``row``.

    Parameters
    ----------
    row : mapping-like
        Must provide ``row['CA_TOT']`` and ``row['month']``, e.g. a row handed
        over by ``df_all.apply(fill_ca_tot, axis=1)``.

    Returns
    -------
    scalar
        ``row['CA_TOT']`` unchanged when it is not -9.0. Otherwise the mean of
        ``CA_TOT`` over every row of the module-level ``df_all`` that has the
        same ``month`` value (all years pooled), ignoring -9.0 entries. The
        result is NaN when that month has no usable value.

    Notes
    -----
    The mean for a month is read from ``df_all`` the first time it is needed
    and then reused, so ``df_all['CA_TOT']`` must still hold the raw values
    (including the -9.0 markers) while this function is being applied.
    """
    if row['CA_TOT'] != -9.0:
        return row['CA_TOT']

    month = row['month']
    if month not in _ca_tot_month_means:
        same_month = df_all[df_all['month'] == month]
        _ca_tot_month_means[month] = same_month['CA_TOT'].replace(-9.0, np.nan).mean()
    return _ca_tot_month_means[month]

# Pass every row to fill_ca_tot and store the returned values back in CA_TOT.
df_all['CA_TOT'] = df_all.apply(fill_ca_tot, axis='columns')

# Memoised daytime SI means keyed by month (the month fixes the season and
# therefore the daytime window). Re-running this cell resets the cache.
_si_daytime_means = {}


def fill_si(row):
    """Return the SI value to store for ``row``.

    A value of -9.0 marks a missing SI. Missing values are filled according to
    the season and the hour of the row:

    * summer half (months 4-9): daytime is ``6 <= hour < 21``
    * winter half (all other months): daytime is ``8 <= hour < 19``

    Parameters
    ----------
    row : mapping-like
        Must provide ``row['SI']``, ``row['month']`` and ``row['hour']``, e.g.
        a row handed over by ``df_all.apply(fill_si, axis=1)``.

    Returns
    -------
    scalar
        ``row['SI']`` unchanged when it is not -9.0. For a missing daytime
        value, the mean of ``SI`` over the rows of the module-level ``df_all``
        with the same ``month`` and an hour inside that season's daytime
        window, ignoring -9.0 entries (NaN if there is none). For a missing
        value outside the daytime window, 0.

    Notes
    -----
    The mean for a month is read from ``df_all`` the first time it is needed
    and then reused, so ``df_all['SI']`` must still hold the raw values
    (including the -9.0 markers) while this function is being applied.
    """
    # Not missing: keep the recorded value.
    if row['SI'] != -9.0:
        return row['SI']

    month = row['month']
    # Summer half: April-September; winter half: October-March.
    first_hour, end_hour = (6, 21) if 4 <= month <= 9 else (8, 19)

    # Missing values outside the daytime window are filled with 0.
    if not (first_hour <= row['hour'] < end_hour):
        return 0

    if month not in _si_daytime_means:
        in_window = (
            (df_all['hour'] >= first_hour)
            & (df_all['hour'] < end_hour)
            & (df_all['month'] == month)
        )
        _si_daytime_means[month] = df_all[in_window]['SI'].replace(-9.0, np.nan).mean()
    return _si_daytime_means[month]

# Pass every row to fill_si and store the returned values back in SI.
df_all['SI'] = df_all.apply(fill_si, axis='columns')

# Lower-case every column label before exporting.
df_all.columns = [label.lower() for label in df_all.columns]

# Optional sanity check: count the -9.0 entries left in each column.
# df_null = (df_all == -9.0).sum()
# print (df_null)

# Write the cleaned table (without the index column), then print a text preview.
df_all.to_csv(path_or_buf='kma_weather_ready.csv', index=False)

print(df_all)

        year  month  day  hour  weekday    wd   ws   ta    td    hm   rn  \
6460    2012      1    1    23        6  32.0  2.7 -6.6 -11.5  68.0  0.0   
6461    2012      1    2     0        0  29.0  2.5 -6.8 -11.7  68.0  0.0   
6462    2012      1    2     1        0  29.0  1.0 -7.0 -11.3  71.0  0.0   
6463    2012      1    2     2        0  27.0  2.6 -7.3 -11.4  72.0  0.0   
6464    2012      1    2     3        0  27.0  2.4 -7.2 -11.0  74.0  0.0   
...      ...    ...  ...   ...      ...   ...  ...  ...   ...   ...  ...   
111126  2024     12   31    19        1  27.0  1.9  0.5  -9.8  46.0  0.0   
111127  2024     12   31    20        1  25.0  2.4  0.0  -9.7  48.0  0.0   
111128  2024     12   31    21        1  27.0  1.0 -0.7  -9.6  51.0  0.0   
111129  2024     12   31    22        1  32.0  0.7 -1.0  -9.3  53.0  0.0   
111130  2024     12   31    23        1  25.0  1.3 -1.4  -9.0  56.0  0.0   

        sd_tot    ca_tot    ca_mid           vs   ts   si      ps      pa  
6460       