## 구글 코랩 사용시 구글 드라이브 연결 사용

In [1]:
# Mount Google Drive so files stored there are reachable from the Colab runtime
from google.colab import drive
drive.mount('/content/gdrive')

# Dataset root inside the mounted drive -> change this to match your own Drive layout
DATA_PATH = '/content/gdrive/MyDrive/미세먼지예측공모전/dataset/'

Mounted at /content/gdrive


## Import & Install

In [2]:
# Install the extra library this notebook needs (written for a Colab runtime)

!pip install haversine

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting haversine
  Downloading haversine-2.8.0-py2.py3-none-any.whl (7.7 kB)
Installing collected packages: haversine
Successfully installed haversine-2.8.0


In [3]:
#Base & visualization
import os
import glob
import random
import pandas as pd
import numpy as np
import warnings
import matplotlib.pylab as plt
import seaborn as sns

#Feature engineering
import datetime
from haversine import haversine

## Fix Seed

In [4]:
# Seed configuration
class CFG:
    """Notebook-wide configuration constants."""
    SEED = 42  # value handed to seed_everything

def seed_everything(seed):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, and stores
    the seed (as text) in the ``PYTHONHASHSEED`` environment variable.

    NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    setting it here presumably only affects processes started afterwards --
    confirm if hash randomisation matters for this pipeline.

    Args:
        seed: Seed value applied to every generator.
    """
    seed_text = str(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = seed_text
    np.random.seed(seed)
seed_everything(CFG.SEED) # Fix the seed

## Load Data

In [5]:
# Directories holding the raw CSV files, listed in the order they are unpacked below
folders = [DATA_PATH + "/TRAIN/", DATA_PATH + "/TRAIN_AWS/", DATA_PATH + "/TEST_INPUT/", DATA_PATH + "/TEST_AWS/"]

# Collects one concatenated DataFrame per folder
dfs = []

# Read every CSV file in each folder and stack them into a single frame
for folder in folders:
    # os.listdir returns names in arbitrary order; sorting keeps the row order of
    # the concatenated frame reproducible across runs and machines
    csv_files = sorted(os.path.join(folder, f) for f in os.listdir(folder) if f.endswith('.csv'))
    dfs.append(pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True))

# Unpack the per-folder frames (same order as `folders`)
train_df, train_aws_df, test_input_df, test_aws_df = dfs

In [6]:
# Station metadata tables; later cells use their Location / Latitude / Longitude columns
awsmap = pd.read_csv(DATA_PATH + '/META/awsmap.csv')
pmmap = pd.read_csv(DATA_PATH + '/META/pmmap.csv')

In [7]:
# Sample answer file (presumably the submission template -- confirm)
submit = pd.read_csv(DATA_PATH + 'answer_sample.csv')

## Train

In [8]:
# Put rows in the same order as answer_sample: station, then year, then timestamp
# (ascending is the sort_values default for every key)
train_df = train_df.sort_values(by=['측정소', '연도', '일시'])
train_df = train_df.reset_index(drop=True)

In [9]:
# Linearly interpolate missing values in train_aws_df, separately for each AWS station ('지점')
# NOTE(review): interpolation follows the current row order within each station -- confirm rows are time-ordered
train_aws_df = train_aws_df.groupby(['지점'], group_keys=False).apply(lambda x: x.interpolate(method='linear'))

In [10]:
# Linearly interpolate missing values in train_df, separately for each (year, PM station) group
train_df = train_df.groupby(['연도', '측정소'], group_keys=False).apply(lambda x: x.interpolate(method='linear'))

In [11]:
# Fill the values interpolation left missing with the next valid value (back-fill).
# Series.bfill() is the direct replacement for fillna(method='bfill'), whose
# `method` argument is deprecated in recent pandas releases.
train_df['PM2.5'] = train_df['PM2.5'].bfill()

In [12]:
# Attach station coordinates; iloc[:, :-1] passes every metadata column except the last
pm_coords = pmmap.iloc[:,:-1]
aws_coords = awsmap.iloc[:,:-1]
train_df = train_df.merge(pm_coords, how='left', left_on='측정소', right_on='Location')
train_aws_df = train_aws_df.merge(aws_coords, how='left', left_on='지점', right_on='Location')

In [13]:
# Use the haversine package to find, for each PM station, the AWS station with the
# smallest great-circle distance (km) and store its Location in the '지점' column.
# The search runs once per distinct PM station instead of once per row: the
# coordinates come from the pmmap merge, so every row of a station is assumed to
# carry the same Latitude/Longitude (TODO confirm pmmap has one row per Location).
pm_sites = train_df[['측정소', 'Latitude', 'Longitude']].drop_duplicates(subset='측정소')
aws_sites = list(zip(awsmap['Location'], awsmap['Latitude'], awsmap['Longitude']))
nearest_aws = {
    # min() keeps the first candidate on ties, matching np.argmin
    site: min(aws_sites, key=lambda aws: haversine((lat, lon), (aws[1], aws[2]), unit='km'))[0]
    for site, lat, lon in zip(pm_sites['측정소'], pm_sites['Latitude'], pm_sites['Longitude'])
}
train_df['지점'] = train_df['측정소'].map(nearest_aws)

In [14]:
# Left-join the weather readings of the matched AWS station onto each PM row
train = train_df.merge(
    train_aws_df[['연도', '일시', '지점', '기온(°C)', '풍향(deg)', '풍속(m/s)', '강수량(mm)', '습도(%)']],
    how='left',
    on=['연도', '일시', '지점'],
)

In [15]:
# Drop the helper columns Location, Latitude, Longitude and '지점'
train = train.drop(['Location', 'Latitude', 'Longitude', '지점'], axis=1)

# Fix the column order, with PM2.5 moved to the last position
train = train.reindex(columns=['연도', '일시', '측정소', '기온(°C)', '풍향(deg)', '풍속(m/s)', '강수량(mm)', '습도(%)', 'PM2.5'])

In [16]:
train

Unnamed: 0,연도,일시,측정소,기온(°C),풍향(deg),풍속(m/s),강수량(mm),습도(%),PM2.5
0,0,01-01 00:00,공주,0.173776,0.201944,0.023018,0.0,0.828,0.056
1,0,01-01 01:00,공주,0.176935,0.168611,0.030691,0.0,0.831,0.060
2,0,01-01 02:00,공주,0.180095,0.087222,0.033248,0.0,0.784,0.068
3,0,01-01 03:00,공주,0.178515,0.087222,0.025575,0.0,0.745,0.060
4,0,01-01 04:00,공주,0.164297,0.113889,0.020460,0.0,0.750,0.068
...,...,...,...,...,...,...,...,...,...
596083,3,12-31 19:00,홍성읍,0.273302,0.832222,0.086957,0.0,0.671,0.060
596084,3,12-31 20:00,홍성읍,0.271722,0.831667,0.043478,0.0,0.692,0.052
596085,3,12-31 21:00,홍성읍,0.268562,0.832500,0.066496,0.0,0.706,0.044
596086,3,12-31 22:00,홍성읍,0.262243,0.866944,0.043478,0.0,0.725,0.052


## Test

In [17]:
# Put rows in the same order as answer_sample: station, then year, then timestamp
# (ascending is the sort_values default for every key)
test_input_df = test_input_df.sort_values(by=['측정소', '연도', '일시'])
test_input_df = test_input_df.reset_index(drop=True)

In [18]:
# Drop rows later than 11-16 23:00
# NOTE(review): this is a string comparison on '일시'; it assumes zero-padded 'MM-DD HH:MM' text -- confirm
test_input_df = test_input_df[test_input_df['일시'] <= '11-16 23:00'].reset_index(drop=True)

In [19]:
# Attach station coordinates; iloc[:, :-1] passes every metadata column except the last
pm_coords = pmmap.iloc[:,:-1]
aws_coords = awsmap.iloc[:,:-1]
test_input_df = test_input_df.merge(pm_coords, how='left', left_on='측정소', right_on='Location')
test_aws_df = test_aws_df.merge(aws_coords, how='left', left_on='지점', right_on='Location')

In [20]:
# Use the haversine package to find, for each PM station, the AWS station with the
# smallest great-circle distance (km) and store its Location in the '지점' column.
# The search runs once per distinct PM station instead of once per row: the
# coordinates come from the pmmap merge, so every row of a station is assumed to
# carry the same Latitude/Longitude (TODO confirm pmmap has one row per Location).
pm_sites = test_input_df[['측정소', 'Latitude', 'Longitude']].drop_duplicates(subset='측정소')
aws_sites = list(zip(awsmap['Location'], awsmap['Latitude'], awsmap['Longitude']))
nearest_aws = {
    # min() keeps the first candidate on ties, matching np.argmin
    site: min(aws_sites, key=lambda aws: haversine((lat, lon), (aws[1], aws[2]), unit='km'))[0]
    for site, lat, lon in zip(pm_sites['측정소'], pm_sites['Latitude'], pm_sites['Longitude'])
}
test_input_df['지점'] = test_input_df['측정소'].map(nearest_aws)

In [21]:
# Left-join the weather readings of the matched AWS station onto each PM row
test = test_input_df.merge(
    test_aws_df[['연도', '일시', '지점', '기온(°C)', '풍향(deg)', '풍속(m/s)', '강수량(mm)', '습도(%)']],
    how='left',
    on=['연도', '일시', '지점'],
)

In [22]:
# Drop the helper columns Location, Latitude, Longitude and '지점'
test = test.drop(['Location', 'Latitude', 'Longitude', '지점'], axis=1)

# Fix the column order, with PM2.5 moved to the last position
test = test.reindex(columns=['연도', '일시', '측정소', '기온(°C)', '풍향(deg)', '풍속(m/s)', '강수량(mm)', '습도(%)', 'PM2.5'])

In [23]:
test

Unnamed: 0,연도,일시,측정소,기온(°C),풍향(deg),풍속(m/s),강수량(mm),습도(%),PM2.5
0,4,01-01 00:00,공주,0.244866,0.123333,0.038363,0.0,0.647,0.060
1,4,01-01 01:00,공주,0.232227,0.167778,0.033248,0.0,0.648,0.064
2,4,01-01 02:00,공주,0.206951,0.000000,0.002558,0.0,0.734,0.072
3,4,01-01 03:00,공주,0.199052,0.000000,0.002558,0.0,0.753,0.064
4,4,01-01 04:00,공주,0.189573,0.000000,0.002558,0.0,0.795,0.056
...,...,...,...,...,...,...,...,...,...
130555,4,11-16 19:00,홍성읍,,,,,,
130556,4,11-16 20:00,홍성읍,,,,,,
130557,4,11-16 21:00,홍성읍,,,,,,
130558,4,11-16 22:00,홍성읍,,,,,,


In [24]:
# Write the merged train/test tables to the dataset folder as UTF-8 CSV, without the index column
train.to_csv(DATA_PATH + 'train.csv', encoding='utf-8', index=False)
test.to_csv(DATA_PATH + 'test.csv', encoding='utf-8', index=False)

In [25]:
submit

Unnamed: 0,연도,일시,측정소,PM2.5
0,4,01-03 00:00,공주,
1,4,01-03 01:00,공주,
2,4,01-03 02:00,공주,
3,4,01-03 03:00,공주,
4,4,01-03 04:00,공주,
...,...,...,...,...
78331,4,11-16 19:00,홍성읍,
78332,4,11-16 20:00,홍성읍,
78333,4,11-16 21:00,홍성읍,
78334,4,11-16 22:00,홍성읍,
