In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw training and test tables from the local data directory.
train, test = (
    pd.read_csv('./data/rainfall_train.csv'),
    pd.read_csv('./data/rainfall_test.csv'),
)

## 1. Unnamed: 0 열 제거

In [3]:
# Remove the 'Unnamed: 0' column from both frames, modifying them in place.
for frame in (train, test):
    frame.drop(columns=['Unnamed: 0'], inplace=True)

## 2. 열 이름에서 'rainfall_train.', 'rainfall_test.' 접두사 제거

In [4]:
def remove_prefix(df, prefix):
    """Strip a literal leading ``prefix`` from every column name of ``df``.

    The prefix is matched literally: characters that are special in regular
    expressions (such as the ``.`` in ``'rainfall_train.'``) are escaped, so
    only column names that really start with ``prefix`` are renamed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are rewritten in place.
    prefix : str
        Text to remove from the start of each column name.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    import re  # local import so the function does not rely on another cell

    # Anchor at the start of the name and escape the prefix so it is not
    # interpreted as a pattern.
    df.columns = df.columns.str.replace(f'^{re.escape(prefix)}', '', regex=True)
    return df

# Strip the file-specific prefix ('rainfall_train.' / 'rainfall_test.') from the column names.
train = remove_prefix(df=train, prefix='rainfall_train.')
test = remove_prefix(df=test, prefix='rainfall_test.')

## 3. train 에서 실강수량 -999인 행 삭제

- `vv`(실강수량) 값이 -999인 8490개 행을 삭제

In [5]:
# Shape of the subset of rows whose 'vv' value equals the -999 sentinel.
train.loc[train['vv'].eq(-999)].shape

(8490, 21)

In [6]:
# Drop, in place, every row whose 'vv' value equals the -999 sentinel.
sentinel_rows = train.index[train['vv'] == -999]
train.drop(index=sentinel_rows, inplace=True)

## 4. 연도 매핑 및 발표 시각, 예측 시각 datetime 형식 변환

In [7]:
# Lookup table used to convert the year code letters into calendar years.
year_mapping = dict(A=2021, B=2022, C=2023, D=2024)

In [8]:
# Convert the issue (forecast) time and the target (prediction) time to datetime columns.
# Map the year codes first, but validate before touching the frame: codes that are
# missing from year_mapping map to NaN, which is also what happens when this cell is
# executed a second time on already-converted years.
mapped_years = {col: train[col].map(year_mapping) for col in ('fc_year', 'ef_year')}
unmapped_cols = [col for col, years in mapped_years.items() if years.isna().any()]
if unmapped_cols:
    raise ValueError(
        f"train: year codes not found in year_mapping for columns {unmapped_cols} "
        "(was this cell already executed?)"
    )
for col, years in mapped_years.items():
    train[col] = years

# Assemble each timestamp from its numeric year/month/day/hour components instead of
# joining strings row by row and re-parsing them.
for prefix, time_col in (('fc', 'forecast_time'), ('ef', 'prediction_time')):
    components = pd.DataFrame(
        {unit: train[f'{prefix}_{unit}'] for unit in ('year', 'month', 'day', 'hour')}
    )
    train[time_col] = pd.to_datetime(components)

In [9]:
# Convert the issue (forecast) time and the target (prediction) time to datetime columns.
# Map the year codes first, but validate before touching the frame: codes that are
# missing from year_mapping map to NaN, which is also what happens when this cell is
# executed a second time on already-converted years.
mapped_years = {col: test[col].map(year_mapping) for col in ('fc_year', 'ef_year')}
unmapped_cols = [col for col, years in mapped_years.items() if years.isna().any()]
if unmapped_cols:
    raise ValueError(
        f"test: year codes not found in year_mapping for columns {unmapped_cols} "
        "(was this cell already executed?)"
    )
for col, years in mapped_years.items():
    test[col] = years

# Assemble each timestamp from its numeric year/month/day/hour components instead of
# joining strings row by row and re-parsing them.
for prefix, time_col in (('fc', 'forecast_time'), ('ef', 'prediction_time')):
    components = pd.DataFrame(
        {unit: test[f'{prefix}_{unit}'] for unit in ('year', 'month', 'day', 'hour')}
    )
    test[time_col] = pd.to_datetime(components)

## 5. 개별 예측 확률 변수 추가

In [10]:
# Derive per-class values from the v01..v09 columns by differencing neighbours.
# NOTE(review): v01..v09 look like cumulative probabilities in percent — confirm.
train['v00_ind'] = 100 - train['v01']
for k in range(1, 9):
    # v0k_ind = v0k - v0(k+1), for k = 1..8
    train[f'v0{k}_ind'] = train[f'v0{k}'] - train[f'v0{k + 1}']
train['v09_ind'] = train['v09']

In [11]:
# Derive per-class values from the v01..v09 columns by differencing neighbours.
# NOTE(review): v01..v09 look like cumulative probabilities in percent — confirm.
test['v00_ind'] = 100 - test['v01']
for k in range(1, 9):
    # v0k_ind = v0k - v0(k+1), for k = 1..8
    test[f'v0{k}_ind'] = test[f'v0{k}'] - test[f'v0{k + 1}']
test['v09_ind'] = test['v09']

## 6. 개별 예측 확률을 이용한 실강수량 기댓값 변수 추가

In [12]:
# Weighted sum of the per-class columns, scaled by 0.01.
# NOTE(review): the weights look like a representative rainfall amount per class — confirm.
class_weights = (
    ('v00_ind', 0.05), ('v01_ind', 0.15), ('v02_ind', 0.35), ('v03_ind', 0.75),
    ('v04_ind', 1.5), ('v05_ind', 3.5), ('v06_ind', 7.5), ('v07_ind', 15),
    ('v08_ind', 25), ('v09_ind', 30),
)
weighted_total = None
for column, weight in class_weights:
    term = weight * train[column]
    # Accumulate strictly left to right so the floating-point result matches a chained sum.
    weighted_total = term if weighted_total is None else weighted_total + term
train['v_expect'] = weighted_total * 0.01

In [13]:
# Weighted sum of the per-class columns, scaled by 0.01.
# NOTE(review): the weights look like a representative rainfall amount per class — confirm.
class_weights = (
    ('v00_ind', 0.05), ('v01_ind', 0.15), ('v02_ind', 0.35), ('v03_ind', 0.75),
    ('v04_ind', 1.5), ('v05_ind', 3.5), ('v06_ind', 7.5), ('v07_ind', 15),
    ('v08_ind', 25), ('v09_ind', 30),
)
weighted_total = None
for column, weight in class_weights:
    term = weight * test[column]
    # Accumulate strictly left to right so the floating-point result matches a chained sum.
    weighted_total = term if weighted_total is None else weighted_total + term
test['v_expect'] = weighted_total * 0.01

## 7. basis_index 추가

- pk 생성(추후 시계열 모델 생성 시 정렬에 사용 예정)

In [14]:
# Build a key of the form "<prediction_time>_<stn4contest>" on both frames.
for frame in (train, test):
    time_text = frame['prediction_time'].astype(str)
    frame['basis_index'] = time_text + "_" + frame['stn4contest']

### Data Save

In [19]:
# Write both preprocessed frames to CSV without the index, encoded as UTF-8 with BOM.
for frame, out_path in ((train, 'train_pre.csv'), (test, 'test_pre.csv')):
    frame.to_csv(out_path, index=False, encoding='UTF-8-sig')