## 1. Library Import & Set Random State

In [1]:
import random
import os
import warnings

import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
import datetime

import time
import holidays
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import pyproj
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.ensemble import  VotingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

mpl.rc('font', family = 'Gulim')
mpl.rcParams['axes.unicode_minus']=False
warnings.filterwarnings('ignore')

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42)

## 2. Data Pre-processing
### (1) Data Load and change column names

In [2]:
train = pd.read_csv("./data/electric_train.csv", index_col=0)
test = pd.read_csv("./data/electric_test.csv", index_col=0)
# validation dataset in Web site
sub = pd.read_csv("./data/electric_test2.csv")

# change column names in train data
train_col = list(train.columns)
train_col = [col_name.split(".")[1] for col_name in train_col]
train.columns = train_col

# change column names in test data
test_col = list(test.columns)
test_col = [col_name.split(".")[1] for col_name in test_col]
test.columns = test_col

### (2) Making Derived Varibles 1 related with time

In [3]:
# split 'tm' => "year", "month", "day", "date"
train['tm'] = pd.to_datetime(train['tm'])
train['year'] = train['tm'].dt.year
train['month'] = train['tm'].dt.month
train['day'] = train['tm'].dt.day
train['date'] = pd.to_datetime(train[['year', 'month', 'day']])

test['tm'] = pd.to_datetime(test['tm'])
test['year'] = test['tm'].dt.year
test['month'] = test['tm'].dt.month
test['day'] = test['tm'].dt.day
test['date'] = pd.to_datetime(test[['year', 'month', 'day']])

# add 'day_of_year'
def day_of_year(row):
    date = datetime.date(row['year'], row['month'], row['day'])
    start_of_year = datetime.date(row['year'], 1, 1)
    day_number = (date - start_of_year).days + 1
    return day_number

train['day_of_year'] = train.apply(day_of_year, axis=1)
test['day_of_year'] = test.apply(day_of_year, axis=1)

# add "rest_day" => weekend + holidays
korean_holidays = holidays.KR(years=[2020, 2021, 2022, 2023])
holiday_list = pd.to_datetime(list(korean_holidays.keys()))

def is_weekend(date):
    if date.weekday() >= 5:
        return 1
    else:
        return 0

train['rest_day'] = train.loc[:, 'date'].isin(holiday_list) | train.loc[:, 'date'].apply(is_weekend)
test['rest_day'] = test.loc[:, 'date'].isin(holiday_list) | test.loc[:, 'date'].apply(is_weekend)

# drop "week_name"
train.drop(columns='week_name', inplace=True)
test.drop(columns='week_name', inplace=True)

### (3) Making Derived Variables 2 for creating x, y grid

In [4]:
# (1,1) ~ (149, 1) , (1,1) ~ (1,253)
train['grid_x'] = train['num'] % 149 + 1
train['grid_y'] = train['num'] // 149 + 1

test['grid_x'] = test['num'] % 149 + 1
test['grid_y'] = test['num'] // 149 + 1

### (4) Making Derived Variables 3 for checking Metapolitan City/Province

In [5]:
def grid_to_location(x, y):
    # Define constants
    earth_radius = 6371.00877  # km

    # Define the LCC projection parameters
    true_lat1 = 30
    true_lat2 = 60
    ref_lat = 38
    ref_lon = 126
    grid_spacing = 5  # km
    x0 = 43
    y0 = 136

    # Define the LCC projection
    proj_lcc = pyproj.Proj(proj='lcc', lat_1=true_lat1, lat_2=true_lat2, lat_0=ref_lat, lon_0=ref_lon, R=earth_radius)

    # Calculate map coordinates (x, y in km)
    map_x = (x - x0) * grid_spacing
    map_y = (y - y0) * grid_spacing

    # Convert map coordinates to geographic coordinates
    lon, lat = proj_lcc(map_x, map_y, inverse=True)
    return lat, lon

def get_address(latitude, longitude, retries=5):
    geolocator = Nominatim(user_agent="South Korea")
    for i in range(retries):
        try:
            location = geolocator.reverse((latitude, longitude), exactly_one=True, timeout=10)
            if (address := location.raw['address']) == None:
                continue
            city = address.get('city', '') or address.get('county', '')
            province = address.get('province', '')
            return city, province
        except GeocoderTimedOut:
            if i < retries - 1:
                print(f"Timeout occurred. Retrying... ({i + 1}/{retries})")
                time.sleep(2)
            else:
                print("Geocoder service unavailable after multiple retries.")
                return None, None

# distinguish metapolitan city            
metro_city = ['서울특별시', '부산광역시', '대구광역시', '대전', '광주광역시', '울산광역시', '인천광역시', '세종특별자치시']
location_dict = {}

train_grid = list(set(zip(train['grid_x'], train['grid_y'])))
test_grid = list(set(zip(test['grid_x'], test['grid_y'])))

def location_name(grid_list):
    for cor in tqdm(grid_list):
        if cor == (2,17):
            cor_city = np.nan
            cor_province = np.nan
        else:       
            cor_x = cor[0]
            cor_y = cor[1]
            
            cor_lat, cor_lon = grid_to_location(cor_x, cor_y)
            cor_city, cor_province = get_address(cor_lat, cor_lon)
            
            if cor_city in metro_city:
                location = cor_city
                if cor_city == '대전': location = '대전광역시'
            elif cor_province:
                location = cor_province
            else:
                location = '경기도'
            location_dict[(cor_x, cor_y)] = location

location_name(train_grid)
location_name(test_grid)

def get_location_from_coordinates(row):
    coordinates = (row['grid_x'], row['grid_y'])
    return location_dict.get(coordinates, 'Unknown')

train['location'] = train.apply(get_location_from_coordinates, axis=1)
test['location'] = test.apply(get_location_from_coordinates, axis=1)

Counter(location_dict.values())

100%|██████████| 323/323 [07:24<00:00,  1.38s/it]
100%|██████████| 323/323 [07:27<00:00,  1.39s/it]


Counter({'경기도': 105,
         '경상북도': 30,
         '경상남도': 29,
         '부산광역시': 24,
         '서울특별시': 17,
         '전라남도': 17,
         '충청남도': 16,
         '대구광역시': 16,
         '충청북도': 15,
         '전북특별자치도': 13,
         '강원특별자치도': 13,
         '대전광역시': 12,
         '인천광역시': 11,
         '울산광역시': 9,
         '광주광역시': 8,
         '제주특별자치도': 5,
         '세종특별자치시': 3})

### (5) Labeling using the previous resident registration number area code

In [6]:
# Labeling using the previous resident registration number area code
location_num = {
    # 8 Province
    '경기도': 16,
    '강원특별자치도': 26,
    '충청남도': 41,
    '충청북도': 35,
    '전라남도': 55,
    '전북특별자치도': 48,
    '경상북도': 70,
    '경상남도': 82,
    '제주특별자치도': 93,
    # 8 Metrapolican city
    '서울특별시': 0,
    '인천광역시': 14,
    '세종특별자치시': 96,   
    '대전광역시': 40,
    '광주광역시': 65,
    '대구광역시': 67,
    '울산광역시': 85,
    '부산광역시': 9
    }

def get_region_number(location):
    return location_num.get(location, -1)

train['location_num'] = train['location'].apply(get_region_number)
test['location_num'] = test['location'].apply(get_region_number)

train.drop(columns='location', inplace=True)
test.drop(columns='location', inplace=True)

### (6) Making Making Derived Variables 4 related with Weather

In [7]:
# add ta_diff => calculate actual temperature - wind chill temperature
train['ta_diff'] = train['nph_ta'] - train['nph_ta_chi']
test['ta_diff'] = test['nph_ta'] - test['nph_ta_chi']

# add discomfort index
train["discomfort"] = (9 / 5) * train["nph_ta"] - 0.55 * (1 - (train["nph_hm"] / 100)) * ((9 / 5) * train["nph_ta"] - 26) + 32
test["discomfort"] = (9 / 5) * test["nph_ta"] - 0.55 * (1 - (test["nph_hm"] / 100)) * ((9 / 5) * test["nph_ta"] - 26) + 32

# generate the highest and lowest temparature variables for each grid
train_daily_temps = train.groupby(['num', 'date'])['nph_ta'].agg(['max', 'min']).reset_index()
train_daily_temps.columns = ['num', 'date', 'max_temp', 'min_temp']
train_daily_temps['date'] = pd.to_datetime(train_daily_temps['date'])
train['date'] = pd.to_datetime(train['date'])
train = pd.merge(train, train_daily_temps, on=['num', 'date'], how='left')

test_daily_temps = test.groupby(['num', 'date'])['nph_ta'].agg(['max', 'min']).reset_index()
test_daily_temps.columns = ['num', 'date', 'max_temp', 'min_temp']
test_daily_temps['date'] = pd.to_datetime(test_daily_temps['date'])
test['date'] = pd.to_datetime(test['date'])
test = pd.merge(test, test_daily_temps, on=['num', 'date'], how='left')

# add Cooling degree hours, Heating degree hours
def cdh(row):
    return max(0, row['nph_ta']-26)

def hdh(row):
    return max(0, 18-row['nph_ta'])

train['CDH_26'] = train.apply(cdh, axis=1)
train['HDH_18'] = train.apply(hdh, axis=1)

test['CDH_26'] = test.apply(cdh, axis=1)
test['HDH_18'] = test.apply(hdh, axis=1)

### (7) check and fill missing values

In [8]:
train_ws_nan = train[train['nph_ws_10m']==-99] 
train_elec_nan = train[train['elec']==-99]['num'] # only five rows have missing value is elec

test_ws_nan = test[test['nph_ws_10m']==-99]

train.replace(-99.0, np.nan, inplace=True)
test.replace(-99.0, np.nan, inplace=True)

# fill missing value using linear interpolation
train['nph_ws_10m'] = train['nph_ws_10m'].interpolate(method='linear')
test['nph_ws_10m'] = test['nph_ws_10m'].interpolate(method='linear')

train['elec'] = train['elec'].interpolate(method='linear')

train = train[['num', 'grid_x', 'grid_y', 'location_num', 'tm', 'date', 'year', 'month', 'day', 'day_of_year', 'weekday', 'hh24', 'rest_day', 'n', 'stn', 'sum_qctr', 'sum_load', 'n_mean_load', 'nph_ta', 'nph_hm', 'nph_ws_10m', 'nph_rn_60m', 'nph_ta_chi', 'ta_diff', 'max_temp', 'min_temp', 'discomfort', 'CDH_26', 'HDH_18', 'elec']]
test = test[['num', 'grid_x', 'grid_y', 'location_num', 'tm', 'date', 'year', 'month', 'day', 'day_of_year', 'weekday', 'hh24', 'rest_day', 'stn', 'nph_ta', 'nph_hm', 'nph_ws_10m', 'nph_rn_60m', 'nph_ta_chi', 'ta_diff', 'max_temp', 'min_temp', 'discomfort', 'CDH_26', 'HDH_18']]

train.shape, test.shape

((7593355, 30), (2829478, 25))

## 3. Modeling

In [9]:
train.drop(columns=['tm', 'date', 'day', 'n','sum_qctr', 'sum_load', 'n_mean_load'], inplace=True)
test.drop(columns=['tm', 'date', 'day'], inplace=True)

X = train.drop("elec", axis=1)
y = train[["elec"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

xg = XGBRegressor(random_state=42)
lg = LGBMRegressor(random_state=42, verbose=-1)
cat = CatBoostRegressor(random_state=42, silent=True, cat_features=[0, 3, 7, 9, 10])

model = VotingRegressor(estimators=[
    ("xg", xg),
    ("lg", lg),
    ("cat", cat)
])

model.fit(X_train, y_train)

## 4. Make file for Submission

In [10]:
pred_voting = model.predict(test)
sub["elect"] = pred_voting
sub.to_csv("./submission/240248.csv", index=False, encoding="utf-8")