In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import from_json
from pyspark.sql.types import *
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
import time as timer
import argparse
import datetime
import json
from pyspark.sql.functions import col, abs, mean, expr, substring, udf
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import RandomForestRegressor

from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

import random
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
#import lightgbm as lgb

schema = StructType(
    [
        StructField("num_date_time", StringType()),
        StructField("건물번호", StringType()),
        StructField("일시", StringType()),
        StructField("기온(C)", StringType()),
        StructField("강수량(mm)", StringType()),
        StructField("풍속(m/s)", StringType()),
        StructField("습도(%)", StringType()),
        StructField("일조(hr)", StringType()),
        StructField("일사(MJ/m2)", StringType()),
        StructField("전력소비량(kWh)", StringType()),
    ]
)

schema2 = StructType(
    [
        StructField("num_date_time", StringType()),
        StructField("건물번호", StringType()),
        StructField("일시", StringType()),
        StructField("기온(C)", StringType()),
        StructField("강수량(mm)", StringType()),
        StructField("풍속(m/s)", StringType()),
        StructField("습도(%)", StringType()),
    ]
)

schema3 = StructType(
    [
        StructField("건물번호", StringType()),
        StructField("건물유형", StringType()),
        StructField("연면적(m2)", StringType()),
        StructField("냉방면적(m2)", StringType()),
        StructField("태양광용량(kW)", StringType()),
        StructField("ESS저장용량(kWh)", StringType()),
        StructField("PCS용량(kW)", StringType()),
    ]
)



print("FILES IN THIS DIRECTORY")
print(os.listdir(os.getcwd()))

FILES IN THIS DIRECTORY
['.bashrc', '.bash_logout', '.profile', 'Untitled.ipynb', 'confluentinc-kafka-connect-mqtt-1.7.2.zip', '.ipython', '.cache', '.npm', '.bash_history', '.local', '.ipynb_checkpoints', 'config.json', '.ivy2', '.jupyter', 'jars', '.conda', '.config', '.wget-hsts', 'work']


## Fixed Random-Seed

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

In [3]:
# config.json 파일 읽기
with open("config.json", "r") as f:
    config = json.load(f)

jar_urls = ",".join(config["KAFKA_JAR_URLS"])
repartition_num = config["NUM_EXECUTORS"] * config["EXECUTOR_CORES"] * 2

In [4]:
# SparkSession 생성
spark = (
    SparkSession.builder.master("spark://spark-master-service:7077")
    .config("spark.driver.bindAddress", "0.0.0.0")
    .config("spark.driver.host", "10.42.2.146")
    .config("spark.driver.port", "39337")
    .config("spark.cores.max", "24")
    .config("spark.executor.instances", config["NUM_EXECUTORS"])
    .config("spark.executor.cores", config["EXECUTOR_CORES"])
    .config("spark.executor.memory", config["EXECUTOR_MEMORY"])
    .config("spark.defaul.parallelism", repartition_num)
    .config("spark.sql.shuffle.partitions", repartition_num)
    .config("spark.jars", jar_urls)  # JAR 파일 포함
    .appName("asdf")
    .getOrCreate()
)

sc = spark.sparkContext
sc.setLogLevel("ERROR")

24/05/21 08:56:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/21 08:56:18 WARN SparkConf: Total executor cores: 24 is not divisible by cores per executor: 16, the left cores: 8 will not be allocated


In [5]:
sc

In [6]:
print("Current Spark configuration:")
for key, value in sorted(sc._conf.getAll(), key=lambda x: x[0]):
    print(f"{key} = {value}")

Current Spark configuration:
spark.app.id = app-20240521085619-0391
spark.app.initial.jar.urls = spark://10.42.2.146:39337/jars/spark-streaming-kafka-0-10_2.12-3.2.4.jar,spark://10.42.2.146:39337/jars/hadoop-client-runtime-3.3.1.jar,spark://10.42.2.146:39337/jars/jsr305-3.0.0.jar,spark://10.42.2.146:39337/jars/htrace-core4-4.1.0-incubating.jar,spark://10.42.2.146:39337/jars/commons-logging-1.1.3.jar,spark://10.42.2.146:39337/jars/kafka-clients-2.8.1.jar,spark://10.42.2.146:39337/jars/spark-token-provider-kafka-0-10_2.12-3.2.4.jar,spark://10.42.2.146:39337/jars/commons-pool2-2.6.2.jar,spark://10.42.2.146:39337/jars/hadoop-client-api-3.3.1.jar,spark://10.42.2.146:39337/jars/spark-sql-kafka-0-10_2.12-3.2.4.jar
spark.app.name = asdf
spark.app.startTime = 1716281778650
spark.cores.max = 24
spark.defaul.parallelism = 96
spark.driver.bindAddress = 0.0.0.0
spark.driver.host = 10.42.2.146
spark.driver.port = 39337
spark.executor.cores = 16
spark.executor.id = driver
spark.executor.instances = 3

In [7]:
# 그냥 가져오기
building_sdf = (
    spark.read.format("kafka")
    .option("kafka.bootstrap.servers", "my-cluster-kafka-bootstrap.kafka.svc:9092")
    .option("subscribe", "building-jy")
    .option("kafka.group.id", "my_consumer_group")
    .load()
)  # 밀리초 단위 에포치 시간endingTimestamp
building_sdf = building_sdf.selectExpr("CAST(value AS STRING)", "CAST(timestamp AS STRING)")
building_sdf = building_sdf.withColumnRenamed("timestamp", "createTime")
building_sdf = building_sdf.withColumn("value", from_json(building_sdf["value"], schema3))


for field in schema3.fields:
    building_sdf = building_sdf.withColumn(field.name, building_sdf["value." + field.name])
building_sdf = building_sdf.drop("value")
# 이거쓰면 df가 repartition_num 수만큼 쪼개져서 병렬처리가능한 상태가 됨.
building_sdf = building_sdf.repartition(repartition_num)


building_sdf.printSchema()

root
 |-- createTime: string (nullable = true)
 |-- 건물번호: string (nullable = true)
 |-- 건물유형: string (nullable = true)
 |-- 연면적(m2): string (nullable = true)
 |-- 냉방면적(m2): string (nullable = true)
 |-- 태양광용량(kW): string (nullable = true)
 |-- ESS저장용량(kWh): string (nullable = true)
 |-- PCS용량(kW): string (nullable = true)



In [8]:
building_sdf = building_sdf.select(
    building_sdf["createTime"],
    building_sdf["건물번호"],
    building_sdf["건물유형"],
    building_sdf["연면적(m2)"],
    building_sdf["냉방면적(m2)"],
    building_sdf["태양광용량(kW)"],
    building_sdf["ESS저장용량(kWh)"],
    building_sdf["PCS용량(kW)"]
) \
.withColumn("createTime", building_sdf["createTime"].cast(StringType())) \
.withColumn("건물번호", building_sdf["건물번호"].cast(IntegerType())) \
.withColumn("건물유형", building_sdf["건물유형"].cast(StringType())) \
.withColumn("연면적(m2)", building_sdf["연면적(m2)"].cast(DoubleType())) \
.withColumn("냉방면적(m2)", building_sdf["냉방면적(m2)"].cast(DoubleType())) \
.withColumn("태양광용량(kW)", building_sdf["태양광용량(kW)"].cast(DoubleType())) \
.withColumn("ESS저장용량(kWh)", building_sdf["ESS저장용량(kWh)"].cast(IntegerType())) \
.withColumn("PCS용량(kW)", building_sdf["PCS용량(kW)"].cast(IntegerType()))


In [9]:
building_sdf.show()

[Stage 0:>                                                          (0 + 3) / 3]

+--------------------+--------+--------------+----------+------------+--------------+----------------+-----------+
|          createTime|건물번호|      건물유형|연면적(m2)|냉방면적(m2)|태양광용량(kW)|ESS저장용량(kWh)|PCS용량(kW)|
+--------------------+--------+--------------+----------+------------+--------------+----------------+-----------+
|2024-05-13 11:15:...|      65|        아파트|  183839.0|         0.0|          null|            null|       null|
|2024-05-13 11:15:...|      14|      건물기타|  16844.16|    14102.92|          56.0|            null|       null|
|2024-05-16 13:12:...|      36|    데이터센터|   8816.49|     8816.49|          null|            null|       null|
|2024-05-13 11:15:...|      46|          병원|  85869.49|     78675.0|        100.56|            null|       null|
|2024-05-13 11:14:...|      30|        대학교|  155785.0|    106305.0|          20.0|            null|       null|
|2024-05-16 13:12:...|      40|백화점및아울렛|   58483.0|     40775.0|          null|            null|       null|
|2024-05-21 01:

                                                                                

In [10]:
# 그냥 가져오기
test_sdf = (
    spark.read.format("kafka")
    .option("kafka.bootstrap.servers", "my-cluster-kafka-bootstrap.kafka.svc:9092")
    .option("subscribe", "test2-jy")
    .option("kafka.group.id", "my_consumer_group")
    .load()
)  # 밀리초 단위 에포치 시간endingTimestamp
test_sdf = test_sdf.selectExpr("CAST(value AS STRING)", "CAST(timestamp AS STRING)")
test_sdf = test_sdf.withColumnRenamed("timestamp", "createTime")
test_sdf = test_sdf.withColumn("value", from_json(test_sdf["value"], schema2))


for field in schema2.fields:
    test_sdf = test_sdf.withColumn(field.name, test_sdf["value." + field.name])
test_sdf = test_sdf.drop("value")
# 이거쓰면 df가 repartition_num 수만큼 쪼개져서 병렬처리가능한 상태가 됨.
test_sdf = test_sdf.repartition(repartition_num)


test_sdf.printSchema()

root
 |-- createTime: string (nullable = true)
 |-- num_date_time: string (nullable = true)
 |-- 건물번호: string (nullable = true)
 |-- 일시: string (nullable = true)
 |-- 기온(C): string (nullable = true)
 |-- 강수량(mm): string (nullable = true)
 |-- 풍속(m/s): string (nullable = true)
 |-- 습도(%): string (nullable = true)



In [11]:
test_sdf = test_sdf.select(
    test_sdf["createTime"],
    test_sdf["num_date_time"],
    test_sdf["건물번호"],
    test_sdf["일시"],
    test_sdf["기온(C)"],
    test_sdf["강수량(mm)"],
    test_sdf["풍속(m/s)"],
    test_sdf["습도(%)"]
) \
.withColumn("createTime", test_sdf["createTime"].cast(StringType())) \
.withColumn("num_date_time", test_sdf["num_date_time"].cast(StringType())) \
.withColumn("건물번호", test_sdf["건물번호"].cast(IntegerType())) \
.withColumn("일시", test_sdf["일시"].cast(StringType())) \
.withColumn("기온(C)", test_sdf["기온(C)"].cast(DoubleType())) \
.withColumn("강수량(mm)", test_sdf["강수량(mm)"].cast(DoubleType())) \
.withColumn("풍속(m/s)", test_sdf["풍속(m/s)"].cast(DoubleType())) \
.withColumn("습도(%)", test_sdf["습도(%)"].cast(IntegerType()))

In [12]:
test_sdf.show()

[Stage 5:>                                                          (0 + 3) / 3]

+--------------------+--------------+--------+-----------+-------+----------+---------+-------+
|          createTime| num_date_time|건물번호|       일시|기온(C)|강수량(mm)|풍속(m/s)|습도(%)|
+--------------------+--------------+--------+-----------+-------+----------+---------+-------+
|2024-05-21 01:23:...|74_20220827 08|      74|20220827 08|   18.3|       0.0|      2.1|     77|
|2024-05-21 01:23:...|57_20220827 03|      57|20220827 03|   20.7|       2.0|      2.7|     91|
|2024-05-21 01:23:...|24_20220827 09|      24|20220827 09|   19.5|       0.0|      2.4|     63|
|2024-05-21 01:23:...|94_20220829 04|      94|20220829 04|   21.8|       0.0|      1.3|     66|
|2024-05-21 01:23:...| 5_20220829 15|       5|20220829 15|   22.2|       0.0|      1.8|     73|
|2024-05-21 01:23:...|35_20220831 04|      35|20220831 04|   19.8|       0.0|      1.2|     94|
|2024-05-21 01:23:...|67_20220827 04|      67|20220827 04|   22.6|       0.0|      3.2|     79|
|2024-05-21 01:23:...|63_20220830 20|      63|20220830 

                                                                                

In [13]:
# 그냥 가져오기
train_sdf = (
    spark.read.format("kafka")
    .option("kafka.bootstrap.servers", "my-cluster-kafka-bootstrap.kafka.svc:9092")
    .option("subscribe", "test-jy")
    .option("kafka.group.id", "my_consumer_group")
    .load()
)  # 밀리초 단위 에포치 시간endingTimestamp

train_sdf = train_sdf.selectExpr("CAST(value AS STRING)", "CAST(timestamp AS STRING)")
train_sdf = train_sdf.withColumnRenamed("timestamp", "createTime")
train_sdf = train_sdf.withColumn("value", from_json(train_sdf["value"], schema))


for field in schema.fields:
    train_sdf = train_sdf.withColumn(field.name, train_sdf["value." + field.name])
train_sdf = train_sdf.drop("value")
# 이거쓰면 df가 repartition_num 수만큼 쪼개져서 병렬처리가능한 상태가 됨.


train_sdf = train_sdf.repartition(repartition_num)

train_sdf.printSchema()


root
 |-- createTime: string (nullable = true)
 |-- num_date_time: string (nullable = true)
 |-- 건물번호: string (nullable = true)
 |-- 일시: string (nullable = true)
 |-- 기온(C): string (nullable = true)
 |-- 강수량(mm): string (nullable = true)
 |-- 풍속(m/s): string (nullable = true)
 |-- 습도(%): string (nullable = true)
 |-- 일조(hr): string (nullable = true)
 |-- 일사(MJ/m2): string (nullable = true)
 |-- 전력소비량(kWh): string (nullable = true)



In [14]:
train_sdf = train_sdf.select(
    train_sdf["createTime"],
    train_sdf["num_date_time"],
    train_sdf["건물번호"],
    train_sdf["일시"],
    train_sdf["기온(C)"],
    train_sdf["강수량(mm)"],
    train_sdf["풍속(m/s)"],
    train_sdf["습도(%)"],
    train_sdf["일조(hr)"],
    train_sdf["일사(MJ/m2)"],
    train_sdf["전력소비량(kWh)"],
) \
.withColumn("createTime", train_sdf["createTime"].cast(StringType())) \
.withColumn("num_date_time", train_sdf["num_date_time"].cast(StringType())) \
.withColumn("건물번호", train_sdf["건물번호"].cast(IntegerType())) \
.withColumn("일시", train_sdf["일시"].cast(StringType())) \
.withColumn("기온(C)", train_sdf["기온(C)"].cast(DoubleType())) \
.withColumn("강수량(mm)", train_sdf["강수량(mm)"].cast(DoubleType())) \
.withColumn("풍속(m/s)", train_sdf["풍속(m/s)"].cast(DoubleType())) \
.withColumn("습도(%)", train_sdf["습도(%)"].cast(IntegerType())) \
.withColumn("일조(hr)", train_sdf["일조(hr)"].cast(DoubleType())) \
.withColumn("일사(MJ/m2)", train_sdf["일사(MJ/m2)"].cast(DoubleType())) \
.withColumn("전력소비량(kWh)", train_sdf["전력소비량(kWh)"].cast(DoubleType()))

In [15]:
train_sdf.show()



+--------------------+--------------+--------+-----------+-------+----------+---------+-------+--------+-----------+---------------+
|          createTime| num_date_time|건물번호|       일시|기온(C)|강수량(mm)|풍속(m/s)|습도(%)|일조(hr)|일사(MJ/m2)|전력소비량(kWh)|
+--------------------+--------------+--------+-----------+-------+----------+---------+-------+--------+-----------+---------------+
|2024-05-21 01:18:...|81_20220706 17|      81|20220706 17|   33.2|      null|      2.1|     62|     0.3|       1.49|        2403.36|
|2024-05-21 01:18:...|69_20220615 05|      69|20220615 05|   15.2|       0.8|      2.2|     93|    null|       null|         3369.9|
|2024-05-21 01:18:...|76_20220806 16|      76|20220806 16|   34.5|      null|      1.6|     64|     0.6|        1.6|        1127.88|
|2024-05-21 01:17:...|26_20220620 05|      26|20220620 05|   20.6|      null|      0.3|     96|    null|       null|        2698.08|
|2024-05-21 01:17:...|41_20220729 20|      41|20220729 20|   30.9|       0.0|      2.4|     6

                                                                                

In [16]:
def train_test_split(df, th):
    df = df.na.drop(subset=['일시'])
    train = df.filter(col('일시').substr(1, 8).cast(IntegerType()) < th)
    test = df.filter(col('일시').substr(1, 8).cast(IntegerType()) >= th)
    return train, test

def preprocess_x(df):
    to_remove_columns = ['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)']
    df = df.fillna(0)
    
    # 시계열 특성을 학습에 반영하기 위해 일시를 월, 일, 시간으로 나눕니다
    df = df.withColumn('month', substring('일시', 5, 2).cast(IntegerType()))
    df = df.withColumn('day', substring('일시', 7, 2).cast(IntegerType()))
    df = df.withColumn('time', substring('일시', 10, 2).cast(IntegerType()))
    
    df = df.join(building_sdf.select('건물번호', '건물유형', '연면적(m2)'), on='건물번호', how='left')
    
    # '건물유형'을 카테고리형 코드로 변환
    building_type_indexer = StringIndexer(inputCol='건물유형', outputCol='건물유형_index')
    df = building_type_indexer.fit(df).transform(df)
    
    # 불필요한 컬럼 삭제
    for c in to_remove_columns:
        if c in df.columns:
            df = df.drop(c)
    return df


date_th = 20220820

In [17]:

train_df, valid_df = train_test_split(train_sdf, date_th)

# 데이터 분할 후 각 데이터프레임의 크기를 확인합니다.
print("Train DataFrame shape:", train_df.count(), len(train_df.columns))
print("Validation DataFrame shape:", valid_df.count(), len(valid_df.columns))

train_x = preprocess_x(train_df)
train_y = train_df.select("전력소비량(kWh)")

# 전처리 후 데이터프레임의 크기를 확인합니다.
print("Train X shape:", train_df.count(), len(train_df.columns))
print("Train Y shape:", train_y.count(), len(train_y.columns))

valid_x = preprocess_x(valid_df)
valid_y = valid_df.select("전력소비량(kWh)")

print("Validation X shape:", valid_df.count(), len(valid_df.columns))
print("Validation Y shape:", valid_y.count(), len(valid_y.columns))



                                                                                

Train DataFrame shape: 192000 11


                                                                                

Validation DataFrame shape: 12000 11


                                                                                

Train X shape: 192000 11


                                                                                

Train Y shape: 192000 1


                                                                                

Validation X shape: 12000 11




Validation Y shape: 12000 1


                                                                                

In [None]:
#spark.stop()

In [18]:
train_x.show()



+--------+--------------------+-------+----------+---------+-------+-----+---+----+------------+----------+--------------+
|건물번호|          createTime|기온(C)|강수량(mm)|풍속(m/s)|습도(%)|month|day|time|    건물유형|연면적(m2)|건물유형_index|
+--------+--------------------+-------+----------+---------+-------+-----+---+----+------------+----------+--------------+
|      25|2024-05-21 01:17:...|   23.7|       0.0|      2.8|     96|    7| 14|   3|      대학교| 528014.05|           2.0|
|      25|2024-05-21 01:17:...|   23.7|       0.0|      2.8|     96|    7| 14|   3|      대학교| 528014.05|           2.0|
|      25|2024-05-21 01:17:...|   23.7|       0.0|      2.8|     96|    7| 14|   3|      대학교| 528014.05|           2.0|
|      25|2024-05-21 01:17:...|   23.7|       0.0|      2.8|     96|    7| 14|   3|      대학교| 528014.05|           2.0|
|      25|2024-05-21 01:17:...|   23.7|       0.0|      2.8|     96|    7| 14|   3|      대학교| 528014.05|           2.0|
|      25|2024-05-21 01:17:...|   23.7|       0.0|     

                                                                                

In [19]:
train_y.show()



+---------------+
|전력소비량(kWh)|
+---------------+
|        5351.52|
|         1983.0|
|          917.4|
|        1009.08|
|        1276.02|
|        2023.68|
|         1117.8|
|          572.4|
|         2352.0|
|        3827.52|
|         2325.6|
|        1707.12|
|        3120.12|
|        2103.84|
|        9758.16|
|         4368.0|
|        3750.72|
|         416.64|
|         240.66|
|        1092.96|
+---------------+
only showing top 20 rows



                                                                                

### 0. Functions For validation

In [20]:
def SMAPE(y, pred):
    smape = abs((y - pred))/((abs(y) + abs(pred)) / 2) * 100
    smape = np.mean(smape)
    return smape

def mae(y, pred):
    return np.mean(abs(y-pred))

In [21]:
def validate(valid_x, valid_y, model):
    pred = model.predict(valid_x)
    smape_score, mae_score = SMAPE(valid_y, pred), mae(valid_y, pred)
    return smape_score, mae_score

- n_estimators값이 300정도에서 모델 성능이 괜찮은 것을 확인할 수 있습니다.

## 2. Multi Models by building num
- 건물번호(1~100)별 모델을 각각 만들어 성능을 측정하였습니다.
- 건물별로 building_info값은 같기 때문에 해당 데이터는 제외했습니다(건물번호, 건물유형, 연면적, 냉방면적).

In [22]:
def validate_multi(valid_x, valid_y, models):
    """
    Args:
        models: dict, {1: model1, 2: model2, ..., 100: model100}
    """
    preds = []
    for i in range(1, 101):
        _x = valid_x.filter(col('건물번호') == i) 
        _x = _x.drop('건물번호', '건물유형', '연면적(m2)', '냉방면적(m2)')
        
        pred_df = models[i].transform(_x).select('predection')
        pred_list = [row.predeiction for row in pred_df.collect()]
        
        preds.extend(pred_list)
    
    preds = np.array(preds)
        
    
    smape_score = SMAPE(valid_y, preds_df)
    mae_score = MAE(valid_y, preds_df)
    
    return smape_score, mae_score

In [23]:
def train_multiple_models(train_x, train_y, n_estimators=100):
    models = {}
    
    
    train_y = train_y.withColumnRenamed('전력소비량(kWh)', 'label')
    
    for i in tqdm(range(1, 101)):
        _x = train_x.filter(col('건물번호') == i)
        _y = train_y.filter(col('건물번호') == i).select('label')
        
        _x = _x.drop('건물번호', '건물유형', '연면적(m2)', '냉방면적(m2)', 'createTime')
        
        feature_cols = [c for c in _x.columns if c != 'label']
        assembler = VectorAssembler(inputCols = feature_cols, outputCol = "features")
        _x = assembler.transform(_x).select("features")
        
        train_data = _x.join(_y)
        
        rf = RandomForestRegressor(featuresCol='features', labelCol='label', numTrees=n_estimators)
        model = rf.fit(train_data)
        
        models[i] = model
        
        
    return models

In [24]:
models1 = train_multiple_models(train_x, train_y)



In [None]:
smape_score, mae_score = validate_multi(valid_x, valid_y, models1)
print(f'SMAPE: {smape_score}\nMAE: {mae_score}')

#### 2-1 hyper parameter 조절
- 건물별 모델을 따로 만드는게 성능이 약간 더 좋았고(15.798 -> 13.13), multiple models의 경우 n_estimators를 작게 하는게 오히려 성능이 좋았습니다.
- 아마 모델별 데이터의 양이 작아져서 학습횟수를 크게하는게 과적합을 야기하는 것으로 생각됩니다.

In [None]:
models2 = train_multiple_models(train_x, train_y, 50)

In [None]:
smape_score, mae_score = validate_multi(valid_x, valid_y, models2)
print(f'SMAPE: {smape_score}\nMAE: {mae_score}')

In [None]:
models3 = train_multiple_models(train_x, train_y, 300)

In [None]:
smape_score, mae_score = validate_multi(valid_x, valid_y, models3)
print(f'SMAPE: {smape_score}\nMAE: {mae_score}')

## 3. Add Features

### A. Weekday
- [가벼운 DATA EDA입니다.](https://dacon.io/competitions/official/236125/codeshare/8500?page=1&dtype=recent)에서 주말과 주중 전력사용량이 유의미한 차이를 보임을 확인할 수 있었습니다. 그래서 날짜를 이용해 요일데이터를 추출해 feature로 사용하였습니다.

In [None]:
import datetime

In [None]:
def to_datetime(s):
    """
    Args:
        s: ex) '20220601 01'
    Returns:
        weekday: 0~6(int), 0: 월요일, 1: 화요일, ...
    """
    s = s.split()[0]  # 20220601
    date = datetime.datetime.strptime(s, '%Y%m%d')
    weekday = date.weekday()  # 
    return weekday

In [None]:
train_origin_ = train_origin.copy()

In [None]:
train_origin_['Weekday'] = train_origin_.apply(lambda x:to_datetime(x['일시']), axis=1)

In [None]:
train_origin_.sample(5, random_state=42)

In [None]:
# train, valid데이터 재생성
train_df, valid_df = train_test_split(train_origin_, 20220820)

train_x = preprocess_x(train_df)
train_y = train_df['전력소비량(kWh)']

valid_x = preprocess_x(valid_df)
valid_y = valid_df['전력소비량(kWh)']

In [None]:
models_f1 = train_multiple_models(train_x, train_y)

In [None]:
smape_score, mae_score = validate_multi(valid_x, valid_y, models_f1)
print(f'SMAPE: {smape_score}\nMAE: {mae_score}')

#### A-1 hyper paramter 조절
- weekday 데이터를 추가하니 validation SMAPE score가 13.13 -> 5.97 까지 하락하였습니다.
- 참고로 해당 모델(validation score=5.976...)로 만든 submission을 dacon에 제출하였을때가 **best score였으며, score는 6.768이었습니다.**

In [None]:
models_f2 = train_multiple_models(train_x, train_y, 50)

In [None]:
smape_score, mae_score = validate_multi(valid_x, valid_y, models_f2)
print(f'SMAPE: {smape_score}\nMAE: {mae_score}')

In [None]:
models_f3 = train_multiple_models(train_x, train_y, 300)

In [None]:
smape_score, mae_score = validate_multi(valid_x, valid_y, models_f3)
print(f'SMAPE: {smape_score}\nMAE: {mae_score}')

### B. 전날과의 기온, 풍속 차이
- 전 날과의 기온과 풍속, 습도의 차이값을 feature에 추가해보았습니다.
- weekday는 그대로 사용하였습니다(train_origin_)

In [None]:
train_origin_b = train_origin_.copy()

In [None]:
for i in range(1, 101):
    df = train_origin_b[train_origin_b['건물번호'] == i]
    train_origin_b.loc[df.index, '기온_gap'] = df['기온(C)'] - df.shift(1)['기온(C)']
    train_origin_b.loc[df.index, '풍속_gap'] = df['풍속(m/s)'] - df.shift(1)['풍속(m/s)']
    train_origin_b.loc[df.index, '습도_gap'] = df['습도(%)'] - df.shift(1)['습도(%)']

In [None]:
train_origin_b.head()

In [None]:
train_origin_b['기온_gap'] = train_origin_b['기온_gap'].fillna(0)
train_origin_b['풍속_gap'] = train_origin_b['풍속_gap'].fillna(0)
train_origin_b['습도_gap'] = train_origin_b['습도_gap'].fillna(0)

In [None]:
train_df, valid_df = train_test_split(train_origin_b, 20220820)

train_x = preprocess_x(train_df)
train_y = train_df['전력소비량(kWh)']

valid_x = preprocess_x(valid_df)
valid_y = valid_df['전력소비량(kWh)']

In [None]:
train_x.head()

In [None]:
models_b1 = train_multiple_models(train_x, train_y)

In [None]:
smape_score, mae_score = validate_multi(valid_x, valid_y, models_b1)
print(f'SMAPE: {smape_score}\nMAE: {mae_score}')

#### B-1 hyper paramter 조절
- 그냥 weekday와 추가한 모델과 결과에서 큰 차이는 없었습니다.

In [None]:
models_b2 = train_multiple_models(train_x, train_y, 50)

In [None]:
smape_score, mae_score = validate_multi(valid_x, valid_y, models_b2)
print(f'SMAPE: {smape_score}\nMAE: {mae_score}')

In [None]:
models_b3 = train_multiple_models(train_x, train_y, 30)

In [None]:
smape_score, mae_score = validate_multi(valid_x, valid_y, models_b3)
print(f'SMAPE: {smape_score}\nMAE: {mae_score}')

### C. 1주일 전 전력소모값
- test 데이터의 가장 마지막 날짜(8월31일)과 train 데이터의 가장 마지막 날짜(8월24일)의 gap이 7일이므로, test 데이터의 경우 과거 7일전 이상의 전력소모값은 접근할 수 있습니다.
- B에서 생성한 feature는 사용하지않고, weekday(train_origin_)만 사용하였습니다.

In [None]:
train_origin_c = train_origin_.copy()

In [None]:
n = 7*24 # 7*24시간 전 전력소모량
for i in range(1, 101):
    df = train_origin_c[train_origin_c['건물번호'] == i]
    train_origin_c.loc[df.index, f'{n}시간 전 전력소비량'] = df.shift(n)['전력소비량(kWh)']

In [None]:
train_df, valid_df = train_test_split(train_origin_c, 20220820)

train_x = preprocess_x(train_df)
train_y = train_df['전력소비량(kWh)']

valid_x = preprocess_x(valid_df)
valid_y = valid_df['전력소비량(kWh)']

In [None]:
train_x = train_x[train_x['168시간 전 전력소비량'] != 0]  # NaN이 0으로 replace되어 0이 아닌 row들 삭제
train_y = train_y[train_x.index]

In [None]:
train_x.head()

In [None]:
models_c1 = train_multiple_models(train_x, train_y)

In [None]:
smape_score, mae_score = validate_multi(valid_x, valid_y, models_c1)
print(f'SMAPE: {smape_score}\nMAE: {mae_score}')

In [None]:
models_c2 = train_multiple_models(train_x, train_y, 50)

In [None]:
smape_score, mae_score = validate_multi(valid_x, valid_y, models_c2)
print(f'SMAPE: {smape_score}\nMAE: {mae_score}')

In [None]:
models_c3 = train_multiple_models(train_x, train_y, 300)

In [None]:
smape_score, mae_score = validate_multi(valid_x, valid_y, models_c3)
print(f'SMAPE: {smape_score}\nMAE: {mae_score}')

- 오히려 score가 떨어졌습니다.

### D. 7일전 24시간동안의 전력소모량
- 8일전\~7일전(191시간전\~168시간전) 데이터를 feature로 추가해 사용해보았습니다.
- 위와 마찬가지로 weekday만 있는 모델에서 feature를 추가하였습니다.

In [None]:
train_origin_d = train_origin_.copy()

In [None]:
for i in range(1, 101):
    df = train_origin_d[train_origin_d['건물번호'] == i]
    fr, to = 7*24, 7*24+24
    for n in range(fr, to):
        train_origin_d.loc[df.index, f'{n}시간 전 전력소비량'] = df.shift(n)['전력소비량(kWh)']

In [None]:
train_df, valid_df = train_test_split(train_origin_d, 20220820)

train_x = preprocess_x(train_df)
train_y = train_df['전력소비량(kWh)']

valid_x = preprocess_x(valid_df)
valid_y = valid_df['전력소비량(kWh)']

In [None]:
train_x.head()

In [None]:
train_x = train_x[train_x['191시간 전 전력소비량'] != 0]  # NaN이 0으로 replace되어 0이 아닌 row들 삭제
train_y = train_y[train_x.index]

In [None]:
models_d1 = train_multiple_models(train_x, train_y)

In [None]:
smape_score, mae_score = validate_multi(valid_x, valid_y, models_d1)
print(f'SMAPE: {smape_score}\nMAE: {mae_score}')

In [None]:
models_d2 = train_multiple_models(train_x, train_y, 50)

In [None]:
smape_score, mae_score = validate_multi(valid_x, valid_y, models_d2)
print(f'SMAPE: {smape_score}\nMAE: {mae_score}')

In [None]:
models_d3 = train_multiple_models(train_x, train_y, 300)

In [None]:
smape_score, mae_score = validate_multi(valid_x, valid_y, models_d3)
print(f'SMAPE: {smape_score}\nMAE: {mae_score}')

- feature를 추가하는게 오히려 성능을 떨어뜨림을 확인할 수 있습니다.

## Make Submission
- 전체 학습 데이터셋을 이용해 모델 생성
- 3-A(multiple models, add weekday)모델을 사용하였습니다.

In [None]:
submission = pd.read_csv('./data/sample_submission.csv')
submission

train_origin['Weekday'] = train_origin.apply(lambda x:to_datetime(x['일시']), axis=1)
test_origin['Weekday'] = test_origin.apply(lambda x:to_datetime(x['일시']), axis=1)
train_x_full = preprocess_x(train_origin)
train_y_full = train_origin['전력소비량(kWh)']

models = train_multiple_models(train_x_full, train_y_full)

In [None]:
test_x_real = preprocess_x(test_origin)

In [None]:
preds_real = []
for i in tqdm(range(1, 101)):
    _x = test_x_real[test_x_real['건물번호'] == i]
    _x = _x.drop(columns=['건물번호', '건물유형', '연면적(m2)', '냉방면적(m2)'])
    pred = models[i].predict(_x).tolist()
    preds_real.extend(pred)

In [None]:
submission['answer'] = preds_real

In [None]:
submission.head()

## 결론
- 건물별 100개의 모델을 따로 생성하고, weekday를 추가한 모델이 가장 좋은 성능을 보였습니다.
- validation score는 5.976, 실제 submission score는 6.76. 
- 두 값의 차이가 나는 이유는 best valiation score를 찾는 과정에서 약간의 overfitting이 발생하기도 했고, submssion에 사용하는 test 데이터의 설명변수(X)값들은 실제 값이 아닌 예측치라는 점도 영향을 미친것 같습니다.

## TODO
- 추가적인 feature를 고려해보았는데, 마땅히 떠오르는 아이디어가 없어서 모델의 hyper parameter 튜닝을 중점적으로 진행할 것 같습니다.
- 그리고 valid_y와 preds를 비교해 성능이 잘 안나오는 모델의 건물번호, 시간대 등을 분석해 모델을 보완해나가는게 필요할 것 같습니다.

## Epilog
- 3-A 모델에서 예측을 잘 못한 모델을 한번 뽑아봤습니다.

In [None]:
# train, valid데이터 재생성
train_df, valid_df = train_test_split(train_origin, 20220820)
train_x = preprocess_x(train_df)
train_y = train_df['전력소비량(kWh)']
valid_x = preprocess_x(valid_df)
valid_y = valid_df['전력소비량(kWh)']
models_f2 = train_multiple_models(train_x, train_y, 50)
preds = []
for i in range(1, 101):
    _x = valid_x[valid_x['건물번호'] == i]
    _x = _x.drop(columns=['건물번호', '건물유형', '연면적(m2)', '냉방면적(m2)'])
    pred = models[i].predict(_x).tolist()
    preds.extend(pred)
preds = np.array(preds)

In [None]:
eda = valid_x.copy()
eda['pred'], eda['y'] = preds, valid_y
eda['gap'] = (eda['y'] - eda['pred']) / ((eda['y'] + eda['pred'])/2)  # SMAPE가 target이기 때문에 scale 고려

In [None]:
eda.sort_values('gap')

In [None]:
bads = eda.groupby('건물번호').agg(lambda x:np.mean(abs(x))).sort_values('gap').tail(5)
goods = eda.groupby('건물번호').agg(lambda x:np.mean(abs(x))).sort_values('gap').head(5)

In [None]:
bads

In [None]:
goods

- 95, 14번 건물 모델이 성능이 잘 안나왔고, 33, 32번 건물 모델 성능이 잘나왔네요.
- 95, 14번처럼 매 시각마다 전력사용량이 크게 바뀌는 건물의 경우 좀더 복잡한 모델을 사용하는게 점수가 잘나올 듯 합니다.

In [None]:
eda[eda['건물번호']==95][['y', 'pred']].plot()

In [None]:
eda[eda['건물번호']==14][['y', 'pred']].plot()

In [None]:
eda[eda['건물번호']==33][['y', 'pred']].plot()

In [None]:
eda[eda['건물번호']==32][['y', 'pred']].plot()