In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import from_json
from pyspark.sql.types import *
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
import time as timer
import argparse
import datetime
import json
from pyspark.sql.functions import col, abs, mean, expr, substring, udf
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import RandomForestRegressor

from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

import random
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
#import lightgbm as lgb

os.environ['PYSPARK_SUBMIT_ARGS'] = '--conf spark.driver.extraJavaOptions=-Dio.netty.tryReflectionSetAccessible=true --conf spark.executor.extraJavaOptions=-Dio.netty.tryReflectionSetAccessible=true pyspark-shell'

schema = StructType(
    [
        StructField("num_date_time", StringType()),
        StructField("건물번호", StringType()),
        StructField("일시", StringType()),
        StructField("기온(C)", StringType()),
        StructField("강수량(mm)", StringType()),
        StructField("풍속(m/s)", StringType()),
        StructField("습도(%)", StringType()),
        StructField("일조(hr)", StringType()),
        StructField("일사(MJ/m2)", StringType()),
        StructField("전력소비량(kWh)", StringType()),
    ]
)

schema2 = StructType(
    [
        StructField("num_date_time", StringType()),
        StructField("건물번호", StringType()),
        StructField("일시", StringType()),
        StructField("기온(C)", StringType()),
        StructField("강수량(mm)", StringType()),
        StructField("풍속(m/s)", StringType()),
        StructField("습도(%)", StringType()),
    ]
)

schema3 = StructType(
    [
        StructField("건물번호", StringType()),
        StructField("건물유형", StringType()),
        StructField("연면적(m2)", StringType()),
        StructField("냉방면적(m2)", StringType()),
        StructField("태양광용량(kW)", StringType()),
        StructField("ESS저장용량(kWh)", StringType()),
        StructField("PCS용량(kW)", StringType()),
    ]
)



print("FILES IN THIS DIRECTORY")
print(os.listdir(os.getcwd()))

FILES IN THIS DIRECTORY
['.bashrc', '.bash_logout', '.profile', '.ipython', '.cache', '.npm', '.bash_history', '.local', '.ipynb_checkpoints', 'config.json', '.jupyter', 'jars', '.conda', '.config', '.wget-hsts', 'work']


## Fixed Random-Seed

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

In [3]:
#config.json 파일 읽기
with open("config.json", "r") as f:
   config = json.load(f)

jar_urls = ",".join(config["KAFKA_JAR_URLS"])
repartition_num = config["NUM_EXECUTORS"] * config["EXECUTOR_CORES"] * 2



In [4]:
# SparkSession 생성
spark = (
    SparkSession.builder.master("spark://spark-master-service:7077")
    .config("spark.driver.bindAddress", "0.0.0.0")
    .config("spark.driver.host", "10.42.2.18")
    .config("spark.driver.port", "39337")
    .config("spark.cores.max", "48")
    .config("spark.network.timeout", "600s")
    .config("spark.executor.instances", config["NUM_EXECUTORS"])
    .config("spark.executor.cores", config["EXECUTOR_CORES"])
    .config("spark.executor.memory", config["EXECUTOR_MEMORY"])   
    .config("spark.driver.memory", "30g")
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size","20g")
    
    .config("spark.defaul.parallelism", repartition_num)
    .config("spark.sql.shuffle.partitions", repartition_num)
    .config("spark.driver.extraJavaOptions", "--illegal-access=permit")
    .config("spark.executor.extraJavaOptions", "--illegal-access=permit")
    .config("spark.jars", jar_urls)  # JAR 파일 포함
    .appName("asdf")
    .getOrCreate()
)

sc = spark.sparkContext
sc.setLogLevel("ERROR")

24/05/26 05:16:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
sc

In [6]:
print("Current Spark configuration:")
for key, value in sorted(sc._conf.getAll(), key=lambda x: x[0]):
    print(f"{key} = {value}")

Current Spark configuration:
spark.app.id = app-20240526051644-0405
spark.app.initial.jar.urls = spark://10.42.2.18:39337/jars/commons-pool2-2.6.2.jar,spark://10.42.2.18:39337/jars/hadoop-client-api-3.3.1.jar,spark://10.42.2.18:39337/jars/kafka-clients-2.8.1.jar,spark://10.42.2.18:39337/jars/spark-sql-kafka-0-10_2.12-3.2.4.jar,spark://10.42.2.18:39337/jars/hadoop-client-runtime-3.3.1.jar,spark://10.42.2.18:39337/jars/htrace-core4-4.1.0-incubating.jar,spark://10.42.2.18:39337/jars/commons-logging-1.1.3.jar,spark://10.42.2.18:39337/jars/spark-streaming-kafka-0-10_2.12-3.2.4.jar,spark://10.42.2.18:39337/jars/jsr305-3.0.0.jar,spark://10.42.2.18:39337/jars/spark-token-provider-kafka-0-10_2.12-3.2.4.jar
spark.app.name = asdf
spark.app.startTime = 1716700603983
spark.cores.max = 48
spark.defaul.parallelism = 96
spark.driver.bindAddress = 0.0.0.0
spark.driver.extraJavaOptions = --illegal-access=permit
spark.driver.host = 10.42.2.18
spark.driver.memory = 30g
spark.driver.port = 39337
spark.exec

In [7]:
# 그냥 가져오기
building_sdf = (
    spark.read.format("kafka")
    .option("kafka.bootstrap.servers", "my-cluster-kafka-bootstrap.kafka.svc:9092")
    .option("subscribe", "building-jy")
    .option("kafka.group.id", "my_consumer_group")
    .load()
)  # 밀리초 단위 에포치 시간endingTimestamp
building_sdf = building_sdf.selectExpr("CAST(value AS STRING)", "CAST(timestamp AS STRING)")
building_sdf = building_sdf.withColumnRenamed("timestamp", "createTime")
building_sdf = building_sdf.withColumn("value", from_json(building_sdf["value"], schema3))


for field in schema3.fields:
    building_sdf = building_sdf.withColumn(field.name, building_sdf["value." + field.name])
building_sdf = building_sdf.drop("value")
# 이거쓰면 df가 repartition_num 수만큼 쪼개져서 병렬처리가능한 상태가 됨.
building_sdf = building_sdf.repartition(repartition_num)


building_sdf.printSchema()

root
 |-- createTime: string (nullable = true)
 |-- 건물번호: string (nullable = true)
 |-- 건물유형: string (nullable = true)
 |-- 연면적(m2): string (nullable = true)
 |-- 냉방면적(m2): string (nullable = true)
 |-- 태양광용량(kW): string (nullable = true)
 |-- ESS저장용량(kWh): string (nullable = true)
 |-- PCS용량(kW): string (nullable = true)



In [8]:
building_sdf = building_sdf.select(
    building_sdf["createTime"],
    building_sdf["건물번호"],
    building_sdf["건물유형"],
    building_sdf["연면적(m2)"],
    building_sdf["냉방면적(m2)"],
    building_sdf["태양광용량(kW)"],
    building_sdf["ESS저장용량(kWh)"],
    building_sdf["PCS용량(kW)"]
) \
.withColumn("createTime", building_sdf["createTime"].cast(StringType())) \
.withColumn("건물번호", building_sdf["건물번호"].cast(IntegerType())) \
.withColumn("건물유형", building_sdf["건물유형"].cast(StringType())) \
.withColumn("연면적(m2)", building_sdf["연면적(m2)"].cast(DoubleType())) \
.withColumn("냉방면적(m2)", building_sdf["냉방면적(m2)"].cast(DoubleType())) \
.withColumn("태양광용량(kW)", building_sdf["태양광용량(kW)"].cast(DoubleType())) \
.withColumn("ESS저장용량(kWh)", building_sdf["ESS저장용량(kWh)"].cast(IntegerType())) \
.withColumn("PCS용량(kW)", building_sdf["PCS용량(kW)"].cast(IntegerType()))


In [9]:
building_sdf.show()

                                                                                

+--------------------+--------+--------------+-----------+------------+--------------+----------------+-----------+
|          createTime|건물번호|      건물유형| 연면적(m2)|냉방면적(m2)|태양광용량(kW)|ESS저장용량(kWh)|PCS용량(kW)|
+--------------------+--------+--------------+-----------+------------+--------------+----------------+-----------+
|2024-05-21 01:23:...|      20|          공공|1.2872877E7| 1.0941945E7|          null|            null|       null|
|2024-05-21 01:24:...|      34|    데이터센터|    10665.0|      9402.0|          null|            null|       null|
|2024-05-21 01:24:...|       8|      건물기타|   75344.54|     24117.0|          null|            null|       null|
|2024-05-21 01:24:...|      33|    데이터센터|    28059.0|     20397.0|          null|            null|       null|
|2024-05-21 01:23:...|      31|        대학교|   201781.0|    119836.0|         83.72|            null|       null|
|2024-05-21 01:24:...|      44|백화점및아울렛|   78116.83|    76458.32|          null|            null|       null|
|2024-05

In [10]:
# 그냥 가져오기
test_sdf = (
    spark.read.format("kafka")
    .option("kafka.bootstrap.servers", "my-cluster-kafka-bootstrap.kafka.svc:9092")
    .option("subscribe", "test2-jy")
    .option("kafka.group.id", "my_consumer_group")
    .load()
)  # 밀리초 단위 에포치 시간endingTimestamp
test_sdf = test_sdf.selectExpr("CAST(value AS STRING)", "CAST(timestamp AS STRING)")
test_sdf = test_sdf.withColumnRenamed("timestamp", "createTime")
test_sdf = test_sdf.withColumn("value", from_json(test_sdf["value"], schema2))


for field in schema2.fields:
    test_sdf = test_sdf.withColumn(field.name, test_sdf["value." + field.name])
test_sdf = test_sdf.drop("value")
# 이거쓰면 df가 repartition_num 수만큼 쪼개져서 병렬처리가능한 상태가 됨.
test_sdf = test_sdf.repartition(repartition_num)


test_sdf.printSchema()

root
 |-- createTime: string (nullable = true)
 |-- num_date_time: string (nullable = true)
 |-- 건물번호: string (nullable = true)
 |-- 일시: string (nullable = true)
 |-- 기온(C): string (nullable = true)
 |-- 강수량(mm): string (nullable = true)
 |-- 풍속(m/s): string (nullable = true)
 |-- 습도(%): string (nullable = true)



In [11]:
test_sdf = test_sdf.select(
    test_sdf["createTime"],
    test_sdf["num_date_time"],
    test_sdf["건물번호"],
    test_sdf["일시"],
    test_sdf["기온(C)"],
    test_sdf["강수량(mm)"],
    test_sdf["풍속(m/s)"],
    test_sdf["습도(%)"]
) \
.withColumn("createTime", test_sdf["createTime"].cast(StringType())) \
.withColumn("num_date_time", test_sdf["num_date_time"].cast(StringType())) \
.withColumn("건물번호", test_sdf["건물번호"].cast(IntegerType())) \
.withColumn("일시", test_sdf["일시"].cast(StringType())) \
.withColumn("기온(C)", test_sdf["기온(C)"].cast(DoubleType())) \
.withColumn("강수량(mm)", test_sdf["강수량(mm)"].cast(DoubleType())) \
.withColumn("풍속(m/s)", test_sdf["풍속(m/s)"].cast(DoubleType())) \
.withColumn("습도(%)", test_sdf["습도(%)"].cast(IntegerType()))

In [12]:
test_sdf.show()

[Stage 7:>                                                          (0 + 3) / 3]

+--------------------+--------------+--------+-----------+-------+----------+---------+-------+
|          createTime| num_date_time|건물번호|       일시|기온(C)|강수량(mm)|풍속(m/s)|습도(%)|
+--------------------+--------------+--------+-----------+-------+----------+---------+-------+
|2024-05-21 01:23:...|74_20220827 08|      74|20220827 08|   18.3|       0.0|      2.1|     77|
|2024-05-21 01:23:...|57_20220827 03|      57|20220827 03|   20.7|       2.0|      2.7|     91|
|2024-05-21 01:23:...|24_20220827 09|      24|20220827 09|   19.5|       0.0|      2.4|     63|
|2024-05-21 01:23:...|94_20220829 04|      94|20220829 04|   21.8|       0.0|      1.3|     66|
|2024-05-21 01:23:...| 5_20220829 15|       5|20220829 15|   22.2|       0.0|      1.8|     73|
|2024-05-21 01:23:...|35_20220831 04|      35|20220831 04|   19.8|       0.0|      1.2|     94|
|2024-05-21 01:23:...|67_20220827 04|      67|20220827 04|   22.6|       0.0|      3.2|     79|
|2024-05-21 01:23:...|63_20220830 20|      63|20220830 

                                                                                

In [13]:
# 그냥 가져오기
train_sdf = (
    spark.read.format("kafka")
    .option("kafka.bootstrap.servers", "my-cluster-kafka-bootstrap.kafka.svc:9092")
    .option("subscribe", "test-jy")
    .option("kafka.group.id", "my_consumer_group")
    .load()
)  # 밀리초 단위 에포치 시간endingTimestamp

train_sdf = train_sdf.selectExpr("CAST(value AS STRING)", "CAST(timestamp AS STRING)")
train_sdf = train_sdf.withColumnRenamed("timestamp", "createTime")
train_sdf = train_sdf.withColumn("value", from_json(train_sdf["value"], schema))


for field in schema.fields:
    train_sdf = train_sdf.withColumn(field.name, train_sdf["value." + field.name])
train_sdf = train_sdf.drop("value")
# 이거쓰면 df가 repartition_num 수만큼 쪼개져서 병렬처리가능한 상태가 됨.


train_sdf = train_sdf.repartition(repartition_num)

train_sdf.printSchema()


root
 |-- createTime: string (nullable = true)
 |-- num_date_time: string (nullable = true)
 |-- 건물번호: string (nullable = true)
 |-- 일시: string (nullable = true)
 |-- 기온(C): string (nullable = true)
 |-- 강수량(mm): string (nullable = true)
 |-- 풍속(m/s): string (nullable = true)
 |-- 습도(%): string (nullable = true)
 |-- 일조(hr): string (nullable = true)
 |-- 일사(MJ/m2): string (nullable = true)
 |-- 전력소비량(kWh): string (nullable = true)



In [14]:
train_sdf = train_sdf.select(
    train_sdf["createTime"],
    train_sdf["num_date_time"],
    train_sdf["건물번호"],
    train_sdf["일시"],
    train_sdf["기온(C)"],
    train_sdf["강수량(mm)"],
    train_sdf["풍속(m/s)"],
    train_sdf["습도(%)"],
    train_sdf["일조(hr)"],
    train_sdf["일사(MJ/m2)"],
    train_sdf["전력소비량(kWh)"],
) \
.withColumn("createTime", train_sdf["createTime"].cast(StringType())) \
.withColumn("num_date_time", train_sdf["num_date_time"].cast(StringType())) \
.withColumn("건물번호", train_sdf["건물번호"].cast(IntegerType())) \
.withColumn("일시", train_sdf["일시"].cast(StringType())) \
.withColumn("기온(C)", train_sdf["기온(C)"].cast(DoubleType())) \
.withColumn("강수량(mm)", train_sdf["강수량(mm)"].cast(DoubleType())) \
.withColumn("풍속(m/s)", train_sdf["풍속(m/s)"].cast(DoubleType())) \
.withColumn("습도(%)", train_sdf["습도(%)"].cast(IntegerType())) \
.withColumn("일조(hr)", train_sdf["일조(hr)"].cast(DoubleType())) \
.withColumn("일사(MJ/m2)", train_sdf["일사(MJ/m2)"].cast(DoubleType())) \
.withColumn("전력소비량(kWh)", train_sdf["전력소비량(kWh)"].cast(DoubleType()))

In [15]:
train_sdf.show()



+--------------------+--------------+--------+-----------+-------+----------+---------+-------+--------+-----------+---------------+
|          createTime| num_date_time|건물번호|       일시|기온(C)|강수량(mm)|풍속(m/s)|습도(%)|일조(hr)|일사(MJ/m2)|전력소비량(kWh)|
+--------------------+--------------+--------+-----------+-------+----------+---------+-------+--------+-----------+---------------+
|2024-05-21 01:17:...|15_20220606 04|      15|20220606 04|   16.9|       0.2|      0.6|    100|    null|       null|        1287.18|
|2024-05-21 01:18:...|85_20220605 13|      85|20220605 13|   24.8|      null|      3.7|     50|     0.0|       0.53|        2337.84|
|2024-05-21 01:17:...|37_20220617 00|      37|20220617 00|   18.7|      null|      1.3|     80|    null|       null|         1035.6|
|2024-05-21 01:18:...|81_20220808 19|      81|20220808 19|   25.6|      12.5|      2.5|     96|     0.0|       0.04|        2521.08|
|2024-05-21 01:17:...|22_20220607 14|      22|20220607 14|   22.5|      null|      2.5|     5

                                                                                

In [37]:
def train_test_split(df, th):
    df = df.na.drop(subset=['일시'])
    train = df.filter(col('일시').substr(1, 8).cast(IntegerType()) < th)
    test = df.filter(col('일시').substr(1, 8).cast(IntegerType()) >= th)
    return train, test

def preprocess_x(df):
    to_remove_columns = ['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)']
    df = df.fillna(0)   
    
    # 시계열 특성을 학습에 반영하기 위해 일시를 월, 일, 시간으로 나눕니다
    df = df.withColumn('month', substring('일시', 5, 2).cast(IntegerType()))
    df = df.withColumn('day', substring('일시', 7, 2).cast(IntegerType()))
    df = df.withColumn('time', substring('일시', 10, 2).cast(IntegerType()))
    
    
    
    df = df.join(building_sdf.select('건물번호', '건물유형', '연면적(m2)'), on='건물번호', how='left')
    df = df.dropDuplicates()
    
    
    # '건물유형'을 카테고리형 코드로 변환
    building_type_indexer = StringIndexer(inputCol='건물유형', outputCol='건물유형_index')
    df = building_type_indexer.fit(df).transform(df)
    df = df.drop('건물유형').withColumnRenamed('건물유형_index', '건물유형')
    
    # 불필요한 컬럼 삭제
    for c in to_remove_columns:
        if c in df.columns:
            df = df.drop(c)
            
    df.show(20, truncate=False)
    return df


date_th = 20220820

In [39]:

train_df, valid_df = train_test_split(train_sdf, date_th)


#print("train_df DataFrame show:", train_df.show())
#print("valid_df DataFrame show:", valid_df.show())

# 데이터 분할 후 각 데이터프레임의 크기를 확인합니다.
print("train_df shape(split 후):", train_df.count(), len(train_df.columns))
print("valid_df shape(split 후):", valid_df.count(), len(valid_df.columns))



                                                                                

train_df shape(split 후): 192000 11


[Stage 534:>                                                        (0 + 3) / 3]

valid_df shape(split 후): 12000 11


                                                                                

In [101]:

train_1 = preprocess_x(train_df)
#train_y = train_df.select("전력소비량(kWh)")

# 전처리 후 데이터프레임의 크기를 확인합니다.
print("train_1 shape:", train_1.count(), len(train_1.columns))
#print("Train Y shape:", train_y.count(), len(train_y.columns))

valid_1 = preprocess_x(valid_df)
#valid_y = valid_df.select("전력소비량(kWh)")

print("valid_1 shape:", valid_1.count(), len(valid_1.columns))
#print("Validation Y shape:", valid_y.count(), len(valid_y.columns))


                                                                                

+--------+-----------------------+-------+----------+---------+-------+---------------+-----+---+----+----------+--------+
|건물번호|createTime             |기온(C)|강수량(mm)|풍속(m/s)|습도(%)|전력소비량(kWh)|month|day|time|연면적(m2)|건물유형|
+--------+-----------------------+-------+----------+---------+-------+---------------+-----+---+----+----------+--------+
|23      |2024-05-21 01:17:22.13 |22.1   |0.0       |1.5      |88     |1054.8         |8    |19 |4   |32236.11  |1.0     |
|42      |2024-05-21 01:17:44.682|26.1   |0.0       |4.3      |74     |2929.32        |6    |25 |16  |97915.1   |3.0     |
|79      |2024-05-21 01:18:30.428|22.4   |0.0       |1.4      |96     |3010.56        |8    |18 |7   |212995.84 |8.0     |
|39      |2024-05-21 01:17:41.702|27.9   |0.0       |1.0      |80     |711.36         |8    |5  |23  |126835.0  |3.0     |
|64      |2024-05-21 01:18:11.493|24.6   |0.0       |1.5      |87     |1091.7         |7    |18 |8   |240551.0  |6.0     |
|24      |2024-05-21 01:17:22.423|19.8   

                                                                                

train_1 shape: 192000 12
+--------+-----------------------+-------+----------+---------+-------+---------------+-----+---+----+----------+--------+
|건물번호|createTime             |기온(C)|강수량(mm)|풍속(m/s)|습도(%)|전력소비량(kWh)|month|day|time|연면적(m2)|건물유형|
+--------+-----------------------+-------+----------+---------+-------+---------------+-----+---+----+----------+--------+
|59      |2024-05-21 01:18:06.081|24.8   |0.0       |1.3      |87     |3519.72        |8    |23 |10  |105500.72 |5.0     |
|97      |2024-05-21 01:18:51.815|27.9   |0.0       |2.9      |80     |1562.94        |8    |21 |19  |55144.67  |10.0    |
|86      |2024-05-21 01:18:38.789|26.0   |0.0       |1.6      |98     |649.08         |8    |20 |6   |54866.51  |9.0     |
|38      |2024-05-21 01:17:40.795|25.8   |0.0       |1.4      |83     |3213.0         |8    |23 |11  |58763.89  |3.0     |
|49      |2024-05-21 01:17:53.502|24.6   |0.0       |0.0      |99     |2731.68        |8    |21 |2   |112953.61 |4.0     |
|49      |2024-0

                                                                                

valid_1 shape: 12000 12


In [None]:
#spark.stop()

In [41]:
train_1.show()



+--------+--------------------+-------+----------+---------+-------+---------------+-----+---+----+----------+--------+
|건물번호|          createTime|기온(C)|강수량(mm)|풍속(m/s)|습도(%)|전력소비량(kWh)|month|day|time|연면적(m2)|건물유형|
+--------+--------------------+-------+----------+---------+-------+---------------+-----+---+----+----------+--------+
|       1|2024-05-21 01:16:...|   19.9|       0.0|      1.5|     89|        2010.72|    6| 15|  19|  110634.0|     0.0|
|      53|2024-05-21 01:17:...|   23.5|       0.0|      0.9|     78|         900.96|    6| 23|   3|  149246.0|     5.0|
|       5|2024-05-21 01:17:...|   23.2|       0.0|      1.0|     93|         1688.4|    7| 17|   1|  205884.0|     0.0|
|      35|2024-05-21 01:17:...|   27.9|       0.0|      0.8|     78|         2190.0|    7|  5|   9|    9736.0|    11.0|
|      13|2024-05-21 01:17:...|   23.3|       0.0|      5.3|    100|         2186.4|    6| 23|  10|    5578.4|     0.0|
|      99|2024-05-21 01:18:...|   22.5|       0.0|      2.3|     

                                                                                

In [102]:
valid_1.show()

[Stage 5498:>                                                       (0 + 3) / 3]

+--------+--------------------+-------+----------+---------+-------+---------------+-----+---+----+----------+--------+
|건물번호|          createTime|기온(C)|강수량(mm)|풍속(m/s)|습도(%)|전력소비량(kWh)|month|day|time|연면적(m2)|건물유형|
+--------+--------------------+-------+----------+---------+-------+---------------+-----+---+----+----------+--------+
|      49|2024-05-21 01:17:...|   26.3|       0.0|      1.0|     99|        3003.48|    8| 20|  20| 112953.61|     4.0|
|      53|2024-05-21 01:17:...|   27.7|       0.0|      2.2|     61|        3296.16|    8| 24|  12|  149246.0|     5.0|
|      56|2024-05-21 01:18:...|   22.5|       0.0|      2.7|     84|        4158.36|    8| 24|   6|  42666.54|     5.0|
|      34|2024-05-21 01:17:...|   26.3|       0.0|      3.2|     95|        3475.44|    8| 20|   2|   10665.0|    11.0|
|      95|2024-05-21 01:18:...|   19.9|       0.0|      0.1|     92|         928.08|    8| 24|  22| 196992.54|    10.0|
|      88|2024-05-21 01:18:...|   24.3|       0.0|      1.9|     

                                                                                

### 0. Functions For validation

In [42]:
def SMAPE(y, pred):
    smape = abs((y - pred))/((abs(y) + abs(pred)) / 2) * 100
    smape = np.mean(smape)
    return smape

def mae(y, pred):
    return np.mean(abs(y-pred))

In [43]:
def validate(valid_x, valid_y, model):
    pred = model.predict(valid_x)
    smape_score, mae_score = SMAPE(valid_y, pred), mae(valid_y, pred)
    return smape_score, mae_score

## 2. Multi Models by building num
- 건물번호(1~100)별 모델을 각각 만들어 성능을 측정하였습니다.
- 건물별로 building_info값은 같기 때문에 해당 데이터는 제외했습니다(건물번호, 건물유형, 연면적, 냉방면적).

In [110]:
from pyspark.ml.evaluation import RegressionEvaluator

def validate_multi(valid_1, models):
    """
    Args:
        models: dict, {1: model1, 2: model2, ..., 100: model100}
    """
    
    mse_eval = RegressionEvaluator(labelCol='전력소비량(kWh)', predictionCol='prediction', metricName='mse')
    rmse_eval = RegressionEvaluator(labelCol='전력소비량(kWh)', predictionCol='prediction', metricName='rmse')
    r2_eval = RegressionEvaluator(labelCol='전력소비량(kWh)', predictionCol='prediction', metricName='r2')

    predictions=[0 for _ in range(101)]
    for i in tqdm(range(1, 101)):
        aB = valid_1.filter(col('건물번호') == i)
        
        aB = aB.drop('건물번호', '건물유형', '연면적(m2)', '냉방면적(m2)', 'createTime')
        
        feature_cols = [c for c in aB.columns if c != '전력소비량(kWh)']
        assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
        aB = assembler.transform(aB).select("features", "전력소비량(kWh)")
        
        
        predictions[i] = models[i].transform(aB)
        print('mse:', mse_eval.evaluate(predictions[i]), 'rmse:', rmse_eval.evaluate(predictions[i]), 'r2:', r2_eval.evaluate(predictions[i]))
        


In [114]:
def train_multiple_models(train_1, n_estimators=100):
    models = {}
    
    for i in tqdm(range(1, 101)):
        aBuilding = train_1.filter(col('건물번호') == i)
        
        aBuilding = aBuilding.drop('건물번호', '건물유형', '연면적(m2)', '냉방면적(m2)', 'createTime')
        
        #feature 벡터화
        feature_cols = [c for c in aBuilding.columns if c != '전력소비량(kWh)']
        assembler = VectorAssembler(inputCols = feature_cols, outputCol = "features")
        aBuilding = assembler.transform(aBuilding).select("features", "전력소비량(kWh)")
        
        rf = RandomForestRegressor(featuresCol='features', labelCol='전력소비량(kWh)', numTrees=n_estimators)
        aBuilding = aBuilding.repartition(200)
        model = rf.fit(aBuilding)
        
        models[i] = model
        
        
    return models

In [115]:
models1 = train_multiple_models(train_1)

  9%|▉         | 9/100 [01:59<20:05, 13.25s/it]                                 

In [112]:
validate_multi(valid_1, models1)


  1%|          | 1/100 [00:21<36:14, 21.96s/it]                                 

mse: 161828.72497804186 rmse: 402.27941157613554 r2: 0.8377836417221233


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/conda/lib/python3.9/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/conda/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt
  1%|          | 1/100 [00:24<39:57, 24.22s/it]


KeyboardInterrupt: 

                                                                                