In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 

# Notebooks often show warnings (red boxes) that are not actual errors.
# Uncomment the two lines below to silence them.
# import warnings
# warnings.filterwarnings('ignore')

# The os package can change the current working directory, which makes read_csv calls more convenient.
import os
os.getcwd() # look up the current working directory (NOTE: the value is discarded here because this is not the cell's last expression)
# os.chdir(r"______") # replace ___ with the directory that holds the files to load

# This cell can be copied as-is into other notebooks.
import platform                

# Generally a good idea to set this.
# NOTE(review): presumably keeps minus signs renderable once a Korean font is selected below — TODO confirm.
from matplotlib import font_manager, rc
plt.rcParams['axes.unicode_minus'] = False

# Pick a plot style and a Korean-capable font depending on the operating system.
# Both branches use the 'seaborn-v0_8-*' style names: the un-prefixed legacy names
# (e.g. 'seaborn-darkgrid') were deprecated in matplotlib 3.6 and later removed, so
# mixing the two spellings made the macOS branch fail where the Windows branch worked.
# Other platforms (e.g. Linux) fall through and keep matplotlib's defaults.
if platform.system() == 'Darwin':  # macOS
    plt.style.use('seaborn-v0_8-darkgrid')
    rc('font', family='AppleGothic')

elif platform.system() == 'Windows':  # Windows
    path = 'c:/Windows/Fonts/malgun.ttf'
    # Resolve the font family name from the font file itself.
    font_name = font_manager.FontProperties(fname=path).get_name()
    plt.style.use('seaborn-v0_8-whitegrid') # https://python-graph-gallery.com/199-matplotlib-style-sheets/
    rc('font', family=font_name)
    
# tqdm
from tqdm.notebook import tqdm

In [2]:
# Directory holding the competition CSV files (note the trailing slash; file names are appended directly).
path = 'c:/reposit/data/dacon/2023 전력사용량 예측 AI 경진대회/'

# Load the train / test measurements and the per-building metadata.
train_df = pd.read_csv(f'{path}train.csv')
test_df = pd.read_csv(f'{path}test.csv')
building_df = pd.read_csv(f'{path}building_info.csv')

In [3]:
# Preview train_df with English column names.
# The renamed frame is only displayed, not assigned, so train_df itself keeps
# its original Korean column names.
train_df.rename(
    mapper={
        '건물번호': 'building_num',
        '일시': 'date_time',
        '기온(C)': 'temperature',
        '강수량(mm)': 'rainfall',
        '풍속(m/s)': 'windspeed',
        '습도(%)': 'humidity',
        '일조(hr)': 'sunshine',
        '일사(MJ/m2)': 'solar_radiation',
        '전력소비량(kWh)': 'power_consumption',
    },
    axis='columns',
)

Unnamed: 0,num_date_time,building_num,date_time,temperature,rainfall,windspeed,humidity,sunshine,solar_radiation,power_consumption
0,1_20220601 00,1,20220601 00,18.6,,0.9,42.0,,,1085.28
1,1_20220601 01,1,20220601 01,18.0,,1.1,45.0,,,1047.36
2,1_20220601 02,1,20220601 02,17.7,,1.5,45.0,,,974.88
3,1_20220601 03,1,20220601 03,16.7,,1.4,48.0,,,953.76
4,1_20220601 04,1,20220601 04,18.4,,2.8,43.0,,,986.40
...,...,...,...,...,...,...,...,...,...,...
203995,100_20220824 19,100,20220824 19,23.1,,0.9,86.0,0.5,,881.04
203996,100_20220824 20,100,20220824 20,22.4,,1.3,86.0,0.0,,798.96
203997,100_20220824 21,100,20220824 21,21.3,,1.0,92.0,,,825.12
203998,100_20220824 22,100,20220824 22,21.0,,0.3,94.0,,,640.08


In [10]:
# Count the buildings whose solar-capacity entry is anything other than the '-' placeholder.
len(building_df.loc[building_df['태양광용량(kW)'].ne('-')])

36

In [9]:
# Display the building metadata table (the rendered output is truncated with an ellipsis row).
building_df

Unnamed: 0,건물번호,건물유형,연면적(m2),냉방면적(m2),태양광용량(kW),ESS저장용량(kWh),PCS용량(kW)
0,1,건물기타,110634.00,39570.00,-,-,-
1,2,건물기타,122233.47,99000.00,-,-,-
2,3,건물기타,171243.00,113950.00,40,-,-
3,4,건물기타,74312.98,34419.62,60,-,-
4,5,건물기타,205884.00,150000.00,-,2557,1000
...,...,...,...,...,...,...,...
95,96,호텔및리조트,93314.00,60500.00,-,-,-
96,97,호텔및리조트,55144.67,25880.00,-,-,-
97,98,호텔및리조트,53578.62,17373.75,-,-,-
98,99,호텔및리조트,53499.00,40636.00,-,-,-


In [11]:
# List the distinct building-type labels present in the metadata.
building_df.loc[:, '건물유형'].unique()

array(['건물기타', '공공', '대학교', '데이터센터', '백화점및아울렛', '병원', '상용', '아파트', '연구소',
       '지식산업센터', '할인마트', '호텔및리조트'], dtype=object)

In [12]:
# Display the test set (the rendered output is truncated with an ellipsis row).
test_df

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%)
0,1_20220825 00,1,20220825 00,23.5,0.0,2.2,72
1,1_20220825 01,1,20220825 01,23.0,0.0,0.9,72
2,1_20220825 02,1,20220825 02,22.7,0.0,1.5,75
3,1_20220825 03,1,20220825 03,22.1,0.0,1.3,78
4,1_20220825 04,1,20220825 04,21.8,0.0,1.0,77
...,...,...,...,...,...,...,...
16795,100_20220831 19,100,20220831 19,22.5,0.0,0.9,84
16796,100_20220831 20,100,20220831 20,20.7,0.0,0.4,95
16797,100_20220831 21,100,20220831 21,20.2,0.0,0.4,98
16798,100_20220831 22,100,20220831 22,20.1,0.0,1.1,97


In [21]:
# Convert the 'YYYYMMDD HH' timestamp strings (e.g. '20220601 00') to datetime64 values.
# The format is spelled out so parsing does not rely on pandas' format inference
# for this compact, separator-free layout.
train_df['일시'] = pd.to_datetime(train_df['일시'], format='%Y%m%d %H')


0        2022-06-01 00:00:00
1        2022-06-01 01:00:00
2        2022-06-01 02:00:00
3        2022-06-01 03:00:00
4        2022-06-01 04:00:00
                 ...        
203995   2022-08-24 19:00:00
203996   2022-08-24 20:00:00
203997   2022-08-24 21:00:00
203998   2022-08-24 22:00:00
203999   2022-08-24 23:00:00
Name: 일시, Length: 204000, dtype: datetime64[ns]

In [23]:
# Render each timestamp as a 'YYYY-MM-DD' string (the hour is dropped); the result is only displayed.
train_df['일시'].dt.strftime(date_format='%Y-%m-%d')

0         2022-06-01
1         2022-06-01
2         2022-06-01
3         2022-06-01
4         2022-06-01
             ...    
203995    2022-08-24
203996    2022-08-24
203997    2022-08-24
203998    2022-08-24
203999    2022-08-24
Name: 일시, Length: 204000, dtype: object

In [24]:
# Render each timestamp as a compact 'YYYYMMDD' string (the hour is dropped); the result is only displayed.
train_df['일시'].dt.strftime(date_format='%Y%m%d')

0         20220601
1         20220601
2         20220601
3         20220601
4         20220601
            ...   
203995    20220824
203996    20220824
203997    20220824
203998    20220824
203999    20220824
Name: 일시, Length: 204000, dtype: object

In [14]:
# Inspect the timestamp column.
# NOTE(review): this cell's execution count (14) is lower than that of the to_datetime
# conversion cell (21), and the output recorded below is still the raw string form
# (dtype object). On a fresh top-to-bottom run it would presumably show datetime64
# values instead — consider moving this cell above the conversion.
train_df['일시']

0         20220601 00
1         20220601 01
2         20220601 02
3         20220601 03
4         20220601 04
             ...     
203995    20220824 19
203996    20220824 20
203997    20220824 21
203998    20220824 22
203999    20220824 23
Name: 일시, Length: 204000, dtype: object