In [9]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Render DataFrames without truncation: None removes the limit on the
# number of columns and on the number of rows shown.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [3]:
# Load the i3e heating-meter dataset.
# The CSV location can be overridden through the I3E_DATA_PATH environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original local path is used, so existing behavior is unchanged.
I3E_DATA_PATH = os.environ.get(
    'I3E_DATA_PATH',
    '/Users/han/Desktop/Dissertation/Heating-System-Analysis/i3e_data.csv',
)
i3e_data = pd.read_csv(I3E_DATA_PATH)
# Preview the first five rows.
i3e_data.head()

Unnamed: 0,Date,"М1, t","М2, t","ΔМ, t","Т1, °C","Т2, °C","ΔТ, °C","Q, Gcal",USPD,YYYYMM,registrated,scheme,type,area,floors,walls material,year of construction,area of building,"temp,˚C"
0,2013/12/1,93.95,84.17,9.78,65.86,43.56,22.3,2.1,1,201312,Heating + Hot water,opened,1105,2707.4,5,panel,1983,4401.0,-0.157917
1,2013/12/2,91.28,81.69,9.59,65.18,42.87,22.31,2.04,1,201312,Heating + Hot water,opened,1105,2707.4,5,panel,1983,4401.0,-1.259167
2,2013/12/3,91.76,84.87,6.9,65.7,43.17,22.53,2.07,1,201312,Heating + Hot water,opened,1105,2707.4,5,panel,1983,4401.0,-4.739167
3,2013/12/4,88.79,79.92,8.87,65.26,42.23,23.03,2.05,1,201312,Heating + Hot water,opened,1105,2707.4,5,panel,1983,4401.0,-1.21625
4,2013/12/5,89.37,81.5,7.87,65.47,42.22,23.25,2.08,1,201312,Heating + Hot water,opened,1105,2707.4,5,panel,1983,4401.0,-0.53375


* (A) Date - date in Windows format.
* (B) M1, t - mass of the input water (heat carrier) per day.
* (C) М2, t - mass of the output water. If the residential building has an open heating system (hot water is drawn from the heating system), (C) is less than (B).
* (D) ΔМ, t - difference in mass, (B)-(C). For buildings with an open heating system this is the data for analysis. In a closed system it is a technological parameter that allows the equipment to be monitored.
* (E) Т1, °C - average temperature of the heat carrier at the input of the heating system. It is independent of the building's characteristics.
* (F) Т2, °C - average temperature of the heat carrier at the output. It depends on both (E) and the heat consumption of the building.
* (G) ΔТ, °C - temperature difference, (E)-(F).
* (H) Q, Gcal - amount of heat consumed, in Gcal. It is calculated approximately by the formula (H)=(B)*[(E)-(F)]/1000 (one tonne of water cooled by 1 °C gives up about 0.001 Gcal).
* (I) USPD - ID of the heating meter. Some residential buildings have many heating meters.
* (J) YYYYMM - date in the format year-month YYYYMM.
* (K) registrated - what is registered by the meter: heating only, or heating plus hot water.
* (L) scheme - type of the heating system (opened or closed).
* (M) type - system-load code (4 digits). The first digit is 1 for an opened system and 2 for a closed system. The second digit is 0 for heating only and 1 for heating plus hot water supply. The third and fourth digits are the number of floors (01, 02, 03, ..., 17).  M = L + K + O
* (N) area - area of the building that is served by the heating meter.
* (O) floors - amount of building floors. 定序
* (P) walls material - walls material.
* (Q) year of construction - year of building construction.
* (R) area of building - total area of building.
* (S) temp,˚C - outdoor temperature.

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns of the data as loaded, before any rows are removed.
i3e_data.describe()

Unnamed: 0,"М1, t","М2, t","ΔМ, t","Т1, °C","Т2, °C","ΔТ, °C","Q, Gcal",USPD,YYYYMM,type,area,floors,year of construction,area of building,"temp,˚C"
count,700036.0,700036.0,700036.0,700036.0,700036.0,700036.0,700036.0,700036.0,700036.0,700036.0,700036.0,700036.0,700036.0,700036.0,700036.0
mean,140.595253,136.269521,4.325733,74.912498,50.379915,24.532585,3.280713,1117.610656,201515.847402,1652.286488,3438.819815,6.517842,1978.607146,7116.133263,-5.944104
std,116.962725,115.807347,10.418085,10.712302,6.081111,7.725019,2.549757,794.009423,114.373003,474.922161,2307.957039,2.331796,12.961497,6380.413599,10.097453
min,0.26,0.01,-26.4,40.01,6.38,2.0,0.01,1.0,201312.0,1101.0,121.8,1.0,1882.0,121.8,-33.521342
25%,74.02,70.52,-0.16,66.52,46.3,19.31,1.75,467.0,201405.0,1105.0,2128.8,5.0,1970.0,3400.0,-12.586472
50%,112.55,107.2,2.21,73.02,49.88,23.92,2.65,987.0,201504.0,2005.0,2896.92,5.0,1979.0,4767.0,-5.182468
75%,162.3,157.75,7.19,82.07,54.12,29.49,3.93,1678.0,201609.0,2105.0,3949.9,9.0,1987.0,8307.85,1.193997
max,5723.5,1856.56,5173.2,116.99,90.99,62.71,32.98,3162.0,201709.0,2117.0,18801.1,17.0,2015.0,32030.92,20.010833


### 对类别变量进行编码 category encoders
分类变量包括 K，L，O，P

In [5]:
# Bind each column of i3e_data to its own module-level Series.
# NOTE(review): these Series are taken from i3e_data as it exists when this
# cell runs. The outlier-removal cell further down rebinds i3e_data to a
# filtered frame, so the names below keep referring to the unfiltered data
# unless this cell is re-run afterwards.

# Heat-carrier measurements: masses, temperatures, consumed heat.
M1, M2, delta_M = i3e_data["М1, t"], i3e_data["М2, t"], i3e_data["ΔМ, t"]
T1, T2, delta_T = i3e_data["Т1, °C"], i3e_data["Т2, °C"], i3e_data["ΔТ, °C"]
Gcal = i3e_data["Q, Gcal"]

# Building attributes, outdoor temperature and the system-load code.
area, area_of_building = i3e_data["area"], i3e_data["area of building"]
year_of_construction = i3e_data["year of construction"]
temp = i3e_data["temp,˚C"]
# NOTE(review): this name shadows the builtin type() for the rest of the notebook.
type = i3e_data["type"]

# Columns the notebook treats as categorical (K, L, O, P in the data dictionary).
registrated, scheme = i3e_data["registrated"], i3e_data["scheme"]
floors, walls_material = i3e_data["floors"], i3e_data["walls material"]


**没有缺失值，接下来看看有没有异常值**

In [6]:
# Upper bound on the daily input-water mass; readings above it are inspected
# as candidate outliers.
Threshold_M1 = 4000
# Describe only the M1 readings that exceed the bound.
M1.loc[M1.gt(Threshold_M1)].describe()

count       1.0
mean     5723.5
std         NaN
min      5723.5
25%      5723.5
50%      5723.5
75%      5723.5
max      5723.5
Name: М1, t, dtype: float64

In [7]:
# Remove every row whose input-water mass exceeds the threshold, then
# re-summarise the remaining data.
Threshold_M1 = 4000
# Index labels of the rows to discard.
outlier_index = i3e_data.index[i3e_data["М1, t"] > Threshold_M1]
# drop() returns a new frame; i3e_data is rebound to the filtered result.
i3e_data = i3e_data.drop(index=outlier_index)
i3e_data.describe()

Unnamed: 0,"М1, t","М2, t","ΔМ, t","Т1, °C","Т2, °C","ΔТ, °C","Q, Gcal",USPD,YYYYMM,type,area,floors,year of construction,area of building,"temp,˚C"
count,700035.0,700035.0,700035.0,700035.0,700035.0,700035.0,700035.0,700035.0,700035.0,700035.0,700035.0,700035.0,700035.0,700035.0,700035.0
mean,140.587278,136.26893,4.31835,74.912492,50.37991,24.532584,3.280698,1117.609808,201515.847422,1652.285836,3438.807182,6.517838,1978.607145,7116.121383,-5.944104
std,116.772316,115.806372,8.388738,10.712308,6.081113,7.725025,2.549725,794.009673,114.373084,474.922187,2307.934484,2.331795,12.961507,6380.410414,10.097461
min,0.26,0.01,-26.4,40.01,6.38,2.0,0.01,1.0,201312.0,1101.0,121.8,1.0,1882.0,121.8,-33.521342
25%,74.02,70.52,-0.16,66.52,46.3,19.31,1.75,467.0,201405.0,1105.0,2128.8,5.0,1970.0,3400.0,-12.586472
50%,112.55,107.2,2.21,73.02,49.88,23.92,2.65,987.0,201504.0,2005.0,2896.92,5.0,1979.0,4767.0,-5.182468
75%,162.3,157.75,7.19,82.07,54.12,29.49,3.93,1678.0,201609.0,2105.0,3949.9,9.0,1987.0,8307.85,1.193997
max,1875.7,1856.56,651.69,116.99,90.99,62.71,32.98,3162.0,201709.0,2117.0,18801.1,17.0,2015.0,32030.92,20.010833
