In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

ModuleNotFoundError: No module named 'pandas'

**Подготовка датасета**

In [5]:
data = pd.read_csv("train.csv", index_col=0)
data['hour'] = pd.to_datetime(data['DateTime']).dt.hour
data['month'] = pd.to_datetime(data['DateTime']).dt.month
data['day_of_week'] = pd.to_datetime(data['DateTime']).dt.weekday
data['day_of_year'] = pd.to_datetime(data['DateTime']).dt.dayofyear

In [2]:
holidays_days = [1, 2, 96, 97, 100, 121, 129, 138, 139, 149, 202, 227, 305, 306, 358, 359, 360, 361, 362, 363, 364, 365]

Проверем, является ли этот день выходным или праздником(взято из википедии _[Public holidays in the European Union](https://en.wikipedia.org/wiki/Public_holidays_in_the_European_Union)_)

In [6]:
data['is_holiday'] = data['day_of_year'].apply(lambda x: 1 if x in holidays_days else 0)
data['is_weekend'] = data['day_of_week'].apply(lambda x: 1 if x > 5 else 0)

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 35118 entries, 37934 to 15795
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   DateTime               35118 non-null  object 
 1   Temperature            35118 non-null  float64
 2   Humidity               35118 non-null  float64
 3   Wind Speed             35118 non-null  float64
 4   general diffuse flows  35118 non-null  float64
 5   diffuse flows          35118 non-null  float64
 6   Energy city            35118 non-null  float64
 7   hour                   35118 non-null  int32  
 8   month                  35118 non-null  int32  
 9   day_of_week            35118 non-null  int32  
 10  day_of_year            35118 non-null  int32  
 11  is_holiday             35118 non-null  int64  
 12  is_weekend             35118 non-null  int64  
dtypes: float64(6), int32(4), int64(2), object(1)
memory usage: 3.2+ MB


In [None]:
data.head()

In [8]:
train_x, train_y = data.drop(["DateTime", "Energy city"], axis=1), data['Energy city']

Делим датасет на train и test выборку

In [9]:
x_train, x_test, y_train, y_test = train_test_split(train_x, train_y)

**Создаем функцию для оценивания модели**

In [12]:
def estimate_model(my_model):
    pred = my_model.predict(x_test)
    rmse = (np.sqrt(mean_squared_error(y_test, pred)))
    r2 = r2_score(y_test, pred)
    score = my_model.score(x_test, y_test)
    local_score = my_model.score(x_train, y_train)
    print("Testing performance")
    print("RMSE: {:.2f}".format(rmse))
    print("R2: {:.2f}".format(r2))
    print("Score: {:.4f}".format(score))
    print("Local Score: {:.4f}".format(local_score))

    print("Best params: ", my_model.get_params())

**Создаем модель 1 - _RandomForestRegressor_**

In [14]:
model1 = RandomForestRegressor(max_depth=28, min_samples_leaf=1, min_samples_split=2, n_estimators=500, n_jobs=-1)

In [11]:
model1.fit(x_train, y_train)

In [13]:
estimate_model(model1)

Testing performance
RMSE: 2195.91
R2: 0.98
Score: 0.9844
Local Score: 0.9978
Best params:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 28, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


**Добавляем GridSearch для модели**

In [21]:
param_gird = {
    "criterion": ["squared_error"],
    "max_depth": [30,  28],
    "min_samples_split": [2, 3],
    "min_samples_leaf": [1, 2],
    "n_estimators": [500],
    "n_jobs": [-1]
}

In [22]:
grid_search = GridSearchCV(model1, param_gird, cv=10)
grid_search.fit(x_train, y_train)
model1 = grid_search.best_estimator_

In [23]:
estimate_model(model1)

Testing performance
RMSE: 2196.79
R2: 0.98
Score: 0.9844
Local Score: 0.9978
Best params:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 30, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


**Сохранить модель в файл**

In [16]:
import pickle

In [38]:
with open("mymodel_2.pkl", "wb") as file:
    pickle.dump(model1, file)

**Создаем модель 2 - _GradientBoostingRegressor_**

In [27]:
model2 = GradientBoostingRegressor()

In [21]:
model2.fit(x_train, y_train)

**Делаем GridSearchCV**

In [46]:
param_grid2 = {
    "learning_rate": [0.6],
    "n_estimators": [200],
    "max_depth": [8, 5],
    "min_samples_split": [6, 7],
    "min_samples_leaf": [2,4],
}

In [47]:
grid_search = GridSearchCV(model2, param_grid2, cv=5)
grid_search.fit(x_train, y_train)
model2 = grid_search.best_estimator_

In [48]:
estimate_model(model2)

Testing performance
RMSE: 3143.33
R2: 0.97
Score: 0.9682
Local Score: 1.0000
Best params:  {'alpha': 0.9, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'init': None, 'learning_rate': 0.6, 'loss': 'squared_error', 'max_depth': 10, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 4, 'min_samples_split': 7, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 300, 'n_iter_no_change': None, 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}


**Создаем модель 3 - _KNeighborsRegressor_**

In [67]:
model3 = KNeighborsRegressor(n_neighbors=8)

In [68]:
model3.fit(x_train, y_train)

In [69]:
estimate_model(model3)

Testing performance
RMSE: 6806.06
R2: 0.85
Score: 0.8510
Local Score: 0.8890
Best params:  {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 8, 'p': 2, 'weights': 'uniform'}


**Добавим RandomSearchCv**

In [76]:
param_random={
    'leaf_size':list(np.arange(2, 90, step=8)) ,
    'n_neighbors':np.arange(5, 300, step=50),
    'algorithm':['ball_tree','kd_tree'],
    'weights':["uniform", "distance"],
    "n_jobs": [-1]
}


In [77]:
random_search = RandomizedSearchCV(model3, param_random, n_iter=500, cv=5)
model_random = random_search.fit(x_train, y_train)



In [78]:
random_search.best_estimator_

In [81]:
estimate_model(model_random)

Testing performance
RMSE: 6381.30
R2: 0.87
Score: 0.8690
Local Score: 1.0000
Best params:  {'cv': 5, 'error_score': nan, 'estimator__algorithm': 'auto', 'estimator__leaf_size': 30, 'estimator__metric': 'minkowski', 'estimator__metric_params': None, 'estimator__n_jobs': None, 'estimator__n_neighbors': 8, 'estimator__p': 2, 'estimator__weights': 'uniform', 'estimator': KNeighborsRegressor(n_neighbors=8), 'n_iter': 500, 'n_jobs': None, 'param_distributions': {'leaf_size': [2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82], 'n_neighbors': array([  5,  55, 105, 155, 205, 255]), 'algorithm': ['ball_tree', 'kd_tree'], 'weights': ['uniform', 'distance'], 'n_jobs': [-1]}, 'pre_dispatch': '2*n_jobs', 'random_state': None, 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 0}


**Создаем модель 4 - _SVR_**

In [61]:
model4 = SVR(kernel='linear', degree=5, epsilon=0.2)

In [62]:
model4.fit(x_train, y_train)

In [63]:
estimate_model(model4)

Testing performance
RMSE: 10942.85
R2: 0.61
Score: 0.6148
Local Score: 0.6211
Best params:  {'C': 1.0, 'cache_size': 200, 'coef0': 0.0, 'degree': 5, 'epsilon': 0.2, 'gamma': 'scale', 'kernel': 'linear', 'max_iter': -1, 'shrinking': True, 'tol': 0.001, 'verbose': False}


**Сохраняем результат**

In [24]:
test_df = pd.read_csv("test.csv", index_col=0)
test_df['hour'] = pd.to_datetime(test_df['DateTime']).dt.hour
test_df['month'] = pd.to_datetime(test_df['DateTime']).dt.month
test_df['day_of_week'] = pd.to_datetime(test_df['DateTime']).dt.weekday
test_df['day_of_year'] = pd.to_datetime(test_df['DateTime']).dt.dayofyear
test_df['is_holiday'] = test_df['day_of_year'].apply(lambda x: 1 if x in holidays_days else 0)
test_df['is_weekend'] = test_df['day_of_week'].apply(lambda x: 1 if x > 5 else 0)

In [25]:
test_df

Unnamed: 0,DateTime,Temperature,Humidity,Wind Speed,general diffuse flows,diffuse flows,Energy city,hour,month,day_of_week,day_of_year,is_holiday,is_weekend
15506,04/17/2021/16:20,25.5028,29.73,0.08755,615.500,664.800,0,16,4,5,107,0,0
51943,12/26/2021/17:10,18.4576,54.44,0.08755,19.120,19.340,0,17,12,6,360,1,1
212,01/01/2021/11:20,17.5924,57.23,0.07828,396.600,40.660,0,11,1,4,1,1,0
11210,03/18/2021/20:20,14.8526,72.30,0.08343,0.062,0.148,0,20,3,3,77,0,0
16307,04/23/2021/05:50,17.0774,80.60,0.08240,0.018,0.204,0,5,4,4,113,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
45184,11/09/2021/18:40,22.0935,74.30,0.07622,0.095,0.067,0,18,11,1,313,0,0
50376,12/15/2021/20:00,16.9435,79.60,0.08755,0.066,0.130,0,20,12,2,349,0,0
15647,04/18/2021/15:50,22.5673,55.86,0.07725,716.000,123.600,0,15,4,6,108,0,1
11674,03/22/2021/01:40,14.9659,63.85,0.08137,0.066,0.137,0,1,3,0,81,0,0


In [27]:
prediction = model1.predict(test_df.drop(["DateTime", "Energy city"], axis=1))
test_df['Energy city'] = prediction

In [28]:
test_df.sort_index().drop(["hour", "month", "day_of_week", "day_of_year", "is_holiday", "is_weekend"], axis=1).to_csv("result_random_forest_newdata5.csv", index=True)

**Пробуем объединить результаты 2 моделей**

In [13]:
res1 = pd.read_csv("result_random_forest_3.csv", index_col=0)
res2 = pd.read_csv("result_random_forest_4.csv", index_col=0)

In [14]:
res1['Energy city'] = (res1['Energy city'] + res2['Energy city']) / 2

In [99]:
# res1['Energy city'] = res1['Energy city'] * 1.03 DO NOT DO LIKE THIS

In [15]:
res1.to_csv("result_unity_3.csv")