In [132]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
df = pd.read_csv("train.csv")

In [3]:
df.columns

Index(['Unnamed: 0', 'DateTime', 'Temperature', 'Humidity', 'Wind Speed',
       'general diffuse flows', 'diffuse flows', 'Energy city'],
      dtype='object')

In [4]:
df['hour'] = pd.to_datetime(df['DateTime']).dt.hour
df['month'] = pd.to_datetime(df['DateTime']).dt.month

In [5]:
train_x, train_y = df.drop(["Unnamed: 0", "DateTime", "Energy city"], axis=1), df['Energy city']

In [6]:
x_train, x_test, y_train, y_test = train_test_split(train_x, train_y)

In [7]:
y_train

2446     100471.213947
23385     56325.176265
32921     87314.221258
9761      46363.581338
6027      75744.272520
             ...      
4913      68847.146999
29896     77279.314439
18667     66338.370488
30209     65771.891706
25416     74708.194078
Name: Energy city, Length: 26338, dtype: float64

In [8]:
model1 = LinearRegression()

In [42]:
model1.fit(x_train, y_train)

In [43]:
pred = model1.predict(x_test)
rmse = (np.sqrt(mean_squared_error(y_test, pred)))
r2 = r2_score(y_test, pred)
score = model1.score(x_test, y_test)
local_score = model1.score(x_train, y_train)
print("Testing performance")
print("RMSE: {:.2f}".format(rmse))
print("R2: {:.2f}".format(r2))
print("Score: {:.4f}".format(score))
print("Local Score: {:.4f}".format(local_score))

print("Best params: ", model1.get_params())

Testing performance
RMSE: 10752.70
R2: 0.63
Score: 0.6268
Local Score: 0.6273
Best params:  {'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False}


In [131]:
model2 = DecisionTreeRegressor()

In [104]:
param_grid = {
    'criterion': ['squared_error', 'poisson'],
    'max_depth': [None, 20],
    'min_samples_split': [ 5, 6 ],
    'min_samples_leaf': [5, 6, 7]
}

In [105]:
grid_search = GridSearchCV(model2, param_grid, cv=5)
grid_search.fit(x_train, y_train)
model2 = grid_search.best_estimator_

In [129]:
model2.fit(x_train, y_train)

In [130]:
pred = model2.predict(x_test)
rmse = (np.sqrt(mean_squared_error(y_test, pred)))
r2 = r2_score(y_test, pred)
score = model2.score(x_test, y_test)
local_score = model2.score(x_train, y_train)
print("Testing performance")
print("RMSE: {:.2f}".format(rmse))
print("R2: {:.2f}".format(r2))
print("Score: {:.4f}".format(score))
print("Local Score: {:.4f}".format(local_score))

print("Best params: ", model2.get_params())

Testing performance
RMSE: 4320.05
R2: 0.94
Score: 0.9409
Local Score: 0.9980
Best params:  {'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 20, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}


In [118]:
model3 = RandomForestRegressor()

In [112]:
param_grid = {
    'criterion': ['squared_error', 'poisson'],
    'max_depth': [None, 20],
    'min_samples_split': [ 5, 6 ],
    'min_samples_leaf': [5, 6, 7]
}

In [113]:
grid_search = GridSearchCV(model3, param_grid, cv=5)
grid_search.fit(x_train, y_train)
model3 = grid_search.best_estimator_

In [119]:
model3.fit(x_train, y_train)

In [120]:
pred = model3.predict(x_test)
rmse = (np.sqrt(mean_squared_error(y_test, pred)))
r2 = r2_score(y_test, pred)
score = model3.score(x_test, y_test)
local_score = model3.score(x_train, y_train)
print("Testing performance")
print("RMSE: {:.2f}".format(rmse))
print("R2: {:.2f}".format(r2))
print("Score: {:.4f}".format(score))
print("Local Score: {:.4f}".format(local_score))

print("Best params: ", model3.get_params())

Testing performance
RMSE: 3227.82
R2: 0.97
Score: 0.9670
Local Score: 0.9957
Best params:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [None]:
model4 = KNeighborsRegressor()

### Save results

In [27]:
test_df = pd.read_csv("test.csv", index_col=0)

In [28]:
test_df['hour'] = pd.to_datetime(test_df['DateTime']).dt.hour
test_df['month'] = pd.to_datetime(test_df['DateTime']).dt.month

Unnamed: 0,DateTime,Temperature,Humidity,Wind Speed,general diffuse flows,diffuse flows,Energy city,hour,month
15506,04/17/2021/16:20,25.5028,29.73,0.08755,615.500,664.800,0,16,4
51943,12/26/2021/17:10,18.4576,54.44,0.08755,19.120,19.340,0,17,12
212,01/01/2021/11:20,17.5924,57.23,0.07828,396.600,40.660,0,11,1
11210,03/18/2021/20:20,14.8526,72.30,0.08343,0.062,0.148,0,20,3
16307,04/23/2021/05:50,17.0774,80.60,0.08240,0.018,0.204,0,5,4
...,...,...,...,...,...,...,...,...,...
45184,11/09/2021/18:40,22.0935,74.30,0.07622,0.095,0.067,0,18,11
50376,12/15/2021/20:00,16.9435,79.60,0.08755,0.066,0.130,0,20,12
15647,04/18/2021/15:50,22.5673,55.86,0.07725,716.000,123.600,0,15,4
11674,03/22/2021/01:40,14.9659,63.85,0.08137,0.066,0.137,0,1,3


In [121]:
y_pred = model3.predict(test_df.drop(["DateTime", "Energy city"], axis=1))

In [122]:
test_df['Energy city'] = y_pred

In [76]:
test_df.drop(["hour", "month"], axis=1).sort_index().head()

Unnamed: 0,DateTime,Temperature,Humidity,Wind Speed,general diffuse flows,diffuse flows,Energy city
0,12/31/2020/00:00,8.81577,73.8,0.08549,0.051,0.119,57606.787215
1,12/31/2020/00:10,8.66642,74.5,0.08549,0.07,0.085,57606.787215
4,12/31/2020/00:40,8.15863,75.7,0.08343,0.048,0.085,57606.787215
6,12/31/2020/01:00,7.87023,77.7,0.0824,0.048,0.096,57240.460406
7,12/31/2020/01:10,7.72088,78.2,0.08755,0.055,0.093,57240.460406


In [123]:
test_df.sort_index().drop(["hour", "month"], axis=1).to_csv("result21.csv", index=True)

In [85]:
test_df.sort_index()

In [78]:
test_df.columns()