In [1]:
import pandas as pd
import pickle
import catboost as ctb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import json

In [2]:
# Read the raw training set; the path is relative to the notebook's working directory.
df_train = pd.read_csv(filepath_or_buffer='./data/df_train.csv')

In [3]:
# Feature engineering for the training frame.
# Produces: df_train (cleaned), y_train (target), X_train (features),
# scaler / X_scaled (standardised features) and predict_vector (model input).

# Impute missing Valencia_pressure values with the column's mode.
df_train['Valencia_pressure'] = df_train['Valencia_pressure'].fillna(
    df_train['Valencia_pressure'].mode()[0]
)

# These two columns are string-typed; keep only the first run of digits in
# each value and store it as int64. The raw-string pattern avoids the
# invalid-escape warning that '(\d+)' triggers, and expand=False yields a
# Series directly, so no follow-up pd.to_numeric pass is needed.
for encoded_col in ('Valencia_wind_deg', 'Seville_pressure'):
    df_train[encoded_col] = (
        df_train[encoded_col].str.extract(r'(\d+)', expand=False).astype('int64')
    )

# Parse the timestamp column once. pd.to_datetime is used instead of the
# unit-less .astype('datetime64'), which newer pandas versions reject.
timestamps = pd.to_datetime(df_train['time'])

# Calendar features. Only the ones that survive into the model input are
# derived; the week-of-year, day-of-year and hour-of-week columns used to be
# computed here and then dropped again straight away.
df_train['Year'] = timestamps.dt.year
df_train['Month_of_year'] = timestamps.dt.month
df_train['Day_of_month'] = timestamps.dt.day
df_train['Day_of_week'] = timestamps.dt.dayofweek
df_train['Hour_of_day'] = timestamps.dt.hour

# Remove the CSV's leftover index column and the raw timestamp.
df_train = df_train.drop(columns=['Unnamed: 0', 'time'])

# Split target and features. y_train is kept as a one-column DataFrame.
y_train = df_train[['load_shortfall_3h']]
X_train = df_train.drop(columns='load_shortfall_3h')

# Standardise every feature column.
# NOTE(review): only the model is pickled in the visible part of this
# notebook - confirm whether the serving side also needs this fitted scaler.
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,  # keep rows labelled the same way as y_train
)

# Columns, in the order the model is trained on.
predict_vector = X_scaled[['Madrid_wind_speed', 'Valencia_wind_deg', 'Bilbao_rain_1h',
       'Valencia_wind_speed', 'Seville_humidity', 'Madrid_humidity',
       'Bilbao_clouds_all', 'Bilbao_wind_speed', 'Seville_clouds_all',
       'Bilbao_wind_deg', 'Barcelona_wind_speed', 'Barcelona_wind_deg',
       'Madrid_clouds_all', 'Seville_wind_speed', 'Barcelona_rain_1h',
       'Seville_pressure', 'Seville_rain_1h', 'Bilbao_snow_3h',
       'Barcelona_pressure', 'Seville_rain_3h', 'Madrid_rain_1h',
       'Barcelona_rain_3h', 'Valencia_snow_3h', 'Madrid_weather_id',
       'Barcelona_weather_id', 'Bilbao_pressure', 'Seville_weather_id',
       'Valencia_pressure', 'Seville_temp_max', 'Madrid_pressure',
       'Valencia_temp_max', 'Valencia_temp', 'Bilbao_weather_id',
       'Seville_temp', 'Valencia_humidity', 'Valencia_temp_min',
       'Barcelona_temp_max', 'Madrid_temp_max', 'Barcelona_temp',
       'Bilbao_temp_min', 'Bilbao_temp', 'Barcelona_temp_min',
       'Bilbao_temp_max', 'Seville_temp_min', 'Madrid_temp', 'Madrid_temp_min',
       'Year', 'Month_of_year', 'Day_of_month', 'Day_of_week', 'Hour_of_day']]


In [13]:
# Train a CatBoost regressor, constructed with no explicit hyperparameters,
# on the selected scaled features against the load_shortfall_3h target.
model_CBR = ctb.CatBoostRegressor()
# Kept as the cell's final expression so the fitted estimator is echoed.
model_CBR.fit(X=predict_vector, y=y_train)

Learning rate set to 0.057692
0:	learn: 5111.1638732	total: 54ms	remaining: 53.9s
1:	learn: 5002.1564203	total: 90.3ms	remaining: 45.1s
2:	learn: 4924.3554974	total: 136ms	remaining: 45.3s
3:	learn: 4857.4934490	total: 174ms	remaining: 43.4s
4:	learn: 4767.9239154	total: 254ms	remaining: 50.6s
5:	learn: 4700.2430403	total: 300ms	remaining: 49.7s
6:	learn: 4639.2195803	total: 340ms	remaining: 48.2s
7:	learn: 4567.8484653	total: 378ms	remaining: 46.8s
8:	learn: 4501.0539436	total: 461ms	remaining: 50.7s
9:	learn: 4454.5381809	total: 513ms	remaining: 50.8s
10:	learn: 4406.6061424	total: 554ms	remaining: 49.8s
11:	learn: 4368.2016326	total: 620ms	remaining: 51s
12:	learn: 4327.2029532	total: 671ms	remaining: 50.9s
13:	learn: 4294.9719434	total: 717ms	remaining: 50.5s
14:	learn: 4258.5380849	total: 763ms	remaining: 50.1s
15:	learn: 4218.0378096	total: 803ms	remaining: 49.4s
16:	learn: 4183.9394885	total: 875ms	remaining: 50.6s
17:	learn: 4146.9860173	total: 919ms	remaining: 50.1s
18:	learn:

<catboost.core.CatBoostRegressor at 0x226ea7fb4c0>

In [1]:
# Pickle model for use within our API.
# Needs the imports cell (pickle) and the training cell (model_CBR) to have
# been run in the current kernel session; otherwise this raises NameError.
save_path = '../assets/trained-models/model_CBR2.pkl'
print (f"Training completed. Saving model to: {save_path}")
# The context manager guarantees the file is flushed and closed even if
# pickling fails part-way through.
with open(save_path, 'wb') as model_file:
    pickle.dump(model_CBR, model_file)

Training completed. Saving model to: ../assets/trained-models/model_CBR2.pkl


NameError: name 'pickle' is not defined