Import some necessary functions and libraries

In [1]:
from catboost_model import Catboost_Model
import plotly.express as px
import pandas as pd
from utils.submit_submission import submit_submission
import main as utils
# Route pandas' built-in `.plot` accessor through plotly so that pandas plots
# match the plotly express figures used in the rest of the notebook.
pd.set_option("plotting.backend", "plotly")

Define the targets we want to predict

In [2]:
# State columns of the dataset that the models are asked to predict.
targets = [
    "Available",
    "Charging",
    "Passive",
    "Other",
]

Define the features we can use to make our predictions

In [3]:
# Input columns for each level of granularity.
# Column ORDER matters: the model definitions further down flag the leading
# columns of each list as categorical by position (`cat_features` indices).

# Station level: identifiers and calendar slots first, then trend and coordinates.
station_features = [
    'Station', 'tod', 'dow', 'area',
    'trend', 'Latitude', 'Longitude',
]

# Area level: same inputs without the station identifier.
area_features = [
    'area', 'tod', 'dow',
    'trend', 'Latitude', 'Longitude',
]

# Global level: temporal inputs only.
global_features = ['tod', 'dow', 'trend']

Import the training data from the csv input file

In [4]:

# Load the raw training set through the project helper. It is kept under a
# `_raw` name so the unformatted import stays available for re-runs.
# NOTE(review): the file location and parsing live in `main.import_data` — not visible here.
train_station_raw = utils.import_data("train")

Format the training dataset columns to the expected types

In [8]:

# Derive the formatted station-level training frame from the raw import.
# The result is bound to a new name, so this cell can be re-run without
# re-reading the input file.
train_station = utils.format_data(train_station_raw)

Some information on the content of the dataset

In [6]:
# Print the column names, dtypes and memory footprint of the formatted training frame.
train_station.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1851083 entries, 0 to 1851082
Data columns (total 13 columns):
 #   Column     Dtype         
---  ------     -----         
 0   date       datetime64[ns]
 1   Station    object        
 2   Available  int64         
 3   Charging   int64         
 4   Passive    int64         
 5   Other      int64         
 6   tod        int64         
 7   dow        int64         
 8   trend      int64         
 9   Latitude   float64       
 10  Longitude  float64       
 11  Postcode   object        
 12  area       object        
dtypes: datetime64[ns](1), float64(2), int64(7), object(3)
memory usage: 183.6+ MB


Sum of each state per station per day

In [78]:
# Sum each target state over all rows that share the same `date` value.
# NOTE(review): the grouping key is the raw `date` column, not a daily
# `pd.Grouper`, so if `date` carries intra-day timestamps this is one point
# per timestamp rather than per day — confirm against the figure title.
aggregated_per_day = train_station.groupby('date', as_index=False)[targets].sum()

# `as_index=False` leaves a positional RangeIndex, so `date` must be passed
# explicitly as x; otherwise the axis titled "Date" would show row numbers.
fig = px.line(aggregated_per_day, x='date', y=targets)
fig.update_layout(
    title="Sum of each state per station per day",
    xaxis_title="Date",
    yaxis_title="Sum of values",
    legend_title="State",
)
fig.show()

Total number of values per station per day (lighter cells show missing values)

In [72]:
# Daily sum of every state for each station (one row per day/station pair).
aggregated_per_day_per_station = (
    train_station
    .groupby([pd.Grouper(key='date', freq='D'), 'Station'])
    [['Available', 'Charging', 'Passive', 'Other']]
    .sum()
    .reset_index()
)

# Row-wise total over the four states.
aggregated_per_day_per_station["Total"] = (
    aggregated_per_day_per_station[['Available', 'Charging', 'Passive', 'Other']].sum(axis=1)
)

# Station x day matrix of totals; day/station pairs with no rows become 0 so
# that they show up as the lightest cells of the heatmap.
aggregated_per_day_per_station_pivot = (
    aggregated_per_day_per_station
    .pivot(index='Station', columns='date', values='Total')
    .fillna(0)
)

fig = px.imshow(
    aggregated_per_day_per_station_pivot,
    x=aggregated_per_day_per_station_pivot.columns,
    y=aggregated_per_day_per_station_pivot.index,
    color_continuous_scale='algae',
    labels={"color": "Number of values"},
)
fig.update_layout(
    height=1000,
    title="Total number of values per station per day",
    xaxis_title="Date",
    yaxis_title="Station",
)
fig.show()

Total number of values per day (dips show missing values)

In [62]:
# Collapse the per-day/per-station table to one row per day.
# The first positional argument of GroupBy.sum is `numeric_only`, not a column
# name: the former `.sum('Total')` only worked because the string is truthy.
# Spelling it out keeps the same result (non-numeric `Station` is dropped,
# every numeric column is summed) without relying on that accident.
total_values_per_station = aggregated_per_day_per_station.groupby('date').sum(numeric_only=True)

# `date` is the index of the grouped frame, so plotly uses it as the x axis.
fig = px.line(total_values_per_station, y='Total')
fig.update_layout(
    title="Total number of values per day",
    xaxis_title="Date",
    yaxis_title="Total number of values",
)
fig.show()

Generate area and global datasets from the training data at station level

In [15]:
# Build the area-level and global-level training sets from the station-level
# frame; the helper returns them as a pair, in that order.
# NOTE(review): the aggregation rule lives in `main.generate_area_and_global` — not visible here.
train_area, train_global = utils.generate_area_and_global(train_station)

Extract the last 20% of the training datasets for validation

In [16]:
# Carve a validation set out of each granularity level. Per the notebook text,
# threshold=0.8 reserves the last 20% of the training data for validation.
# NOTE(review): only the validation parts are returned here; whether the
# training frames are trimmed in place is decided inside `split_dataset`.
validation_station, validation_area, validation_global = utils.split_dataset(
    train_station,
    train_area,
    train_global,
    threshold=0.8,
)

Define the catboost models for each level of granularity

In [17]:
# Settings shared by the three models: same targets, same learning rate.
shared_model_kwargs = dict(targets=targets, learning_rate=0.1)

# Station level. cat_features indexes into station_features:
# 0='Station', 1='tod', 2='dow', 3='area'.
model_station = Catboost_Model(
    train_data=train_station,
    test=validation_station,
    features=station_features,
    cat_features=[0, 1, 2, 3],
    level_col="Station",
    **shared_model_kwargs,
)

# Area level. cat_features indexes into area_features:
# 0='area', 1='tod', 2='dow'.
model_area = Catboost_Model(
    train_data=train_area,
    test=validation_area,
    features=area_features,
    cat_features=[0, 1, 2],
    level_col="area",
    **shared_model_kwargs,
)

# Global level. cat_features indexes into global_features:
# 0='tod', 1='dow'.
model_global = Catboost_Model(
    train_data=train_global,
    test=validation_global,
    features=global_features,
    cat_features=[0, 1],
    level_col="global",
    **shared_model_kwargs,
)

Train the models

In [18]:
# Fit the three models in turn: station, then area, then global.
# Each call prints a per-target progress log (learn/test metric per iteration),
# so this cell's output is long.
# NOTE(review): the `test:` column presumably refers to the validation set
# passed at construction — confirm in Catboost_Model.train.
model_station.train()
model_area.train()
model_global.train()

==== Target  Available  ====
Iteration  1 / 4
0:	learn: 1.5594891	test: 1.5395729	best: 1.5395729 (0)	total: 1.05s	remaining: 5m 14s
1:	learn: 1.4780878	test: 1.4604973	best: 1.4604973 (1)	total: 1.81s	remaining: 4m 30s
2:	learn: 1.4019968	test: 1.3867712	best: 1.3867712 (2)	total: 2.56s	remaining: 4m 13s
3:	learn: 1.3332857	test: 1.3194329	best: 1.3194329 (3)	total: 3.43s	remaining: 4m 13s
4:	learn: 1.2726458	test: 1.2605044	best: 1.2605044 (4)	total: 4.36s	remaining: 4m 17s
5:	learn: 1.2186611	test: 1.2078241	best: 1.2078241 (5)	total: 5.11s	remaining: 4m 10s
6:	learn: 1.1728420	test: 1.1631891	best: 1.1631891 (6)	total: 5.98s	remaining: 4m 10s
7:	learn: 1.1341947	test: 1.1253997	best: 1.1253997 (7)	total: 6.74s	remaining: 4m 5s
8:	learn: 1.0970483	test: 1.0909001	best: 1.0909001 (8)	total: 7.47s	remaining: 4m 1s
9:	learn: 1.0595085	test: 1.0544108	best: 1.0544108 (9)	total: 8.15s	remaining: 3m 56s
10:	learn: 1.0248953	test: 1.0201242	best: 1.0201242 (10)	total: 8.8s	remaining: 3m 51

Import the test dataset

In [19]:
# Load the raw test set with the same helper used for the training data.
test_station_raw = utils.import_data("test")

Format the test dataset columns to the expected types

In [20]:

# Apply the same formatting step as for the training data, so the test
# columns go through the same helper before prediction.
test_station = utils.format_data(test_station_raw)

Generate area and global datasets from the testing data at station level

In [21]:

# Build the area-level and global-level test sets from the station-level frame.
# NOTE(review): `is_test=True` presumably tells the helper that the target
# columns are absent from the test data — confirm in main.generate_area_and_global.
test_area, test_global = utils.generate_area_and_global(test_station, is_test=True)

Make predictions on the test datasets with the trained models

In [22]:

# Predict with each trained model on the test set of the matching granularity.
# Each call logs one "Target ... / Iteration k / 4" line pair per target.
prediction_station = model_station.predict(test_station)
prediction_area = model_area.predict(test_area)
prediction_global = model_global.predict(test_global)

==== Target  Available  ====
Iteration  1 / 4
==== Target  Charging  ====
Iteration  2 / 4
==== Target  Passive  ====
Iteration  3 / 4
==== Target  Other  ====
Iteration  4 / 4
==== Target  Available  ====
Iteration  1 / 4
==== Target  Charging  ====
Iteration  2 / 4
==== Target  Passive  ====
Iteration  3 / 4
==== Target  Other  ====
Iteration  4 / 4
==== Target  Available  ====
Iteration  1 / 4
==== Target  Charging  ====
Iteration  2 / 4
==== Target  Passive  ====
Iteration  3 / 4
==== Target  Other  ====
Iteration  4 / 4


Generate the csv files containing the predictions

In [23]:

# Hand the three prediction sets and the target names to the submission helper.
# Per the notebook text this writes the prediction csv files.
# NOTE(review): the call's return value is echoed as the cell output (a dict
# with at least a 'global' frame); end the line with `;` if that dump is not wanted.
submit_submission(prediction_station, prediction_area, prediction_global, targets)


{'global':       tod  dow  trend                date  Available  Charging  Passive  Other
 0       0    6  22203 2021-02-19 00:00:00      136.0      26.0     14.0   79.0
 1       1    6  22204 2021-02-19 00:15:00      136.0      26.0     14.0   82.0
 2       2    6  22205 2021-02-19 00:30:00      136.0      26.0     15.0   89.0
 3       3    6  22206 2021-02-19 00:45:00      136.0      27.0     15.0   85.0
 4       4    6  22207 2021-02-19 01:00:00      136.0      26.0     16.0   84.0
 ...   ...  ...    ...                 ...        ...       ...      ...    ...
 1819   91    4  24118 2021-03-10 22:45:00      138.0      24.0     12.0   97.0
 1820   92    4  24119 2021-03-10 23:00:00      138.0      24.0     12.0   98.0
 1821   93    4  24120 2021-03-10 23:15:00      138.0      23.0     12.0   97.0
 1822   94    4  24121 2021-03-10 23:30:00      138.0      24.0     12.0   95.0
 1823   95    4  24122 2021-03-10 23:45:00      138.0      24.0     12.0   99.0
 
 [1824 rows x 8 columns],
 '