## First model

In [72]:
import os
import re
import ast
import typing
import requests
import icalendar
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib as mpl
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from datetime import date, datetime
from icalendar import Calendar, Event, vCalAddress, vText

import xgboost as xgb
from xgboost import plot_importance

from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder

%matplotlib inline

# Fix NumPy's global random seed so NumPy-based randomness is repeatable across runs
np.random.seed(31415)

# Wide, short default figure size; never truncate columns when displaying DataFrames
sns.set(rc={'figure.figsize':(15,3)})
pd.set_option('display.max_columns', None)

# NOTE(review): this silences every warning for the rest of the notebook, so
# deprecation or numerical warnings raised by later cells will not be shown
import warnings
warnings.filterwarnings('ignore')

# Inject CSS that sets `.container` elements to 90% width (widens the notebook page)
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [73]:
# Load the cleaned Bicing dataset used for training and validation.
# NOTE(review): the preview below shows ctx-4 = 35559710.0 for the first row, while
# every other ctx value in that preview lies between 0 and 1 -- check the cleaning
# step for out-of-range values before modelling.
df = pd.read_csv('bicing_data_cleaned_nur.csv')
df.head()

Unnamed: 0,station_id,year,month,day,hour,percentage_docks_available,ctx-1,ctx-2,ctx-3,ctx-4
0,1.0,2020.0,1.0,1.0,1.0,0.394444,0.459259,0.492593,0.511111,35559710.0
1,1.0,2020.0,1.0,1.0,6.0,0.233333,0.298148,0.364815,0.283333,0.3462963
2,1.0,2020.0,1.0,1.0,11.0,0.337037,0.342593,0.262963,0.248148,0.2351852
3,1.0,2020.0,1.0,1.0,16.0,0.211111,0.244444,0.335185,0.381481,0.3851852
4,1.0,2020.0,1.0,1.0,21.0,0.492593,0.418519,0.209259,0.07963,0.1777778


In [74]:
# Sanity check: (rows, columns) of the loaded training data
df.shape

(1081584, 10)

In [75]:
# Distinct years present in the training data
df.year.unique()

array([2020., 2021., 2022., 2023.])

In [76]:
# Distinct months present in the training data
df.month.unique()

array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.])

In [77]:
# Number of distinct stations in the training data
df.station_id.nunique()

399

In [78]:
# Load the submission metadata: the rows for which predictions must be produced
test = pd.read_csv('metadata_sample_submission_2024.csv')
test.head()

Unnamed: 0,index,station_id,month,day,hour,ctx-4,ctx-3,ctx-2,ctx-1
0,0,1,1,1,5,0.781481,0.677778,0.696296,0.75
1,1,1,1,1,10,0.737374,0.711111,0.711111,0.731624
2,2,1,1,1,15,0.827778,0.896296,0.901852,0.883333
3,3,1,1,1,20,0.825926,0.874074,0.927778,0.918519
4,4,2,1,1,3,0.592593,0.341954,0.275862,0.54023


Let's reorder the columns to match the layout of the test CSV file.

In [79]:
# Desired layout: station and time fields first, then ctx-4 down to ctx-1
# (the order used in the test file), with the target column last
new_column_order = [
    'station_id', 'year', 'month', 'day', 'hour',
    'ctx-4', 'ctx-3', 'ctx-2', 'ctx-1',
    'percentage_docks_available',
]

# Select every row, with the columns in that order
df = df.loc[:, new_column_order]

# Preview the reordered DataFrame
df.head()

Unnamed: 0,station_id,year,month,day,hour,ctx-4,ctx-3,ctx-2,ctx-1,percentage_docks_available
0,1.0,2020.0,1.0,1.0,1.0,35559710.0,0.511111,0.492593,0.459259,0.394444
1,1.0,2020.0,1.0,1.0,6.0,0.3462963,0.283333,0.364815,0.298148,0.233333
2,1.0,2020.0,1.0,1.0,11.0,0.2351852,0.248148,0.262963,0.342593,0.337037
3,1.0,2020.0,1.0,1.0,16.0,0.3851852,0.381481,0.335185,0.244444,0.211111
4,1.0,2020.0,1.0,1.0,21.0,0.1777778,0.07963,0.209259,0.418519,0.492593


### Train, Validation and Test

Now let's split our DataFrame (`df`) into train and validation datasets. The test DataFrame is the one provided on Kaggle.

In [80]:
# FUNCTION TO SPLIT TRAIN AND VALIDATION CHRONOLOGICALLY (80/20 BY DEFAULT)
def train_validation_split(df, train_frac=0.8):
    """Split ``df`` into chronological train and validation DataFrames.

    Rows are ordered by (year, month, day, hour, station_id). The earliest
    ``train_frac`` share of rows becomes the training set and the remaining,
    later rows become the validation set, so validation never uses rows that
    precede the training data in time.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'year', 'month', 'day', 'hour' and
        'station_id'. The input frame is not modified.
    train_frac : float, default 0.8
        Fraction of rows assigned to the training set; must satisfy
        0 < train_frac < 1. The cut position is ``int(len(df) * train_frac)``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, validation)``. Each frame is sorted by station and then by
        time, has a fresh 0..n-1 index, and carries an extra 'index' column
        holding the row's position in the time-ordered frame.

    Raises
    ------
    ValueError
        If ``train_frac`` is not strictly between 0 and 1.
    KeyError
        If any of the sort columns is missing from ``df``.
    """
    if not 0 < train_frac < 1:
        raise ValueError(f"train_frac must be strictly between 0 and 1, got {train_frac!r}")

    time_keys = ['year', 'month', 'day', 'hour']

    # Order by time first -- THIS STEP IS VERY IMPORTANT: the positional cut below
    # is only a past/future split if the rows are in chronological order
    df_to_split = df.sort_values(by=time_keys + ['station_id']).reset_index(drop=True)

    # Position that divides the time-ordered frame into train / validation
    train_size = int(len(df_to_split) * train_frac)

    # Record each row's chronological position before the per-split re-sorting
    df_to_split['index'] = df_to_split.index

    # Cut, then reorder each part by station_id first
    station_first = ['station_id'] + time_keys
    train = df_to_split.iloc[:train_size].sort_values(by=station_first).reset_index(drop=True)
    validation = df_to_split.iloc[train_size:].sort_values(by=station_first).reset_index(drop=True)

    return train, validation

In [81]:
# Chronological split: the earliest rows train the model, the latest rows validate it
train, validation = train_validation_split(df)

In [82]:
# Preview the training split; `index` is each row's position in the time-ordered frame
train.head()

Unnamed: 0,station_id,year,month,day,hour,ctx-4,ctx-3,ctx-2,ctx-1,percentage_docks_available,index
0,1.0,2020.0,1.0,1.0,1.0,35559710.0,0.511111,0.492593,0.459259,0.394444,0
1,1.0,2020.0,1.0,1.0,6.0,0.3462963,0.283333,0.364815,0.298148,0.233333,394
2,1.0,2020.0,1.0,1.0,11.0,0.2351852,0.248148,0.262963,0.342593,0.337037,788
3,1.0,2020.0,1.0,1.0,16.0,0.3851852,0.381481,0.335185,0.244444,0.211111,1182
4,1.0,2020.0,1.0,1.0,21.0,0.1777778,0.07963,0.209259,0.418519,0.492593,1576


In [83]:
# Training split size; one column more than `df` because of the added `index` column
train.shape

(865267, 11)

In [84]:
# Preview the validation split (the rows after the training cut in time order)
validation.head()

Unnamed: 0,station_id,year,month,day,hour,ctx-4,ctx-3,ctx-2,ctx-1,percentage_docks_available,index
0,1.0,2022.0,10.0,2.0,2.0,0.927778,0.883333,0.788889,0.709259,0.735185,865543
1,1.0,2022.0,10.0,2.0,7.0,0.75,0.764815,0.777778,0.803704,0.844444,865941
2,1.0,2022.0,10.0,2.0,12.0,0.901852,0.933333,0.972222,0.961111,0.972222,866339
3,1.0,2022.0,10.0,2.0,17.0,1.012963,0.998148,0.97963,0.935185,0.931481,866737
4,1.0,2022.0,10.0,2.0,22.0,0.987037,0.924074,0.825926,0.738889,0.67963,867135


In [85]:
# Validation split size: the rows not assigned to `train`
validation.shape

(216317, 11)

### Linear Regression Model

In [86]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Baseline model: ordinary least-squares regression on the raw columns.
# `train` and `validation` are the frames returned by train_validation_split(df).

# Select features and target
features = ['station_id', 'year', 'month', 'day', 'hour', 'ctx-4', 'ctx-3', 'ctx-2', 'ctx-1']
target = 'percentage_docks_available'

X_train = train[features]
y_train = train[target]

X_val = validation[features]
y_val = validation[target]

# Initialize the Linear Regression model
model = LinearRegression()

# Train the model on the training split only
model.fit(X_train, y_train)

# Predict using the validation set
y_pred = model.predict(X_val)

# Evaluate the model on rows it has not seen during fitting
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

# Show the first few predictions next to the actual values
comparison = pd.DataFrame({'Actual': y_val, 'Predicted': y_pred})
print(comparison.head())

Mean Squared Error: 0.012318925801597042
R^2 Score: 0.8207323380543078
     Actual  Predicted
0  0.735185   0.683625
1  0.844444   0.785306
2  0.972222   0.916607
3  0.931481   0.887725
4  0.679630   0.704584


### First Kaggle submission

In [87]:
# Adding the column year: the test file has no `year` column, but `year` is one of
# the model's input features. 2024 is taken from the file name
# ('metadata_sample_submission_2024.csv') -- presumably all rows are from 2024.
test['year'] = 2024.0

In [88]:
# Column layout for the test frame: `index` first, then the same
# station / time / ctx ordering used for the training frame
new_column_order = [
    'index',
    'station_id', 'year', 'month', 'day', 'hour',
    'ctx-4', 'ctx-3', 'ctx-2', 'ctx-1',
]

# Select every row, with the columns in that order
test = test.loc[:, new_column_order]

# Preview the reordered DataFrame
test.head()

Unnamed: 0,index,station_id,year,month,day,hour,ctx-4,ctx-3,ctx-2,ctx-1
0,0,1,2024.0,1,1,5,0.781481,0.677778,0.696296,0.75
1,1,1,2024.0,1,1,10,0.737374,0.711111,0.711111,0.731624
2,2,1,2024.0,1,1,15,0.827778,0.896296,0.901852,0.883333
3,3,1,2024.0,1,1,20,0.825926,0.874074,0.927778,0.918519
4,4,2,2024.0,1,1,3,0.592593,0.341954,0.275862,0.54023


In [89]:
# Define the features (same columns, in the same order, as the list used to fit the model):
features = ['station_id', 'year', 'month', 'day', 'hour', 'ctx-4', 'ctx-3', 'ctx-2', 'ctx-1']

In [90]:
# Keep only the model's input columns; this drops the `index` column from `test`.
# NOTE(review): the submission cell later rebuilds `index` from row position, which
# assumes the file's `index` column is simply 0..n-1 in row order -- confirm.
test = test[features]
test.head()

Unnamed: 0,station_id,year,month,day,hour,ctx-4,ctx-3,ctx-2,ctx-1
0,1,2024.0,1,1,5,0.781481,0.677778,0.696296,0.75
1,1,2024.0,1,1,10,0.737374,0.711111,0.711111,0.731624
2,1,2024.0,1,1,15,0.827778,0.896296,0.901852,0.883333
3,1,2024.0,1,1,20,0.825926,0.874074,0.927778,0.918519
4,2,2024.0,1,1,3,0.592593,0.341954,0.275862,0.54023


In [91]:
# Convert all variables into floats, matching how the training columns are displayed
# (e.g. station_id 1.0, year 2020.0):
test = test.astype(float)

In [92]:
# Confirm the cast: every column should now display as a float
test.head()

Unnamed: 0,station_id,year,month,day,hour,ctx-4,ctx-3,ctx-2,ctx-1
0,1.0,2024.0,1.0,1.0,5.0,0.781481,0.677778,0.696296,0.75
1,1.0,2024.0,1.0,1.0,10.0,0.737374,0.711111,0.711111,0.731624
2,1.0,2024.0,1.0,1.0,15.0,0.827778,0.896296,0.901852,0.883333
3,1.0,2024.0,1.0,1.0,20.0,0.825926,0.874074,0.927778,0.918519
4,2.0,2024.0,1.0,1.0,3.0,0.592593,0.341954,0.275862,0.54023


In [93]:
# Predict using the test set; `test` holds exactly the `features` columns, in the
# same order as the frame the model was fitted on
y_pred_test = model.predict(test)

In [97]:
# Build the submission table: the row position becomes the `index` column and
# the prediction becomes `percentage_docks_available`
df_output = (
    pd.Series(y_pred_test, name='percentage_docks_available')
    .rename_axis('index')
    .reset_index()
)

# Write the file to upload, without the DataFrame's own index
df_output.to_csv('Submission.csv', index=False)

In [98]:
# Preview the submission: one `index` / prediction pair per row
df_output.head()

Unnamed: 0,index,percentage_docks_available
0,0,0.739611
1,1,0.71569
2,2,0.842407
3,3,0.872703
4,4,0.601065


In [99]:
# Submission size: one row per prediction, two columns
df_output.shape

(171902, 2)