In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

#import sys
#sys.path.append('..')
#from utils import get_train_data, create_preprocessor, submit_test, train_test_split_temporal, get_feature_lists

#X, y = get_train_data()
# Name of the column the models are trained to predict.
_target_column_name = "log_bike_count"

# Load the raw training table from disk.
data = pd.read_parquet("../data/train.parquet")

# Make sure `date` is a datetime column (day-first when text has to be parsed).
data["date"] = pd.to_datetime(data["date"], dayfirst=True)

# Features are everything except the target and the raw `bike_count` column;
# the target itself is kept as a plain array.
X = data.drop(columns=[_target_column_name, "bike_count"])
y = data[_target_column_name].values

In [2]:
def train_test_split_temporal(X, y, delta_threshold="30 days"):
    """
    Split the data into training and validation sets based on a temporal cutoff.

    The cutoff is the latest value of `X["date"]` minus `delta_threshold`.
    Rows dated at or before the cutoff go to the training set, later rows go
    to the validation set.

    Args:
        X (pd.DataFrame): Features with a `date` column.
        y (array-like): Target values, aligned with the rows of `X` by position.
        delta_threshold (str): Time delta (anything `pd.Timedelta` accepts)
            defining the length of the validation window.
    Returns:
        Tuple: X_train, X_valid, y_train, y_valid
    Raises:
        ValueError: If `X` and `y` do not have the same number of rows.
    """
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same length, got {len(X)} and {len(y)}"
        )

    cutoff_date = X["date"].max() - pd.Timedelta(delta_threshold)
    # A plain boolean array splits `y` by position; an index-carrying boolean
    # Series would be re-aligned on labels when `y` is a pandas object.
    is_train = (X["date"] <= cutoff_date).to_numpy()

    X_train, X_valid = X.loc[is_train], X.loc[~is_train]
    if hasattr(y, "iloc"):
        # pandas Series / DataFrame target
        y_train, y_valid = y.iloc[is_train], y.iloc[~is_train]
    else:
        y = np.asarray(y)
        y_train, y_valid = y[is_train], y[~is_train]
    return X_train, X_valid, y_train, y_valid

In [3]:
from jours_feries_france import JoursFeries
import os

def add_arrondissement(df):
    """
    Return a copy of `df` with an `arrondissement` column looked up from `site_name`.

    Site names that are not listed in the lookup table get a missing value.
    The input frame is not modified.
    """
    site_to_district = {
        "28 boulevard Diderot": 12,
        "39 quai François Mauriac": 13,
        "18 quai de l'Hôtel de Ville": 4,
        "Voie Georges Pompidou": 4,
        "67 boulevard Voltaire SE-NO": 11,
        "Face au 48 quai de la marne": 19,
        "Face 104 rue d'Aubervilliers": 19,
        "Face au 70 quai de Bercy": 12,
        "6 rue Julia Bartet": 16,
        "Face au 25 quai de l'Oise": 19,
        "152 boulevard du Montparnasse": 14,
        "Totem 64 Rue de Rivoli": 1,
        "Pont des Invalides S-N": 7,
        "Pont de la Concorde S-N": 7,
        "Pont des Invalides N-S": 7,
        "Face au 8 avenue de la porte de Charenton": 12,
        "Face au 4 avenue de la porte de Bagnolet": 20,
        "Pont Charles De Gaulle": 13,
        "36 quai de Grenelle": 15,
        "Face au 40 quai D'Issy": 15,
        "Pont de Bercy": 12,
        "38 rue Turbigo": 3,
        "Quai d'Orsay": 7,
        "27 quai de la Tournelle": 5,
        "Totem 85 quai d'Austerlitz": 13,
        "Totem Cours la Reine": 8,
        "Totem 73 boulevard de Sébastopol": 1,
        "90 Rue De Sèvres": 7,
        "20 Avenue de Clichy": 17,
        "254 rue de Vaugirard": 15,
    }
    # Look each site name up in the table and attach the result to a fresh copy.
    enriched = df.copy()
    districts = enriched["site_name"].map(site_to_district)
    enriched["arrondissement"] = districts
    return enriched


# Look up the French bank holidays for the two years 2020 and 2021
holidays_2020 = JoursFeries.for_year(2020)
holidays_2021 = JoursFeries.for_year(2021)

# Keep only the dates stored as the values of each yearly dictionary
dates_2020 = [*holidays_2020.values()]
dates_2021 = [*holidays_2021.values()]

# One single-column table holding every holiday date as a pandas datetime
all_dates = [*dates_2020, *dates_2021]
bank_holidays_df = pd.DataFrame({"date": all_dates})
bank_holidays_df["date"] = pd.to_datetime(bank_holidays_df["date"])

def is_holidays(df):
    """
    Return a copy of `df` with an `is_bank_holiday` column (1 or 0).

    A row is flagged with 1 when the calendar day of its `date` value appears
    in the module-level `bank_holidays_df["date"]` column, otherwise 0.
    The input frame is left untouched, like the other feature helpers in this
    notebook.

    Args:
        df (pd.DataFrame): Frame with a datetime `date` column.
    Returns:
        pd.DataFrame: Copy of `df` with the extra integer column.
    """
    df = df.copy()  # do not mutate the caller's frame
    holiday_days = bank_holidays_df["date"].dt.date
    df["is_bank_holiday"] = df["date"].dt.date.isin(holiday_days).astype(int)

    return df

def assign_time_interval(hour):
    """
    Map an hour of the day to one of four named sections of the day.

    [5, 9) is 'morning', [9, 15) is 'working_hours', [15, 20) is 'peak_hours',
    and anything else is 'calm'.
    """
    # (first hour included, first hour excluded, label)
    sections = (
        (5, 9, 'morning'),
        (9, 15, 'working_hours'),
        (15, 20, 'peak_hours'),
    )
    for start, stop, label in sections:
        if start <= hour < stop:
            return label
    return 'calm'

def _encode_dates(X):
    """
    Derive calendar features from the `date` column of `X`.

    Added columns: `year`, `month`, `day`, `hour`, `is_weekend`, `season`,
    sine/cosine encodings of the hour and of the weekday, and one-hot columns
    for the time-of-day section (`time_*`) and the weekday (`day_0` .. `day_6`).
    The `date` column itself is kept.

    The one-hot columns are built from fixed category lists, so the returned
    frame has the same columns whichever hours or weekdays occur in `X`. This
    keeps training and test frames compatible.

    Args:
        X (pd.DataFrame): Frame with a datetime `date` column.
    Returns:
        pd.DataFrame: Copy of `X` with the extra feature columns.
    """
    X = X.copy()  # modify a copy of X

    # Basic calendar components of the `date` column
    date = X["date"]
    X["year"] = date.dt.year
    X["month"] = date.dt.month
    X["day"] = date.dt.day
    X["weekday"] = date.dt.weekday
    X["hour"] = date.dt.hour
    X["is_weekend"] = (X["weekday"] >= 5).astype(int)

    X["season"] = X["month"] % 12 // 3  # Winter=0, Spring=1, Summer=2, Fall=3

    # Cyclical encoding
    X["hour_sin"] = np.sin(2 * np.pi * X["hour"] / 24)
    X["hour_cos"] = np.cos(2 * np.pi * X["hour"] / 24)
    X["day_sin"] = np.sin(2 * np.pi * X["weekday"] / 7)
    X["day_cos"] = np.cos(2 * np.pi * X["weekday"] / 7)

    # Explicit categories make get_dummies emit a column for every possible
    # value, not only for those present in this particular frame. The labels
    # are listed alphabetically, which is the column order get_dummies
    # produces for plain string columns.
    time_labels = ["calm", "morning", "peak_hours", "working_hours"]
    X["time_interval"] = pd.Categorical(
        X["hour"].map(assign_time_interval), categories=time_labels
    )
    X["weekday"] = pd.Categorical(X["weekday"], categories=list(range(7)))

    # One-hot encoding of the time-of-day section
    X = pd.get_dummies(X, columns=["time_interval"], prefix="time")

    # One-hot encoding of the weekday. The season is deliberately left as a
    # single integer column: the original author found one-hot encoding it
    # hurt the model.
    X = pd.get_dummies(X, columns=["weekday"], prefix=["day"])

    return X


In [4]:
# Run the feature helpers one after the other, then discard the identifier and
# descriptive columns that are not passed on to the models.
X = (
    X.pipe(_encode_dates)
    .pipe(is_holidays)
    .pipe(add_arrondissement)
    .drop(
        columns=[
            "coordinates",
            "counter_name",
            "site_name",
            "counter_installation_date",
            "counter_technical_id",
            "counter_id",
        ]
    )
)


In [5]:

# Hold out the most recent period for validation (default window of the helper).
X_train, X_valid, y_train, y_valid = train_test_split_temporal(X, y)

# `date` was needed for the split above; remove it from both feature frames.
X_train, X_valid = (part.drop(columns=["date"]) for part in (X_train, X_valid))

## XGBoost Gridsearch, CV on GPU

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error

# Define the model with GPU support
model = XGBRegressor(tree_method='hist', device='cuda', random_state=42, verbosity=1)

# Define the parameter grid (2 x 2 x 2 x 2 = 16 candidates)
param_grid = {
    'n_estimators': [100, 300],
    'learning_rate': [0.01, 0.1], 
    'max_depth': [3, 7],
    'subsample': [0.8, 1.0]
}

# Perform GridSearch with verbose progress
# NOTE(review): cv=4 is a plain K-fold split, while the hold-out below is
# temporal. If X_train is ordered by date, TimeSeriesSplit would match the
# final evaluation more closely — confirm the row order first.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=4,  # 4-fold cross-validation
    scoring='neg_mean_squared_error',  # Optimize MSE
    verbose=2,  # Verbose for GridSearchCV
    # Every fit runs on the same CUDA device, so the fits are run one at a
    # time. n_jobs=-1 would start one GPU training process per CPU core, all
    # competing for the same device and its memory.
    n_jobs=1
)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Best parameters and model (refit on the whole of X_train by GridSearchCV)
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate on the held-out validation window (X_valid / y_valid)
y_pred = best_model.predict(X_valid)
test_mse = mean_squared_error(y_valid, y_pred)

# Output results
print("Best Parameters:", best_params)
print("Test MSE:", test_mse)


Fitting 4 folds for each of 16 candidates, totalling 64 fits



    E.g. tree_method = "hist", device = "cuda"



Best Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300, 'subsample': 1.0}
Test MSE: 0.3465580611026357



    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Best Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300, 'subsample': 1.0}
Test MSE: 0.3465580611026357

In [20]:
# RMSE on the validation window, taken from the MSE computed by the previous
# evaluation cell instead of a value copied by hand from its printed output.
np.sqrt(test_mse)

0.5886934686235273

In [19]:
# Submission
file_name = "../submissions/" + "XGBRegressor_gridsearch_optimum" + "_submission.csv"

# Load the final test table and run it through the same feature steps as the
# training data, dropping the same columns plus `date`.
X_test = (
    pd.read_parquet("../data/final_test.parquet")
    .pipe(_encode_dates)
    .pipe(is_holidays)
    .pipe(add_arrondissement)
    .drop(
        columns=[
            "coordinates",
            "counter_name",
            "site_name",
            "counter_installation_date",
            "counter_technical_id",
            "counter_id",
            "date",
        ]
    )
)

y_predict = best_model.predict(X_test)

# One row per prediction, identified by its position in the test table.
results = pd.DataFrame(
    {
        "Id": np.arange(len(y_predict)),
        "log_bike_count": y_predict,
    }
)
results.to_csv(file_name, index=False)

## LightGBM with GridSearch, CV on GPU

In [None]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import pandas as pd

# Define the model with GPU support
model = LGBMRegressor(boosting_type='gbdt', device='gpu', random_state=42)

# Define the parameter grid (2 x 2 x 2 = 8 candidates)
param_grid = {
    'n_estimators': [100, 300],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 7]
}

# Perform GridSearchCV with verbose progress
# NOTE(review): cv=4 is a plain K-fold split, while the hold-out below is
# temporal. If X_train is ordered by date, TimeSeriesSplit would match the
# final evaluation more closely — confirm the row order first.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=4,  # 4-fold cross-validation
    scoring='neg_mean_squared_error',  # Optimize MSE
    verbose=2,  # Verbose for GridSearchCV
    # Every fit runs on the same GPU, so the fits are run one at a time.
    # n_jobs=-1 would start one GPU training process per CPU core, all
    # competing for the same device and its memory.
    n_jobs=1
)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Best parameters and model (refit on the whole of X_train by GridSearchCV)
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate on the held-out validation window (X_valid / y_valid)
y_pred = best_model.predict(X_valid)
test_mse = mean_squared_error(y_valid, y_pred)

# Output results
print("Best Parameters:", best_params)
print("Test MSE:", test_mse)


Fitting 4 folds for each of 8 candidates, totalling 32 fits


In [None]:
# Submission
# (a stray character after the closing quote made this line a SyntaxError)
file_name = "../submissions/" + "lightgbm_gridsearch_optimum" + "_submission.csv"

# Load the final test table and apply the same feature steps as for training
X_test = pd.read_parquet("../data/final_test.parquet")

X_test = _encode_dates(X_test)
X_test = is_holidays(X_test)
X_test = add_arrondissement(X_test)

# Drop the same columns as for the training features, plus the raw `date`
X_test = X_test.drop(columns=["coordinates", "counter_name", "site_name",
                              "counter_installation_date","counter_technical_id",
                              "counter_id", "date"])


# Predict with the estimator selected by the most recent grid search
y_predict = best_model.predict(X_test)

# One row per prediction, identified by its position in the test table
results = pd.DataFrame(
dict(
    Id=np.arange(y_predict.shape[0]),
    log_bike_count=y_predict,
    )
)
results.to_csv(file_name, index=False)

## Deep Learning

In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

# Define the features to scale
scale_features = ['latitude', 'longitude', 'year', 'hour_sin', 'hour_cos', 'day_sin', 'day_cos']

# Copy the data to avoid altering the original
X_train_scaled = X_train.copy()
X_valid_scaled = X_valid.copy()

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training features only, then transform them
X_train_scaled[scale_features] = scaler.fit_transform(X_train[scale_features])

# Apply the statistics learned on X_train to the same features of X_valid
X_valid_scaled[scale_features] = scaler.transform(X_valid[scale_features])

# The frame mixes integer, float and one-hot columns (bool with recent pandas).
# Such a mix converts to an object array, which Keras cannot turn into a
# tensor, so every column is cast to one float dtype first.
# NOTE(review): columns that may contain missing values (e.g. an unmapped
# `arrondissement`) would propagate NaN into the loss — confirm there are none.
X_train_scaled = X_train_scaled.astype("float32")
X_valid_scaled = X_valid_scaled.astype("float32")

# Define the deep learning model
model = Sequential([
    Dense(128, activation='relu', input_dim=X_train_scaled.shape[1]),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1)  # Output layer for regression
])

model.compile(optimizer='adam', loss='mse', metrics=['mse'])

# Train the model. With shuffle=False the rows keep their order, and
# validation_split uses the last 20% of them for monitoring.
history = model.fit(X_train_scaled, y_train, validation_split=0.2, epochs=50, batch_size=512, verbose=1, shuffle=False)

# Evaluate on the held-out validation window (X_valid / y_valid)
y_pred = model.predict(X_valid_scaled).flatten()
test_mse = mean_squared_error(y_valid, y_pred)
test_rmse = np.sqrt(test_mse)

print("Test RMSE:", test_rmse)


ModuleNotFoundError: No module named 'tensorflow'

In [8]:
import tensorflow as tf

# Ask TensorFlow which GPU devices it can see and report how many there are.
visible_gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(visible_gpus))


ModuleNotFoundError: No module named 'tensorflow'