In [37]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
import xgboost as xgb

In [38]:
# The date encoder stays the same, we changed nothing here
def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X.loc[:, "year"] = X["date"].dt.year
    X.loc[:, "month"] = X["date"].dt.month
    X.loc[:, "day"] = X["date"].dt.day
    X.loc[:, "weekday"] = X["date"].dt.weekday
    X.loc[:, "hour"] = X["date"].dt.hour
    # Finally we can drop the original columns from the dataframe
    return X.drop(columns=["date"])

In [39]:
def _merge_external_data(X):
    df_ext = pd.read_csv("external_data.csv", parse_dates=["date"])

    X = X.copy()
    # When using merge_asof left frame need to be sorted
    X["orig_index"] = np.arange(X.shape[0])
    X = pd.merge_asof(
        X.sort_values("date"), df_ext[["date", "ww", "u", "etat_sol"]].sort_values("date"), on="date"
    )
    # Sort back to the original order
    X = X.sort_values("orig_index")
    del X["orig_index"]
    return X

In [40]:
from workalendar.europe import France

# Curfew periods as (period start, period end, evening start hour, morning end hour).
# Both period bounds are inclusive timestamps. No curfew after 2021-06-30.
_CURFEW_PERIODS = (
    (pd.to_datetime("2020-10-17"), pd.to_datetime("2020-12-15"), 21, 6),
    (pd.to_datetime("2020-12-15"), pd.to_datetime("2021-01-16"), 20, 6),
    (pd.to_datetime("2021-01-16"), pd.to_datetime("2021-05-19"), 18, 6),
    (pd.to_datetime("2021-05-19"), pd.to_datetime("2021-06-09"), 21, 6),
    (pd.to_datetime("2021-06-09"), pd.to_datetime("2021-06-30"), 23, 6),
)


def _holiday_flags(dates, cal):
    """Return 0/1 flags marking timestamps whose calendar day is a holiday in ``cal``."""
    days = dates.dt.normalize()
    # Ask the calendar once per distinct day instead of once per row.
    holidays = [day for day in days.drop_duplicates().dropna() if cal.is_holiday(day)]
    return days.isin(holidays).astype(int)


def _curfew_flags(dates):
    """Return 0/1 flags marking timestamps that fall inside curfew hours."""
    hours = dates.dt.hour
    under_curfew = pd.Series(False, index=dates.index)
    for start, end, start_hour, end_hour in _CURFEW_PERIODS:
        in_period = (dates >= start) & (dates <= end)
        # The curfew wraps around midnight: evening start hour until morning end hour.
        at_night = (hours >= start_hour) | (hours < end_hour)
        under_curfew |= in_period & at_night
    return under_curfew.astype("int64")


def add_new_features(df):
    """Add calendar, curfew and cyclic time features derived from ``df["date"]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``date`` column convertible by ``pd.to_datetime``.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``date`` converted to datetime and these columns
        appended: ``is_weekend``, ``is_holiday`` (French calendar from
        ``workalendar``), ``is_curfew``, and sine/cosine encodings of the day
        of month, month and hour.
    """
    # Work on a copy so the transformer does not mutate the caller's frame.
    df = df.copy()
    df["date"] = pd.to_datetime(df["date"])
    dates = df["date"]

    # Saturday (5) and Sunday (6)
    df["is_weekend"] = dates.dt.weekday.isin([5, 6]).astype(int)
    df["is_holiday"] = _holiday_flags(dates, France())
    df["is_curfew"] = _curfew_flags(dates)

    # Cyclic encodings; the day of month is scaled by that month's own length.
    df["day_sin"] = np.sin(2 * np.pi * dates.dt.day / dates.dt.days_in_month)
    df["day_cos"] = np.cos(2 * np.pi * dates.dt.day / dates.dt.days_in_month)
    df["month_sin"] = np.sin(2 * np.pi * dates.dt.month / 12)
    df["month_cos"] = np.cos(2 * np.pi * dates.dt.month / 12)
    df["hour_sin"] = np.sin(2 * np.pi * dates.dt.hour / 24)
    df["hour_cos"] = np.cos(2 * np.pi * dates.dt.hour / 24)

    return df

In [41]:
def get_estimator():
    """Assemble the unfitted prediction pipeline.

    Steps, in order: ``add_new_features``, ``_merge_external_data``,
    ``_encode_dates``, a ``ColumnTransformer`` that one-hot encodes the date
    components and the categorical columns while passing the numerical
    columns through, and finally an ``XGBRegressor``.

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    calendar_cols = ["year", "month", "day", "weekday", "hour"]
    nominal_cols = [
        "counter_name",
        "site_name",
        "etat_sol",
        "ww",
        "is_weekend",
        "is_holiday",
        "is_curfew",
    ]
    continuous_cols = [
        "u",
        "day_sin",
        "day_cos",
        "month_sin",
        "month_cos",
        "hour_sin",
        "hour_cos",
    ]

    column_encoder = ColumnTransformer(
        [
            ("date", OneHotEncoder(handle_unknown="ignore"), calendar_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore"), nominal_cols),
            ("num", "passthrough", continuous_cols),
        ]
    )

    booster = xgb.XGBRegressor(
        objective="reg:squarederror",
        n_estimators=110,
        learning_rate=0.2,
        max_depth=8,
    )

    # Feature engineering must run before the encoders that consume its columns.
    feature_steps = [
        FunctionTransformer(step, validate=False)
        for step in (add_new_features, _merge_external_data)
    ]
    return make_pipeline(
        *feature_steps,
        FunctionTransformer(_encode_dates),
        column_encoder,
        booster,
    )

In [42]:
data_train = pd.read_parquet("train.parquet")
data_test = pd.read_parquet("test.parquet")

# Stack both labelled splits into a single frame for the final fit.
data = pd.concat((data_train, data_test))
data["date"] = pd.to_datetime(data["date"])

# Lockdown window (both bounds inclusive)
lockdown_start = pd.to_datetime("2020-10-30")
lockdown_end = pd.to_datetime("2020-12-15")

# Keep only the rows that fall outside the lockdown window.
in_lockdown = data["date"].between(lockdown_start, lockdown_end)
data = data[~in_lockdown]

# Features: everything except the two target columns; target: the log count.
X = data.drop(columns=["bike_count", "log_bike_count"])
y = data["log_bike_count"]

In [43]:
# Build a fresh, unfitted pipeline (feature steps, encoders and XGBoost regressor).
pipe = get_estimator()

In [44]:
# Fit every pipeline step on the lockdown-filtered features and log-count target.
pipe.fit(X, y)

In [45]:
# Load the frame the submission predictions are computed for.
X_final_test = pd.read_parquet("final_test.parquet")

In [46]:
# Predict with the fitted pipeline; the model was trained on log_bike_count.
y_pred = pipe.predict(X_final_test)

In [47]:
# Clamp negative predictions to zero, in place. A boolean-mask assignment
# replaces the element-by-element Python loop and leaves NaN values untouched.
y_pred[y_pred < 0] = 0

array([0.18651041, 1.4709659 , 2.1815455 , ..., 4.8654575 , 4.4252653 ,
       3.8077776 ], dtype=float32)

In [None]:
# Display the predictions for a quick visual check.
y_pred

In [49]:
# Submission file: one row per prediction, identified by its position.
results = pd.DataFrame(
    {
        "Id": np.arange(len(y_pred)),
        "log_bike_count": y_pred,
    }
)
results.to_csv("submission_XGB_curfew.csv", index=False)

In [570]:
# Change these two files to whichever submissions currently score best.

csv1 = pd.read_csv("submission_XGB_full_merged.csv")
csv2 = pd.read_csv("submission_XGB_summer_holidays.csv")

# Predicted target column of each submission
y1 = csv1["log_bike_count"]
y2 = csv2["log_bike_count"]

In [571]:
# Blend the two submissions with equal weights.
# NOTE(review): this rebinds y_pred to a pandas Series, replacing the model output above.
y_pred = 0.5*y1 + 0.5*y2

In [540]:
# Display the blended predictions.
y_pred

0        0.384366
1        1.484280
2        2.146217
3        0.878931
4        0.781125
           ...   
51435    4.553564
51436    4.764805
51437    5.114223
51438    4.612113
51439    3.827440
Name: log_bike_count, Length: 51440, dtype: float64