In [68]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy.stats import randint, uniform
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression


In [69]:
train_df = pd.read_csv(r"regression-dataset-train.csv")
test_df = pd.read_csv(r"regression-dataset-test-unlabeled.csv")

In [70]:
train_df

Unnamed: 0,id,date,season_id,year,month,is_holiday,weekday,is_workingday,weather_condition,temperature,feels_like_temp,humidity,wind_speed,total_users
0,577,31-07-2019,3,1,7,0,2,1,1,29.246653,33.14480,70.4167,11.083475,7216
1,427,03-03-2019,1,1,3,0,6,0,2,16.980847,20.67460,62.1250,10.792293,4066
2,729,30-12-2019,1,1,12,0,0,0,1,10.489153,11.58500,48.3333,23.500518,1796
3,483,28-04-2019,2,1,4,0,6,0,2,15.443347,18.87520,48.9583,8.708325,4220
4,112,22-04-2018,2,0,4,0,5,1,2,13.803347,16.09770,72.9583,14.707907,1683
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,579,02-08-2019,3,1,8,0,4,1,1,30.852500,35.35440,65.9583,8.666718,7261
506,54,23-02-2018,1,0,2,0,3,1,1,9.091299,12.28585,42.3043,6.305571,1917
507,351,17-12-2018,4,0,12,0,6,0,2,10.591653,12.46855,56.0833,16.292189,2739
508,80,21-03-2018,2,0,3,0,1,1,2,17.647835,20.48675,73.7391,19.348461,2077


In [71]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 510 entries, 0 to 509
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 510 non-null    int64  
 1   date               510 non-null    object 
 2   season_id          510 non-null    int64  
 3   year               510 non-null    int64  
 4   month              510 non-null    int64  
 5   is_holiday         510 non-null    int64  
 6   weekday            510 non-null    int64  
 7   is_workingday      510 non-null    int64  
 8   weather_condition  510 non-null    int64  
 9   temperature        510 non-null    float64
 10  feels_like_temp    510 non-null    float64
 11  humidity           510 non-null    float64
 12  wind_speed         510 non-null    float64
 13  total_users        510 non-null    int64  
dtypes: float64(4), int64(9), object(1)
memory usage: 55.9+ KB


In [72]:
train_df["date"] = pd.to_datetime(train_df["date"], format="%d-%m-%Y")
test_df["date"] = pd.to_datetime(test_df["date"], format="%d-%m-%Y")

In [73]:
for df in [train_df, test_df]:
    df["day_of_year"] = df["date"].dt.dayofyear
    df["week"] = df["date"].dt.isocalendar().week.astype(int)
    df["day"] = df["date"].dt.day
    df["is_start_of_quarter"] = df["month"].isin([1, 4, 7, 10]).astype(int)
    # df["is_first_day_of_month"] = (df["date"].dt.day == 1).astype(int)
    # df["is_last_day_of_month"] = df["date"].dt.is_month_end.astype(int)
    df["is_weekend"] = df["weekday"].isin([4, 5]).astype(int)
    df["is_weekstart"] = df["weekday"].isin([6, 1]).astype(int)
    df["is_salary_period"] = df["date"].dt.day.apply(lambda d: 1 if (d >= 27 or d <= 4) else 0)

train_df = train_df.drop(columns=["date"])
test_df = test_df.drop(columns=["date"])

In [74]:
def get_month_group(month):
    if month in [1, 2]:
        return 1
    elif month in [3, 4, 5, 10, 11, 12]:
        return 2
    elif month in [6, 7, 8, 9]:
        return 3

train_df["month_group"] = train_df["month"].apply(get_month_group)
test_df["month_group"] = test_df["month"].apply(get_month_group)

In [75]:
train_df

Unnamed: 0,id,season_id,year,month,is_holiday,weekday,is_workingday,weather_condition,temperature,feels_like_temp,...,wind_speed,total_users,day_of_year,week,day,is_start_of_quarter,is_weekend,is_weekstart,is_salary_period,month_group
0,577,3,1,7,0,2,1,1,29.246653,33.14480,...,11.083475,7216,212,31,31,1,0,0,1,3
1,427,1,1,3,0,6,0,2,16.980847,20.67460,...,10.792293,4066,62,9,3,0,0,1,1,2
2,729,1,1,12,0,0,0,1,10.489153,11.58500,...,23.500518,1796,364,1,30,0,0,0,1,2
3,483,2,1,4,0,6,0,2,15.443347,18.87520,...,8.708325,4220,118,17,28,1,0,1,1,2
4,112,2,0,4,0,5,1,2,13.803347,16.09770,...,14.707907,1683,112,16,22,1,1,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,579,3,1,8,0,4,1,1,30.852500,35.35440,...,8.666718,7261,214,31,2,0,1,0,1,3
506,54,1,0,2,0,3,1,1,9.091299,12.28585,...,6.305571,1917,54,8,23,0,0,0,0,1
507,351,4,0,12,0,6,0,2,10.591653,12.46855,...,16.292189,2739,351,51,17,0,0,1,0,2
508,80,2,0,3,0,1,1,2,17.647835,20.48675,...,19.348461,2077,80,12,21,0,0,1,0,2


In [76]:
categorical_cols = ["season_id", "weather_condition", "weekday"]


for col in categorical_cols:
    unique_vals = train_df[col].unique()
    print(f"Unique values in '{col}': {sorted(unique_vals)}")

Unique values in 'season_id': [1, 2, 3, 4]
Unique values in 'weather_condition': [1, 2, 3]
Unique values in 'weekday': [0, 1, 2, 3, 4, 5, 6]


In [77]:
train_df["temp_diff"] = train_df["feels_like_temp"] - train_df["temperature"].abs()
test_df["temp_diff"] = test_df["feels_like_temp"] - test_df["temperature"].abs()

In [78]:
train_df.groupby("weather_condition")["total_users"].mean().sort_values()

weather_condition
3    1947.733333
2    4029.727273
1    4856.031348
Name: total_users, dtype: float64

In [79]:
train_df["is_rainy"] = train_df["weather_condition"].apply(lambda x: 1 if x >= 3 else 0)
test_df["is_rainy"] = test_df["weather_condition"].apply(lambda x: 1 if x >= 3 else 0)

In [80]:
for df in [train_df, test_df]:
    df["heat_humidity"] = df["temperature"] * df["humidity"]
    df["wind_chill_effect"] = df["wind_speed"] * df["temp_diff"]
    df["rain_and_wind"] = df["is_rainy"] * df["wind_speed"]
    df["temp_x_wind"] = df["temperature"] * df["wind_speed"]
    df["humidity_x_wind"] = df["humidity"] * df["wind_speed"]

    df["feels_good_zone"] = df["feels_like_temp"].apply(lambda x: 1 if 15 <= x <= 30 else 0)
    df["is_warm_day"] = (df["temperature"] > 30).astype(int)
    df["is_cold_day"] = (df["temperature"] < 10).astype(int)
    
    df["is_windy"] = (df["wind_speed"] >= 20).astype(int)
    df["is_humid"] = (df["humidity"] >= 70).astype(int)


In [81]:
train_df = train_df.drop(columns=[col for col in ["is_rainy"] if col in train_df.columns])
test_df = test_df.drop(columns=[col for col in ["is_rainy"] if col in test_df.columns])

In [82]:
categorical_cols = ["season_id", "weekday"]

train_df = pd.get_dummies(train_df, columns=categorical_cols, drop_first=True)
test_df = pd.get_dummies(test_df, columns=categorical_cols, drop_first=True)

train_df = train_df.astype({col: int for col in train_df.select_dtypes(bool).columns})
test_df = test_df.astype({col: int for col in test_df.select_dtypes(bool).columns})

train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 510 entries, 0 to 509
Data columns (total 39 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   510 non-null    int64  
 1   year                 510 non-null    int64  
 2   month                510 non-null    int64  
 3   is_holiday           510 non-null    int64  
 4   is_workingday        510 non-null    int64  
 5   weather_condition    510 non-null    int64  
 6   temperature          510 non-null    float64
 7   feels_like_temp      510 non-null    float64
 8   humidity             510 non-null    float64
 9   wind_speed           510 non-null    float64
 10  total_users          510 non-null    int64  
 11  day_of_year          510 non-null    int32  
 12  week                 510 non-null    int32  
 13  day                  510 non-null    int32  
 14  is_start_of_quarter  510 non-null    int32  
 15  is_weekend           510 non-null    int

In [83]:
train_df

Unnamed: 0,id,year,month,is_holiday,is_workingday,weather_condition,temperature,feels_like_temp,humidity,wind_speed,...,is_humid,season_id_2,season_id_3,season_id_4,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6
0,577,1,7,0,1,1,29.246653,33.14480,70.4167,11.083475,...,1,0,1,0,0,1,0,0,0,0
1,427,1,3,0,0,2,16.980847,20.67460,62.1250,10.792293,...,0,0,0,0,0,0,0,0,0,1
2,729,1,12,0,0,1,10.489153,11.58500,48.3333,23.500518,...,0,0,0,0,0,0,0,0,0,0
3,483,1,4,0,0,2,15.443347,18.87520,48.9583,8.708325,...,0,1,0,0,0,0,0,0,0,1
4,112,0,4,0,1,2,13.803347,16.09770,72.9583,14.707907,...,1,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,579,1,8,0,1,1,30.852500,35.35440,65.9583,8.666718,...,0,0,1,0,0,0,0,1,0,0
506,54,0,2,0,1,1,9.091299,12.28585,42.3043,6.305571,...,0,0,0,0,0,0,1,0,0,0
507,351,0,12,0,0,2,10.591653,12.46855,56.0833,16.292189,...,0,0,0,1,0,0,0,0,0,1
508,80,0,3,0,1,2,17.647835,20.48675,73.7391,19.348461,...,1,1,0,0,1,0,0,0,0,0


In [84]:
X_train = train_df.drop(columns=["id", "total_users"])
y_train = train_df["total_users"]
X_test = test_df.drop(columns=["id"])
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

correlations = X_train.copy()
correlations["total_users"] = y_train
correlation_series = correlations.corr(numeric_only=True)["total_users"].drop("total_users").sort_values(ascending=False)

X_train_const = sm.add_constant(X_train).astype(float)
y_train = y_train.astype(float)
model = sm.OLS(y_train, X_train_const).fit()
p_values = model.pvalues.drop("const").sort_values()

df_pvalue = pd.DataFrame({
    "feature": p_values.index,
    "p_value": p_values.values
}).sort_values(by="p_value").reset_index(drop=True)

correlation_series_sorted = correlation_series.reindex(correlation_series.abs().sort_values(ascending=False).index)

df_corr = pd.DataFrame({
    "feature": correlation_series_sorted.index,
    "correlation_with_target": correlation_series_sorted.values
}).reset_index(drop=True)

pd.concat([df_pvalue.tail(50), df_corr.tail(50)], axis=1)

Unnamed: 0,feature,p_value,feature.1,correlation_with_target
0,year,1.922444e-109,feels_like_temp,0.647944
1,is_warm_day,4.309454e-12,temperature,0.645048
2,season_id_4,1.7112e-10,year,0.591433
3,weather_condition,1.910316e-09,month_group,0.579747
4,season_id_2,4.377005e-08,heat_humidity,0.477082
5,rain_and_wind,8.970306e-07,is_cold_day,-0.428916
6,is_workingday,1.064925e-05,season_id_3,0.37221
7,weekday_6,1.387008e-05,temp_diff,0.344897
8,feels_good_zone,3.639082e-05,week,0.298892
9,season_id_3,0.0004845355,temp_x_wind,0.29058


In [85]:
p_value_threshold = 0.4
corr_threshold = 0.01

selected_by_pval = df_pvalue[df_pvalue["p_value"] < p_value_threshold]["feature"]

selected_by_corr = df_corr[df_corr["correlation_with_target"].abs() > corr_threshold]["feature"]

selected_features = list(set(selected_by_pval).intersection(set(selected_by_corr)))

X_selected = X_train[selected_features]

print(f"{len(selected_features)} features selected:\n", selected_features)


27 features selected:
 ['rain_and_wind', 'temp_diff', 'is_windy', 'wind_chill_effect', 'season_id_4', 'humidity_x_wind', 'week', 'year', 'is_humid', 'season_id_2', 'weekday_3', 'feels_good_zone', 'weekday_5', 'is_holiday', 'heat_humidity', 'season_id_3', 'weekday_6', 'is_weekstart', 'is_start_of_quarter', 'is_warm_day', 'weather_condition', 'temperature', 'month_group', 'is_weekend', 'is_workingday', 'temp_x_wind', 'is_cold_day']


In [86]:
corr_matrix = X_selected.corr().abs()

upper_triangle = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)

corr_pairs = (
    corr_matrix.where(upper_triangle)
    .stack()
    .reset_index()
)

corr_pairs.columns = ["feature_1", "feature_2", "abs_correlation"]

corr_pairs_sorted = corr_pairs.sort_values(by="abs_correlation", ascending=False).reset_index(drop=True)

print(corr_pairs_sorted.head(10))

           feature_1          feature_2  abs_correlation
0      heat_humidity        temperature         0.871211
1        temperature        month_group         0.832847
2        season_id_3        month_group         0.731492
3      heat_humidity        month_group         0.716042
4        season_id_4               week         0.704953
5        season_id_3        temperature         0.697012
6          temp_diff  wind_chill_effect         0.660421
7  wind_chill_effect        temp_x_wind         0.654288
8          weekday_6       is_weekstart         0.644932
9          weekday_5         is_weekend         0.643460


In [87]:
pval_dict = dict(zip(df_pvalue["feature"], df_pvalue["p_value"]))

high_corr_pairs = corr_pairs_sorted[corr_pairs_sorted["abs_correlation"] > 0.9]

features_to_remove = set()

for _, row in high_corr_pairs.iterrows():
    f1, f2 = row["feature_1"], row["feature_2"]

    if f1 in features_to_remove or f2 in features_to_remove:
        continue

    p1 = pval_dict.get(f1, 1e6)
    p2 = pval_dict.get(f2, 1e6)

    if p1 > p2:
        features_to_remove.add(f1)
    else:
        features_to_remove.add(f2)

X_selected = X_selected.drop(columns=features_to_remove)

print(f"Removed {len(features_to_remove)} highly correlated features based on p-value.")
print("number of Remaining features:", X_selected.shape)


Removed 0 highly correlated features based on p-value.
number of Remaining features: (510, 27)


In [89]:
scaler = StandardScaler()

X_train_model, X_val_model, y_train_model, y_val_model = train_test_split(
    X_selected, y_train, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train_model_scaled = scaler.fit_transform(X_train_model)
X_val_model_scaled = scaler.transform(X_val_model)

# rf_param_dist = {
#     "n_estimators": randint(200, 500),
#     "max_depth": randint(5, 20),
#     "min_samples_split": randint(2, 10),
#     "min_samples_leaf": randint(1, 5)
# }

# gbr_param_dist = {
#     "n_estimators": randint(100, 500),
#     "learning_rate": uniform(0.01, 0.2),
#     "max_depth": randint(3, 10),
#     "subsample": uniform(0.6, 0.4),
#     "min_samples_split": randint(2, 10)
# }

xgb_param_dist = {
    "n_estimators": randint(100, 500),
    "learning_rate": uniform(0.01, 0.2),
    "max_depth": randint(3, 10),
    "subsample": uniform(0.6, 0.4),
    "colsample_bytree": uniform(0.6, 0.4)
}

# rf_search = RandomizedSearchCV(
#     estimator=RandomForestRegressor(random_state=42),
#     param_distributions=rf_param_dist,
#     n_iter=30, 
#     cv=5,
#     scoring="neg_mean_squared_error",
#     n_jobs=-1, random_state=42
# )

# gbr_search = RandomizedSearchCV(
#     estimator=GradientBoostingRegressor(random_state=42),
#     param_distributions=gbr_param_dist,
#     n_iter=30, 
#     cv=5,
#     scoring="neg_mean_squared_error",
#     n_jobs=-1, random_state=42
# )

xgb_search = RandomizedSearchCV(
    estimator=XGBRegressor(random_state=42, verbosity=0),
    param_distributions=xgb_param_dist,
    n_iter=30,
    cv=5,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
    random_state=42
)

# rf_search.fit(X_train_model, y_train_model)
# gbr_search.fit(X_train_model, y_train_model)
xgb_search.fit(X_train_model, y_train_model)

# best_rf = rf_search.best_estimator_
# best_gbr = gbr_search.best_estimator_
best_xgb = xgb_search.best_estimator_

models = {
    # "Linear Regression": LinearRegression(),
    # "Random Forest (Tuned)": best_rf,
    # "Gradient Boosting (Tuned)": best_gbr,
    "XGBoost (Tuned)": best_xgb,
}

def evaluate_model(name, model):
    if "Linear Regression" in name:
        model.fit(X_train_model_scaled, y_train_model)
        preds = model.predict(X_val_model_scaled)
    else:
        model.fit(X_train_model, y_train_model)
        preds = model.predict(X_val_model)

    return {
        "Model": name,
        "MSE": mean_squared_error(y_val_model, preds),
        "RMSE": np.sqrt(mean_squared_error(y_val_model, preds)),
        "MAE": mean_absolute_error(y_val_model, preds),
        "R2 Score": r2_score(y_val_model, preds),
        "MAPE (%)": mean_absolute_percentage_error(y_val_model, preds) * 100
    }

results = [evaluate_model(name, model) for name, model in models.items()]
results_df = pd.DataFrame(results).sort_values(by="MSE").reset_index(drop=True)

print(results_df)

             Model            MSE        RMSE         MAE  R2 Score   MAPE (%)
0  XGBoost (Tuned)  464195.645942  681.319049  462.049477  0.849553  15.727623


In [90]:
best_model_name = results_df.loc[0, "Model"]

final_model = models[best_model_name]
final_model.fit(X_selected, y_train)
X_test_selected = X_test[X_selected.columns]
test_predictions = np.round(final_model.predict(X_test_selected)).astype(int)

submission_df = test_df[["id"]].copy()
submission_df["label"] = test_predictions
submission_df.to_csv("submission.csv", index=False)
