In [495]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score
from joblib import Parallel, delayed
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

In [496]:
#read the dataset and display the first few rows
df = pd.read_csv('sales.csv')
df.head()

Unnamed: 0,d,store_ID,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday,sales
0,425390,366,4,2013-04-18,517,1,0,0,0,4422
1,291687,394,6,2015-04-11,694,1,0,0,0,8297
2,411278,807,4,2013-08-29,970,1,1,0,0,9729
3,664714,802,2,2013-05-28,473,1,1,0,0,6513
4,540835,726,4,2013-10-10,1068,1,1,0,0,10882


In [497]:
#missing values
print(df.isnull().sum())

d                      0
store_ID               0
day_of_week            0
date                   0
nb_customers_on_day    0
open                   0
promotion              0
state_holiday          0
school_holiday         0
sales                  0
dtype: int64


In [498]:
#printing data types
print(df.dtypes)

d                       int64
store_ID                int64
day_of_week             int64
date                   object
nb_customers_on_day     int64
open                    int64
promotion               int64
state_holiday          object
school_holiday          int64
sales                   int64
dtype: object


In [499]:
#dropping object type columns 
#update with a check
df = df.drop(columns=[col for col in ["d", "date"] if col in df.columns])

In [500]:
# Transforming categorical variable into numerical
df['state_holiday'] = LabelEncoder().fit_transform(df['state_holiday'])
print(df.dtypes)

store_ID               int64
day_of_week            int64
nb_customers_on_day    int64
open                   int64
promotion              int64
state_holiday          int64
school_holiday         int64
sales                  int64
dtype: object


In [501]:
#Train-Test-Split initialization
X = df.drop(columns="sales") # sending all columns except sales to X
y = df['sales'] # target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(512672, 7) (128168, 7) (512672,) (128168,)


In [None]:
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=30, max_depth=4, n_jobs=-1),
    # "KNN Regressor": KNeighborsRegressor(n_neighbors=5),
    # "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42)
}

# Step 2: Function to train and evaluate a model
def train_and_evaluate(name, model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    score = r2_score(y_test, predictions)
    return name, score, predictions, model  # <-- Added model to return, so later we have it to predict

# Step 3: Run all models in parallel
def run_models_parallel(X_train, y_train, X_test, y_test):
    results = Parallel(n_jobs=3)(
        delayed(train_and_evaluate)(name, model, X_train, y_train, X_test, y_test)
        for name, model in models.items()
    )
    return results

In [None]:
#Bagging Technique
# Run the models and get results
results = run_models_parallel(X_train, y_train, X_test, y_test)

# Step 5: Print R2 scores
for name, score, _, _ in results: #here I had to include 4 instead of 3
    print(f"{name} R2 Score: {score:.4f}")


Linear Regression R2 Score: 0.8507
Random Forest R2 Score: 0.8438
Gradient Boosting R2 Score: 0.8871


In [None]:
# Step 6: Create bagged predictions (average of predictions)
all_preds = np.array([preds for _, _, preds, _ in results]) #here I had to include 4 instead of 3
bagged_preds = np.mean(all_preds, axis=0)

In [505]:
# Step 7: Add bagged predictions as a new column in test data
X_test_with_preds = X_test.copy()
X_test_with_preds['Predicted_Sales_Bagged'] = bagged_preds
bagged_r2 = r2_score(y_test, bagged_preds)

# Optional: Show some results
print(X_test_with_preds[['Predicted_Sales_Bagged']].head())
print('R2 Score for Bagged:', bagged_r2)

        Predicted_Sales_Bagged
360263             9626.817402
226238              -47.022976
94660              4652.268140
322601             4890.540982
450235             5952.622249
R2 Score for Bagged: 0.8698026850147129


In [506]:
#identify categorical columns (if any remain after drop)
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
#fill missing numerical values with median
num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
for col in num_cols:
    median_val = df[col].median()
    df[col].fillna(median_val, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_val, inplace=True)


In [507]:
# Load real-life data
df_real = pd.read_csv("REAL_DATA.csv")

#make sure 'sales' is not in the features to avoid unseen columns)
if 'sales' in df_real_processed.columns:
    df_real_features = df_real_processed.drop(columns=['sales'])
else:
    df_real_features = df_real_processed

# Predict sales on real-life data (correct dataframe!)
sales_pred = best_model.predict(df_real_features)

# Add predictions to dataframe
df_real_processed['sales'] = sales_pred

# Show predictions
print(df_real_processed[['sales']].head())

         sales
0    98.083336
1    78.622962
2  6879.770648
3  7107.528079
4  6102.075624


In [508]:
#save the new file with salec column:
df_real_features.to_csv("G1.csv", index=False)

Observations:

