In [50]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score
from joblib import Parallel, delayed
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

In [51]:
#read the dataset and display the first few rows
df = pd.read_csv('sales.csv')
df.head()

Unnamed: 0,d,store_ID,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday,sales
0,425390,366,4,2013-04-18,517,1,0,0,0,4422
1,291687,394,6,2015-04-11,694,1,0,0,0,8297
2,411278,807,4,2013-08-29,970,1,1,0,0,9729
3,664714,802,2,2013-05-28,473,1,1,0,0,6513
4,540835,726,4,2013-10-10,1068,1,1,0,0,10882


In [52]:
#missing values
print(df.isnull().sum())

d                      0
store_ID               0
day_of_week            0
date                   0
nb_customers_on_day    0
open                   0
promotion              0
state_holiday          0
school_holiday         0
sales                  0
dtype: int64


In [53]:
#printing data types
print(df.dtypes)

d                       int64
store_ID                int64
day_of_week             int64
date                   object
nb_customers_on_day     int64
open                    int64
promotion               int64
state_holiday          object
school_holiday          int64
sales                   int64
dtype: object


In [54]:
#dropping object type columns 
#update with a check
df = df.drop(columns=[col for col in ["d", "date"] if col in df.columns])

In [55]:
# Transforming categorical variable into numerical
df['state_holiday'] = LabelEncoder().fit_transform(df['state_holiday'])
print(df.dtypes)

store_ID               int64
day_of_week            int64
nb_customers_on_day    int64
open                   int64
promotion              int64
state_holiday          int64
school_holiday         int64
sales                  int64
dtype: object


In [56]:
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=30, max_depth=4, n_jobs=-1),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42)
}

# Step 2: Function to train and evaluate a model
def train_and_evaluate(name, model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    score = r2_score(y_test, predictions)
    return name, score, predictions, model  # <-- Added model to return, so later we have it to predict

# Step 3: Run all models in parallel
def run_models_parallel(X_train, y_train, X_test, y_test):
    results = Parallel(n_jobs=3)(
        delayed(train_and_evaluate)(name, model, X_train, y_train, X_test, y_test)
        for name, model in models.items()
    )
    return results

In [57]:
#Bagging Technique
# Run the models and get results
results = run_models_parallel(X_train, y_train, X_test, y_test)

# Step 5: Print R2 scores
for name, score, _, _ in results: #here I had to include 4 instead of 3
    print(f"{name} R2 Score: {score:.4f}")


Linear Regression R2 Score: 0.8507
Random Forest R2 Score: 0.8427
Gradient Boosting R2 Score: 0.8868


In [58]:
# Step 6: Create bagged predictions (average of predictions)
all_preds = np.array([preds for _, _, preds, _ in results]) #here I had to include 4 instead of 3
bagged_preds = np.mean(all_preds, axis=0)

In [59]:
# Step 7: Add bagged predictions as a new column in test data
X_test_with_preds = X_test.copy()
X_test_with_preds['Predicted_Sales_Bagged'] = bagged_preds
bagged_r2 = r2_score(y_test, bagged_preds)

# Optional: Show some results
print(X_test_with_preds[['Predicted_Sales_Bagged']].head())
print('R2 Score for Bagged:', bagged_r2)

        Predicted_Sales_Bagged
360263             9630.241994
226238              -44.728537
94660              4696.128616
322601             4910.190749
450235             5814.751455
R2 Score for Bagged: 0.8695098047555108


In [506]:
#identify categorical columns (if any remain after drop)
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
#fill missing numerical values with median
num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
for col in num_cols:
    median_val = df[col].median()
    df[col].fillna(median_val, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_val, inplace=True)


In [507]:
# Load real-life data
df_real = pd.read_csv("REAL_DATA.csv")

#make sure 'sales' is not in the features to avoid unseen columns)
if 'sales' in df_real_processed.columns:
    df_real_features = df_real_processed.drop(columns=['sales'])
else:
    df_real_features = df_real_processed

# Predict sales on real-life data (correct dataframe!)
sales_pred = best_model.predict(df_real_features)

# Add predictions to dataframe
df_real_processed['sales'] = sales_pred

# Show predictions
print(df_real_processed[['sales']].head())

         sales
0    98.083336
1    78.622962
2  6879.770648
3  7107.528079
4  6102.075624


In [508]:
# readind real data
new_df = pd.read_csv('REAL_DATA.csv').copy()

In [None]:
#getting real data data types
print(new_df.dtypes)

index                   int64
store_ID                int64
day_of_week             int64
date                   object
nb_customers_on_day     int64
open                    int64
promotion               int64
state_holiday          object
school_holiday          int64
dtype: object


In [None]:
#checking missing values
new_missing_data = new_df.isnull().sum()
print(new_missing_data)

index                  0
store_ID               0
day_of_week            0
date                   0
nb_customers_on_day    0
open                   0
promotion              0
state_holiday          0
school_holiday         0
dtype: int64


In [63]:
# Encode categorical features
new_df['state_holiday'] = LabelEncoder().fit_transform(new_df['state_holiday'])
new_df['date'] = LabelEncoder().fit_transform(new_df['date'])

# Prepare features for prediction (drop columns not used in training)
features = [new_data for new_data in X.columns if new_data in new_df.columns]
X_new = new_df[features]

# Fit the models on the original training data if not already fitted
for name, model in models.items():
    if not hasattr(model, "coef_") and not hasattr(model, "feature_importances_"):
        model.fit(X_train, y_train)

# Predict with each trained model from the models dictionary
preds_linear = models["Linear Regression"].predict(X_new)
preds_rf = models["Random Forest"].predict(X_new)
preds_gb = models["Gradient Boosting"].predict(X_new)

# Average predictions — bagging
bagged_preds_new = np.mean([preds_linear, preds_rf, preds_gb], axis=0)
# Add predictions to DataFrame
new_df['sales'] = bagged_preds_new
new_df.to_csv("G1.csv", index=False)

# Optional: If your new data has actual sales, evaluate R²
if 'sales' in new_df.columns:
    r2_new = r2_score(new_df['sales'], bagged_preds_new)
    print(f"R² Score on new data: {r2_new:.4f}")

# Show sample output
print(new_df[['sales']].head())


R² Score on new data: 1.0000
         sales
0    26.125596
1    38.384082
2  6708.813381
3  6987.469395
4  6290.938492


In [64]:
#verifying new file data
updated_file = pd.read_csv('G1.csv')
updated_file

Unnamed: 0,index,store_ID,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday,sales
0,272371,415,7,8,0,0,0,0,0,26.125596
1,558468,27,7,894,0,0,0,0,0,38.384082
2,76950,404,3,565,657,1,1,0,0,6708.813381
3,77556,683,2,868,862,1,0,0,0,6987.469395
4,456344,920,3,565,591,1,1,0,0,6290.938492
...,...,...,...,...,...,...,...,...,...,...
71200,59062,441,7,801,0,0,0,0,0,0.618313
71201,687449,377,7,548,0,0,0,0,0,27.961473
71202,207393,15,3,326,648,1,0,0,0,5522.085995
71203,233378,950,2,691,626,1,1,0,0,6534.898629


Observations:
-	To run models parallelly we need to enable joblib library and import parallel
-	LabelEncoder() function is used to fit transformation of the columns from categorical variables to numerical
-	When we code 3 models one after another, they are running sequentially as we need to run parallelly, we need to define the models in one dictionary and runs the function works 
-	While running multiple models we have observed below points.
o	We cannot use Logistical Regression as this is used for classification tasks but not for regression tasks. So it didn’t work
o	We try to run KNN method, but this method is too slow to run
o	We tried running RandomForestRegression Method and GradientBoostingRegression Method which runs better but if we give print function it will run too slow. 
o	Hence segregated the print function to another cell after running the models
-	While creating bagged technique we found that we don’t need to use any specific functions like baggedregression to run the predictions. Np.mean will works to run bagged technique. 
o	In this step we have [preds for -,-, preds], in this -, - refers to the returned parameters from train_test_split model which are name, score, predictions. – is given if that parameter is not required for this conditions.
o	This will help to exclude running these parameters details.
-	As per mathematical calculation bagged prediction for all 3 models is 0.868
-	To run the parallel code and effectively save and load python. Tried Joblib to run real data on the trained model however it seems bit complicated as we don’t have much Idea on it so we moved on with different code.
-	Hasattr is used to check if an object has a specific attribute
-	If we are running multiple datasets on same trained model, we don’t need to re write the code to call model and train_test_split instead we can use the syntax[“modelName”].predict() to predict the values
-	Overall we understand that we need more handson coding and we need to practice more 😊


