In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import BaggingRegressor,AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_squared_error

In [69]:
df=pd.read_csv("BigMart_Train.csv")

In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8522 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [71]:
df.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       1
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [72]:
missing_columns = df.columns[df.isnull().sum() > 0]

for col in missing_columns:
    if df[col].dtype == "object":
        df[col].fillna(df[col].mode()[0],inplace=True)
    else:
        df[col].fillna(df[col].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0],inplace=True)


In [73]:
categorical_cols = df.select_dtypes(include=["object"]).columns

for col in categorical_cols:
    le=LabelEncoder()
    df[col] = le.fit_transform(df[col])

In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   int64  
 1   Item_Weight                8523 non-null   float64
 2   Item_Fat_Content           8523 non-null   int64  
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   int64  
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   int64  
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                8523 non-null   int64  
 9   Outlet_Location_Type       8523 non-null   int64  
 10  Outlet_Type                8523 non-null   int64  
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(8)
memory usage: 799.2 KB


In [75]:
x = df.drop("Item_Outlet_Sales", axis=1)
y = df["Item_Outlet_Sales"]

In [76]:
x_train, x_test, y_train, y_test=train_test_split(x,y, test_size=0.3, random_state=42)

In [77]:
boosting_model=AdaBoostRegressor(n_estimators=100, random_state=42)
boosting_model.fit(x_train,y_train)
boosting_pred=boosting_model.predict(x_test)
boosting_accuracy=r2_score(y_test, boosting_pred)
boosting_accuracy=mean_squared_error(y_test, boosting_pred)

In [78]:
boosting_model

In [79]:
print(f"boosting_accuracy: {boosting_accuracy}")

boosting_accuracy: 1381201.754366009


In [80]:

bagging_model = BaggingRegressor(estimator=DecisionTreeRegressor(), n_estimators=100, random_state=42)
bagging_model.fit(x_train,y_train)
bagging_pred = bagging_model.predict(x_test)
bagging_accuracy = r2_score(y_test, bagging_pred)
bagging_accuracy = mean_squared_error(y_test, bagging_pred)

print("R2 Score:", bagging_r2)
print("Mean Squared Error:", bagging_mse)


R2 Score: 0.5575222067467567
Mean Squared Error: 1239377.434903505


In [88]:
base_models = [
    ("decision_tree", DecisionTreeRegressor(random_state=42)),
    ("random_forest", RandomForestRegressor(random_state=42, n_estimators=100)),
    #("lg", LogisticRegression(max_iter=1000))
]

In [89]:
stacking_model=StackingRegressor(estimators=base_models, final_estimator= KNeighborsRegressor(), cv=5)
stacking_model.fit(x_train,y_train)
stacking_preds=stacking_model.predict(x_test)

In [93]:
stacking_r2 = r2_score(y_test, stacking_preds)
print(f"Stacking R²: {stacking_r2:.2f}")

stacking_mse = mean_squared_error(y_test, stacking_preds)
print(f"Stacking MSE: {stacking_mse:.2f}")


Stacking R²: 0.48
Stacking MSE: 1466971.99


In [94]:
x_train_blend, x_val, y_train_blend, y_val=train_test_split(x_train,y_train, test_size=0.3, random_state=42)

In [95]:
for name, model in base_models:
    model.fit(x_train_blend, y_train_blend)

In [96]:
val_pred= np.column_stack([model.predict(x_val) for _, model in base_models])

In [98]:
meta_model = KNeighborsRegressor()
meta_model.fit(val_pred, y_val)

In [100]:
test_preds=np.column_stack([model.predict(x_test) for _, model in base_models])
blending_preds = meta_model.predict(test_preds)

In [101]:
blending_r2 = r2_score(y_test, blending_preds)
print(f"Blendingg R²: {blending_r2:.2f}")

blending_mse = mean_squared_error(y_test, blending_preds)
print(f"Blending MSE: {blending_mse:.2f}")


Blendingg R²: 0.48
Blending MSE: 1466064.37


In [108]:
print("\nBase Model Evaluations:")
for name, model in base_models:
    model.fit(x_train, y_train)
    base_preds = model.predict(x_test)

    mse = mean_squared_error(y_test, base_preds)
    r2 = r2_score(y_test, base_preds)
    
    print(f"{name.capitalize()} MSE: {mse:.2f}, R²: {r2:.2f}")



Base Model Evaluations:
Decision_tree MSE: 2227393.12, R²: 0.20
Random_forest MSE: 1243011.10, R²: 0.56


In [109]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Define the model
model = RandomForestRegressor()

# Define the hyperparameters grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, 
                           scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)

# Fit the model to the data
grid_search.fit(x_train, y_train)

# Best parameters found by GridSearchCV
print(f"Best Hyperparameters: {grid_search.best_params_}")

# Evaluate the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)

# Calculate MAE or R²
from sklearn.metrics import mean_squared_error, r2_score
mae = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Best Model mse: {mse:.2f}")
print(f"Best Model R²: {r2:.2f}")


Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 300}
Best Model mse: 1243011.10
Best Model R²: 0.59


In [122]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the training data
x_train_scaled = scaler.fit_transform(x_train)

# Transform the test data using the same scaler
x_test_scaled = scaler.transform(x_test)

# Define and train the model
model = LinearRegression()
model.fit(x_train_scaled, y_train)

# Predict on the scaled test data
y_pred_scaled = model.predict(x_test_scaled)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred_scaled)
r2 = r2_score(y_test, y_pred_scaled)

print(f"Mean Absolute Error: {mae:.2f}")
print(f"R²: {r2:.2f}")


Mean Absolute Error: 872.63
R²: 0.51
