In [None]:


import pandas as pd
import numpy as np

file_name = r'C:\Users\hrawa\OneDrive\Documents\blinkit_data.csv'

try:
    df = pd.read_csv(r'C:\Users\hrawa\OneDrive\Documents\blinkit_data.csv')
    print(f"Successfully loaded '{r'C:\Users\hrawa\OneDrive\Documents\blinkit_data.csv'}'")
except FileNotFoundError:
    print(f"ERROR: File not found. I was looking for it here: {r'C:\Users\hrawa\OneDrive\Documents\blinkit_data.csv'}")
except Exception as e:
    print(f"An error occurred: {e}")


if 'df' in locals():
    # 1. Fill missing 'Item Visibility' (with a space)
    mean_visibility = df['Item Visibility'].mean()
    df['Item Visibility'].fillna(mean_visibility, inplace=True)

    # 2. Fill missing 'Outlet Size' (with a space)
    mode_size = df['Outlet Size'].mode()[0]
    df['Outlet Size'].fillna(mode_size, inplace=True)

    # 3. Fix inconsistent 'Item Fat Content' (with a space)
    df['Item Fat Content'] = df['Item Fat Content'].replace({
        'low fat': 'Low Fat',
        'LF': 'Low Fat',
        'reg': 'Regular'
    })

    print("Data cleaning complete.")
    print("\nHere's a preview of your cleaned data:")
    print(df.head())

Successfully loaded 'C:\Users\hrawa\OneDrive\Documents\blinkit_data.csv'
Data cleaning complete.

Here's a preview of your cleaned data:
  Item Fat Content  Sr.NO. Item Identifier              Item Type  \
0          Regular       1           FDX32  Fruits and Vegetables   
1          Low Fat       2           NCB42     Health and Hygiene   
2          Regular       3           FDR28           Frozen Foods   
3          Regular       4           FDL50                 Canned   
4          Low Fat       5           DRI25            Soft Drinks   

   Outlet Establishment Year Outlet Identifier Outlet Location Type  \
0                       2012            OUT049               Tier 1   
1                       2022            OUT018               Tier 3   
2                       2016            OUT046               Tier 1   
3                       2014            OUT013               Tier 3   
4                       2015            OUT045               Tier 2   

  Outlet Size        

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Item Visibility'].fillna(mean_visibility, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Outlet Size'].fillna(mode_size, inplace=True)


In [4]:
# === STEP 2: FEATURE ENGINEERING (Corrected) ===

# 1. Create 'Outlet_Age'
# We'll use 2025 as the "current year" since this is historical data
df['Outlet_Age'] = 2025 - df['Outlet Establishment Year']

# 2. Convert text columns to numbers using One-Hot Encoding
# These are the *only* columns we want to encode.
# We are LEAVING OUT 'Item Identifier' on purpose.
categorical_cols = [
    'Item Fat Content',
    'Item Type',
    'Outlet Identifier',
    'Outlet Size',
    'Outlet Location Type',
    'Outlet Type'
]

# This will create the True/False columns ONLY for the list above
df_processed = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

print("Feature Engineering complete.")
print("\nHere's a preview of your NEW processed data:")
print(df_processed.head())

Feature Engineering complete.

Here's a preview of your NEW processed data:
   Sr.NO. Item Identifier  Outlet Establishment Year  Item Visibility  \
0       1           FDX32                       2012         0.100014   
1       2           NCB42                       2022         0.008596   
2       3           FDR28                       2016         0.025896   
3       4           FDL50                       2014         0.042278   
4       5           DRI25                       2015         0.033970   

   Item Weight     Sales  Rating  Outlet_Age  Item Fat Content_Regular  \
0        15.10  145.4786     5.0          13                      True   
1        11.80  115.3492     5.0           3                     False   
2        13.85  165.0210     5.0           9                      True   
3        12.15  126.5046     5.0          11                      True   
4        19.60   55.1614     5.0          10                     False   

   Item Fat Content_low Fat  ...  Outlet

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score


print("Preparing Data for Final Model...")


y = df_processed['Sales'] 

columns_to_drop = [
    'Sales',    
    'Rating',   
    'Item Identifier', 
    'Outlet Establishment Year',
    'Sr.NO.'  
]
.
X = df_processed.drop(columns=columns_to_drop, errors='ignore')

print("Features (X) and Target (y) are defined.")

print("\nSplitting data...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Data split: {len(X_train)} rows for training, {len(X_test)} rows for testing.")

print("\nTraining FINAL Random Forest model...")
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
r2 = r2_score(y_test, y_pred)

print("--- FINAL MODEL TRAINING COMPLETE ---")
print(f"Model R-Squared (R²): {r2:.4f}")

print("\nFinding Top Sales Drivers...")

importances = rf_model.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print("\n--- FINAL TOP 10 DRIVERS OF SALES ---")
print(feature_importance_df.head(10))

Preparing Data for Final Model...
Features (X) and Target (y) are defined.

Splitting data...
Data split: 6818 rows for training, 1705 rows for testing.

Training FINAL Random Forest model...
--- FINAL MODEL TRAINING COMPLETE ---
Model R-Squared (R²): 0.5713

Finding Top Sales Drivers...

--- FINAL TOP 10 DRIVERS OF SALES ---
                            Feature  Importance
0                   Item Visibility    0.349381
1                       Item Weight    0.310103
3          Item Fat Content_Regular    0.036236
2                        Outlet_Age    0.025640
8                   Item Type_Dairy    0.023279
10  Item Type_Fruits and Vegetables    0.023179
17            Item Type_Snack Foods    0.022610
9            Item Type_Frozen Foods    0.022106
7                  Item Type_Canned    0.019896
13              Item Type_Household    0.017051
