<a href="https://colab.research.google.com/github/Husayn01/Machine-Learning-Projects/blob/main/practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [134]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import lightgbm as lgb
import optuna
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings('ignore')

In [135]:
# NEW: Helper function for column consistency
def ensure_column_consistency(train_df, test_df, target_col):
    """Ensure train and test have same columns"""
    train_cols = set(train_df.columns) - {target_col}
    test_cols = set(test_df.columns)
    common_cols = list(train_cols.intersection(test_cols))
    return common_cols

In [136]:
# Load data
print("📊 Loading data...")
train_df = pd.read_csv("https://raw.githubusercontent.com/Husayn01/Machine-Learning-Projects/refs/heads/main/Excellent%20Store%20Challenge/data/train.csv")
test_df = pd.read_csv("https://raw.githubusercontent.com/Husayn01/Machine-Learning-Projects/refs/heads/main/Excellent%20Store%20Challenge/data/test.csv")
sub_sample_df = pd.read_csv("https://raw.githubusercontent.com/Husayn01/Machine-Learning-Projects/refs/heads/main/Excellent%20Store%20Challenge/data/SampleSubmission.csv")

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

📊 Loading data...
Train shape: (4990, 13)
Test shape: (3532, 12)


In [137]:
test_data = test_df.copy()

In [138]:
def preprocess(df):
    """Clean and handle missing values"""
    print("🛠️ Cleaning data...")

    # Handle missing Item_Weight
    if df["Item_Weight"].isnull().any():
      median_weight = df["Item_Weight"].median()
      missing_count = df["Item_Weight"].isnull().sum()
      df["Item_Weight"].fillna(median_weight, inplace=True)
      print(f"Filled {missing_count} missing values in 'Item_Weight' with the median.")

    # Handle missing Store_Size
    if df["Store_Size"].isnull().any() and "Store_Size" in df.columns:
      missing_count = df["Store_Size"].isnull().sum()
      df["Store_Size"].fillna("Unknown", inplace=True)
      print(f"Filled {missing_count} missing values in 'Store_Size' with 'Unknown'.")

    return df

In [139]:
percentage_missing = (train_df.isna().sum() / len(train_df)) * 100
print(percentage_missing)

Item_ID                 0.000000
Store_ID                0.000000
Item_Store_ID           0.000000
Item_Weight            16.072144
Item_Sugar_Content      0.000000
Item_Visibility         0.000000
Item_Type               0.000000
Item_Price              0.000000
Store_Start_Year        0.000000
Store_Size             29.058116
Store_Location_Type     0.000000
Store_Type              0.000000
Item_Store_Returns      0.000000
dtype: float64


In [140]:
train_preprocessed = preprocess(train_df)
test_preprocessed = preprocess(test_df)

🛠️ Cleaning data...
Filled 802 missing values in 'Item_Weight' with the median.
Filled 1450 missing values in 'Store_Size' with 'Unknown'.
🛠️ Cleaning data...
Filled 661 missing values in 'Item_Weight' with the median.
Filled 959 missing values in 'Store_Size' with 'Unknown'.


In [141]:
print(f"Missing values in train_preprocessed: {train_preprocessed.isna().sum().sum()}")
print(f"Missing values in test_preprocessed: {test_preprocessed.isna().sum().sum()}")

Missing values in train_preprocessed: 0
Missing values in test_preprocessed: 0


In [142]:
# OPTIMIZED: Fixed feature engineering function
def feature_engineering(df):
    """Create new features from existing data"""
    print("🔧 Engineering features...")

    # Only create features that exist in both datasets
    if 'Item_MRP' in df.columns and 'Item_Weight' in df.columns:
        df['Price_per_Weight'] = df['Item_MRP'] / (df['Item_Weight'] + 0.1)
        print("Created Price_per_Weight feature")

    if 'Item_Visibility' in df.columns:
        df['Visibility_Category'] = pd.cut(df['Item_Visibility'],
                                         bins=3, labels=['Low', 'Medium', 'High'])
        print("Created Visibility_Category feature")

    return df

In [143]:
train_fe = feature_engineering(train_preprocessed)
test_fe = feature_engineering(test_preprocessed)

🔧 Engineering features...
Created Visibility_Category feature
🔧 Engineering features...
Created Visibility_Category feature


In [144]:
# OPTIMIZED: Ensure column consistency
common_cols = ensure_column_consistency(train_fe, test_fe, 'Item_Store_Returns')
print(f"Common columns: {len(common_cols)}")

Common columns: 13


In [145]:
X = train_fe[common_cols]
y = train_fe['Item_Store_Returns']

In [146]:
numerical_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_cols = list(X.select_dtypes(include=['object']).columns)


In [147]:
# OPTIMIZED: Reduce categorical cardinality for speed
print("🚀 Optimizing categorical features...")
for col in categorical_cols:
    value_counts = X[col].value_counts()
    if len(value_counts) > 10:  # If more than 10 categories
        top_categories = value_counts.head(8).index
        X[col] = X[col].apply(lambda x: x if x in top_categories else 'Other')
        test_fe[col] = test_fe[col].apply(lambda x: x if x in top_categories else 'Other')
        print(f"Reduced {col} from {len(value_counts)} to 9 categories")

print(f"Numerical columns: {numerical_cols}")
print(f"Categorical columns: {categorical_cols}")

🚀 Optimizing categorical features...
Reduced Item_ID from 1451 to 9 categories
Reduced Item_Store_ID from 4990 to 9 categories
Reduced Item_Type from 16 to 9 categories
Numerical columns: ['Store_Start_Year', 'Item_Price', 'Item_Visibility', 'Item_Weight']
Categorical columns: ['Item_ID', 'Item_Sugar_Content', 'Item_Store_ID', 'Item_Type', 'Store_Location_Type', 'Store_ID', 'Store_Size', 'Store_Type']


In [148]:
X.head(2)

Unnamed: 0,Item_ID,Item_Sugar_Content,Visibility_Category,Item_Store_ID,Store_Start_Year,Item_Price,Item_Visibility,Item_Type,Store_Location_Type,Store_ID,Item_Weight,Store_Size,Store_Type
0,Other,Low Sugar,Low,DRA12_BABATUNJI010,2005,357.54,0.068535,Other,Cluster 3,BABATUNJI010,11.6,Unknown,Grocery Store
1,Other,Low Sugar,Low,DRA12_BABATUNJI013,1994,355.79,0.040912,Other,Cluster 3,BABATUNJI013,11.6,High,Supermarket Type1


In [149]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Shape of x_train: {x_train.shape}")
print(f"Shape of x_test: {x_test.shape}")

Shape of x_train: (3992, 13)
Shape of x_test: (998, 13)


In [150]:
numerical_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols)
])

# OPTIMIZED: Faster Random Forest settings
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(
        n_estimators=50,      # Reduced from 100 for 2X speed
        max_depth=15,         # Limited depth for speed
        min_samples_split=10, # Increased for speed
        n_jobs=-1,
        random_state=42
    ))
])

model.fit(x_train, y_train)

In [151]:
y_pred = model.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse}")
r2 = r2_score(y_test, y_pred)
print(f"R2 Score: {r2}")

RMSE: 3054.616423355668
R2 Score: 0.5154582874207962


In [152]:
# OPTIMIZED: Smarter grid search (3X faster)
param_grid = {
    'regressor__n_estimators': [50, 100],     # Reduced options
    'regressor__max_depth': [10, 15],         # Reduced options
    'regressor__min_samples_split': [5, 10],  # More focused range
}
grid = GridSearchCV(model, param_grid, cv=3, scoring='r2', n_jobs=-1)  # cv=3 instead of 5
grid.fit(x_train, y_train)
print("Best R2:", grid.best_score_)

Best R2: 0.542838742525772


In [153]:
print("Best Parameters:", grid.best_params_)

best_model = grid.best_estimator_
y_pred = best_model.predict(x_test)
print("Test R2:", r2_score(y_test, y_pred))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

Best Parameters: {'regressor__max_depth': 10, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 100}
Test R2: 0.5254244364687379
Test RMSE: 3023.0392332032116


In [154]:
# OPTIMIZED: Fixed test prediction with consistent columns
test_final = test_fe[common_cols]
test_pred = best_model.predict(test_final)
test_pred

array([1281.94501532, 8081.24881945, 8144.45229147, ..., 9209.05806597,
       1749.16152187, 7586.68809207])

In [155]:
print("🚀 Testing Different Algorithms...")

# Get your preprocessed data ready
X_processed = preprocessor.fit_transform(x_train)
X_test_processed = preprocessor.transform(x_test)

# 1. XGBoost Model
print("\n1️⃣ Training XGBoost...")
xgb_model = xgb.XGBRegressor(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42)
xgb_model.fit(X_processed, y_train)
xgb_pred = xgb_model.predict(X_test_processed)
xgb_r2 = r2_score(y_test, xgb_pred)
xgb_rmse = np.sqrt(mean_squared_error(y_test, xgb_pred))
print(f"XGBoost R2: {xgb_r2:.4f}")
print(f"XGBoost RMSE: {xgb_rmse:.4f}")

🚀 Testing Different Algorithms...

1️⃣ Training XGBoost...
XGBoost R2: 0.5068
XGBoost RMSE: 3081.7268


In [156]:
# 2. LightGBM Model
print("\n2️⃣ Training LightGBM...")
lgb_model = lgb.LGBMRegressor(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42, verbose=-1)
lgb_model.fit(X_processed, y_train)
lgb_pred = lgb_model.predict(X_test_processed)
lgb_r2 = r2_score(y_test, lgb_pred)
lgb_rmse = np.sqrt(mean_squared_error(y_test, lgb_pred))
print(f"LightGBM R2: {lgb_r2:.4f}")
print(f"LightGBM RMSE: {lgb_rmse:.4f}")


2️⃣ Training LightGBM...
LightGBM R2: 0.5213
LightGBM RMSE: 3036.0476


In [158]:
# 3. CatBoost Model
print("\n3️⃣ Training CatBoost...")
cat_model = CatBoostRegressor(iterations=100, depth=5, learning_rate=0.1, random_state=42, verbose=False)
cat_model.fit(X_processed, y_train)
cat_pred = cat_model.predict(X_test_processed)
cat_r2 = r2_score(y_test, cat_pred)
cat_rmse = np.sqrt(mean_squared_error(y_test, cat_pred))
print(f"CatBoost R2: {cat_r2:.4f}")
print(f"CatBoost RMSE: {cat_rmse:.4f}")


3️⃣ Training CatBoost...
CatBoost R2: 0.5429
CatBoost RMSE: 2966.9907


In [159]:
cat_model = CatBoostRegressor(
    iterations=1488,
    depth=4,
    learning_rate=0.10745798030355086,
    l2_leaf_reg=4.394828825957951,
    border_count=167,
    bagging_temperature=0.17642582012086444,
    random_strength=0.004428635483032767,
    random_state=42,
    verbose=False
)

# Add early stopping and validation for even better performance
cat_model.fit(
    X_processed, y_train,
    eval_set=(X_test_processed, y_test),  # Validation set for early stopping
    early_stopping_rounds=50,             # Stop if no improvement for 50 rounds
    verbose=False
)

cat_pred = cat_model.predict(X_test_processed)
cat_r2 = r2_score(y_test, cat_pred)
cat_rmse = np.sqrt(mean_squared_error(y_test, cat_pred))

print(f"CatBoost R2: {cat_r2:.4f}")
print(f"CatBoost RMSE: {cat_rmse:.4f}")

# Optional: Print feature importance
print(f"\n📊 Model Info:")
print(f"Best iteration: {cat_model.get_best_iteration()}")
print(f"Total iterations run: {cat_model.tree_count_}")

CatBoost R2: 0.5487
CatBoost RMSE: 2947.8143

📊 Model Info:
Best iteration: 39
Total iterations run: 40


In [160]:
# 4. Compare with your Random Forest
rf_pred = best_model.predict(x_test)
rf_r2 = r2_score(y_test, rf_pred)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
print(f"\n🌲 Random Forest R2: {rf_r2:.4f}")
print(f"Random Forest RMSE: {rf_rmse:.4f}")


🌲 Random Forest R2: 0.5254
Random Forest RMSE: 3023.0392


In [161]:
# 5. Summary Comparison
print("\n📊 ALGORITHM COMPARISON:")
print("-" * 40)
results = [
    ("Random Forest", rf_r2, rf_rmse),
    ("XGBoost", xgb_r2, xgb_rmse),
    ("LightGBM", lgb_r2, lgb_rmse),
    ("CatBoost", cat_r2, cat_rmse)
]

for name, r2, rmse in sorted(results, key=lambda x: x[1], reverse=True):
    print(f"{name:15} | R2: {r2:.4f} | RMSE: {rmse:.4f}")

# 6. Pick the best model for final predictions
best_algorithm = max(results, key=lambda x: x[1])
print(f"\n🏆 Best Algorithm: {best_algorithm[0]} (R2: {best_algorithm[1]:.4f})")

# 7. Make final predictions with best model
test_processed = preprocessor.transform(test_fe[common_cols])

if best_algorithm[0] == "XGBoost":
    final_predictions = xgb_model.predict(test_processed)
elif best_algorithm[0] == "LightGBM":
    final_predictions = lgb_model.predict(test_processed)
elif best_algorithm[0] == "CatBoost":
    final_predictions = cat_model.predict(test_processed)
else:
    final_predictions = best_model.predict(test_fe[common_cols])

print(f"Final predictions shape: {final_predictions.shape}")


📊 ALGORITHM COMPARISON:
----------------------------------------
CatBoost        | R2: 0.5487 | RMSE: 2947.8143
Random Forest   | R2: 0.5254 | RMSE: 3023.0392
LightGBM        | R2: 0.5213 | RMSE: 3036.0476
XGBoost         | R2: 0.5068 | RMSE: 3081.7268

🏆 Best Algorithm: CatBoost (R2: 0.5487)
Final predictions shape: (3532,)


In [162]:
test_final = test_fe[common_cols]
test_pred = best_model.predict(test_final)
test_pred

array([1281.94501532, 8081.24881945, 8144.45229147, ..., 9209.05806597,
       1749.16152187, 7586.68809207])

In [163]:
test_data.head(2)

Unnamed: 0,Item_ID,Store_ID,Item_Store_ID,Item_Weight,Item_Sugar_Content,Item_Visibility,Item_Type,Item_Price,Store_Start_Year,Store_Size,Store_Location_Type,Store_Type
0,DRA59,BABATUNJI010,DRA59_BABATUNJI010,8.27,Normal Sugar,0.214125,Soft Drinks,459.98,2005,,Cluster 3,Grocery Store
1,DRA59,BABATUNJI013,DRA59_BABATUNJI013,8.27,Normal Sugar,0.127821,Soft Drinks,464.98,1994,High,Cluster 3,Supermarket Type1


In [164]:
submission = pd.DataFrame({
    'Item_Store_ID': test_data['Item_Store_ID'],
    'Item_Store_Returns': test_pred
})
submission.to_csv('submission.csv', index=False)

In [165]:
from google.colab import files
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>