In [3]:
#preparing the data

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor

# Step 1: Load data
train_df = pd.read_csv("Train.csv")
test_df_original = pd.read_csv("Test.csv")  # Keep original for final output
submission_df = pd.read_csv("Submission.csv")  # contains actuals

# Step 2: Merge actual sales with test data for training/evaluation
test_df = pd.merge(test_df_original.copy(), submission_df[['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales']],
                   on=['Item_Identifier', 'Outlet_Identifier'])

# Step 3: Combine for preprocessing
train_df['source'] = 'train'
test_df['source'] = 'test'
combined = pd.concat([train_df, test_df], ignore_index=True)

# Step 4: Data cleaning
combined['Item_Fat_Content'].replace({'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}, inplace=True)
combined['Item_Weight'].fillna(combined['Item_Weight'].mean(), inplace=True)
combined['Outlet_Size'].fillna(combined['Outlet_Size'].mode()[0], inplace=True)

# Step 5: Feature engineering
combined['Outlet_Age'] = 2025 - combined['Outlet_Establishment_Year']

# Step 6: Encode categorical variables
le = LabelEncoder()
for col in ['Item_Fat_Content', 'Outlet_Size', 'Outlet_Location_Type',
            'Outlet_Type', 'Outlet_Identifier', 'Item_Type']:
    combined[col] = le.fit_transform(combined[col])

# Step 7: Drop unused columns
combined.drop(['Item_Identifier', 'Outlet_Establishment_Year'], axis=1, inplace=True)

# Step 8: Split back to train and test sets
train_processed = combined[combined['source'] == 'train'].drop(columns='source')
test_processed = combined[combined['source'] == 'test'].drop(columns='source')

X_train = train_processed.drop('Item_Outlet_Sales', axis=1)
y_train = train_processed['Item_Outlet_Sales']
X_test = test_processed.drop('Item_Outlet_Sales', axis=1)
y_test = test_processed['Item_Outlet_Sales']  # from submission file

# Step 9: Train model
model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
model.fit(X_train, y_train)

# Step 10: Predict and evaluate
y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f" Model Evaluation on Actual Test Data:")
print(f"R² Score: {r2:.4f}")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")

# Step 11: Save predictions
# Match actuals from submission file
final_output = test_df_original[['Item_Identifier', 'Outlet_Identifier']].copy()
final_output = pd.merge(final_output, submission_df, on=['Item_Identifier', 'Outlet_Identifier'])
final_output.rename(columns={'Item_Outlet_Sales': 'Actual_Sales'}, inplace=True)
final_output['Predicted_Sales'] = y_pred
final_output['Error'] = abs(final_output['Actual_Sales'] - final_output['Predicted_Sales'])

final_output.to_csv("XGBoost_Sales_Predictions_Final.csv", index=False)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined['Item_Fat_Content'].replace({'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined['Item_Weight'].fillna(combined['Item_Weight'].mean(), inplace=True)
The behavior will change in pandas 3.0. This 

 Model Evaluation on Actual Test Data:
R² Score: 0.9390
MAE: 242.68
RMSE: 352.45




In [3]:
import joblib

# Save the trained model
joblib.dump(model, 'xgb_sales_model.pkl')

# Save the feature column order for prediction consistency
feature_columns = X_train.columns.tolist()
joblib.dump(feature_columns, 'xgb_feature_columns.pkl')


['xgb_feature_columns.pkl']

In [None]:
mean_actual = final_output['Actual_Sales'].mean()
mean_error = final_output['Error'].mean()
mean_absolute_percentage_error = (mean_error / mean_actual) * 100
accuracy = 100 - mean_absolute_percentage_error
 
print(f"\nModel Accuracy Metrics:")
print(f"Mean Actual Sales: {mean_actual:.2f}")
print(f"Mean Absolute Error: {mean_error:.2f}")
print(f"Mean Absolute Percentage Error (MAPE): {mean_absolute_percentage_error:.2f}%")
print(f"Model Accuracy (100 - MAPE): {accuracy:.2f}%")