In [5]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Imports for the whole notebook. No visible cell imports these names, so a
# fresh "Restart Kernel -> Run All" would otherwise stop with a NameError.
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder

# 1. Load the data
# Both CSV files are read from the current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
# 2. Tag datasets
# Record each row's origin in a 'source' column before the frames are combined.
for frame, origin in ((train, 'train'), (test, 'test')):
    frame['source'] = origin
# Placeholder target column for the test rows (all NaN).
test['Item_Store_Returns'] = np.nan

In [4]:
# 3. Combine datasets
# Stack the train rows on top of the test rows under a fresh 0..n-1 index.
data = pd.concat(objs=[train, test], axis=0, ignore_index=True)

In [5]:
# Preview the first five rows of the combined frame.
data.head()

Unnamed: 0,Item_ID,Store_ID,Item_Store_ID,Item_Weight,Item_Sugar_Content,Item_Visibility,Item_Type,Item_Price,Store_Start_Year,Store_Size,Store_Location_Type,Store_Type,Item_Store_Returns,source
0,DRA12,BABATUNJI010,DRA12_BABATUNJI010,11.6,Low Sugar,0.068535,Soft Drinks,357.54,2005,,Cluster 3,Grocery Store,709.08,train
1,DRA12,BABATUNJI013,DRA12_BABATUNJI013,11.6,Low Sugar,0.040912,Soft Drinks,355.79,1994,High,Cluster 3,Supermarket Type1,6381.69,train
2,DRA12,BABATUNJI017,DRA12_BABATUNJI017,11.6,Low Sugar,0.041178,Soft Drinks,350.79,2014,,Cluster 2,Supermarket Type1,6381.69,train
3,DRA12,BABATUNJI018,DRA12_BABATUNJI018,11.6,Low Sugar,0.041113,Soft Drinks,355.04,2016,Medium,Cluster 3,Supermarket Type2,2127.23,train
4,DRA12,BABATUNJI035,DRA12_BABATUNJI035,11.6,Ultra Low Sugar,0.0,Soft Drinks,354.79,2011,Small,Cluster 2,Supermarket Type1,2481.77,train


In [6]:
# Column dtypes and non-null counts of the combined frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8522 entries, 0 to 8521
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Item_ID              8522 non-null   object 
 1   Store_ID             8522 non-null   object 
 2   Item_Store_ID        8522 non-null   object 
 3   Item_Weight          7059 non-null   float64
 4   Item_Sugar_Content   8522 non-null   object 
 5   Item_Visibility      8522 non-null   float64
 6   Item_Type            8522 non-null   object 
 7   Item_Price           8522 non-null   float64
 8   Store_Start_Year     8522 non-null   int64  
 9   Store_Size           6113 non-null   object 
 10  Store_Location_Type  8522 non-null   object 
 11  Store_Type           8522 non-null   object 
 12  Item_Store_Returns   4990 non-null   float64
 13  source               8522 non-null   object 
dtypes: float64(4), int64(1), object(9)
memory usage: 932.2+ KB


In [7]:
# Number of missing values per column.
data.isnull().sum()

Item_ID                   0
Store_ID                  0
Item_Store_ID             0
Item_Weight            1463
Item_Sugar_Content        0
Item_Visibility           0
Item_Type                 0
Item_Price                0
Store_Start_Year          0
Store_Size             2409
Store_Location_Type       0
Store_Type                0
Item_Store_Returns     3532
source                    0
dtype: int64

In [8]:
# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

0

In [9]:
# 4. Fill missing values
# The filled result is assigned back rather than calling
# `data[col].fillna(..., inplace=True)`: that chained in-place form emits a
# FutureWarning on pandas 2.x and does not update `data` under copy-on-write.
numeric_cols = ['Item_Weight', 'Item_Visibility', 'Item_Price']
# Numeric gaps -> per-column mean, computed over the combined train+test frame.
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].mean())

# Categorical gap -> most frequent value of the column.
data['Store_Size'] = data['Store_Size'].fillna(data['Store_Size'].mode()[0])

In [10]:
# Number of missing values per column.
data.isnull().sum()

Item_ID                   0
Store_ID                  0
Item_Store_ID             0
Item_Weight               0
Item_Sugar_Content        0
Item_Visibility           0
Item_Type                 0
Item_Price                0
Store_Start_Year          0
Store_Size                0
Store_Location_Type       0
Store_Type                0
Item_Store_Returns     3532
source                    0
dtype: int64

In [11]:
# 5. Feature Engineering
reference_year = 2025
weight_bin_edges = [0, 5, 10, 20, 100]
weight_bin_labels = ['Very_Light', 'Light', 'Medium', 'Heavy']

# Years between the store's start year and the reference year.
data['Store_Age'] = reference_year - data['Store_Start_Year']
# Price relative to visibility; the small constant avoids division by zero.
# NOTE(review): this uses the visibility values *before* the cap applied below.
data['Price_Visibility_Ratio'] = data['Item_Price'].div(data['Item_Visibility'] + 1e-5)
# cap outliers
data['Item_Visibility'] = data['Item_Visibility'].clip(upper=0.5)
data['Item_Price_per_Weight'] = data['Item_Price'].div(data['Item_Weight'])
# Built from the capped visibility.
data['Store_Exposure'] = data['Item_Visibility'].mul(data['Store_Age'])
# Bucket weights into four labelled ranges.
data['Item_Weight_Category'] = pd.cut(
    data['Item_Weight'],
    bins=weight_bin_edges,
    labels=weight_bin_labels,
)


In [12]:
# Preview the first five rows, now including the engineered columns.
data.head()

Unnamed: 0,Item_ID,Store_ID,Item_Store_ID,Item_Weight,Item_Sugar_Content,Item_Visibility,Item_Type,Item_Price,Store_Start_Year,Store_Size,Store_Location_Type,Store_Type,Item_Store_Returns,source,Store_Age,Price_Visibility_Ratio,Item_Price_per_Weight,Store_Exposure,Item_Weight_Category
0,DRA12,BABATUNJI010,DRA12_BABATUNJI010,11.6,Low Sugar,0.068535,Soft Drinks,357.54,2005,Medium,Cluster 3,Grocery Store,709.08,train,20,5216.132,30.822414,1.370701,Medium
1,DRA12,BABATUNJI013,DRA12_BABATUNJI013,11.6,Low Sugar,0.040912,Soft Drinks,355.79,1994,High,Cluster 3,Supermarket Type1,6381.69,train,31,8694.383,30.671552,1.268267,Medium
2,DRA12,BABATUNJI017,DRA12_BABATUNJI017,11.6,Low Sugar,0.041178,Soft Drinks,350.79,2014,Medium,Cluster 2,Supermarket Type1,6381.69,train,11,8516.903,30.240517,0.452953,Medium
3,DRA12,BABATUNJI018,DRA12_BABATUNJI018,11.6,Low Sugar,0.041113,Soft Drinks,355.04,2016,Medium,Cluster 3,Supermarket Type2,2127.23,train,9,8633.676,30.606897,0.370014,Medium
4,DRA12,BABATUNJI035,DRA12_BABATUNJI035,11.6,Ultra Low Sugar,0.0,Soft Drinks,354.79,2011,Small,Cluster 2,Supermarket Type1,2481.77,train,14,35479000.0,30.585345,0.0,Medium


In [13]:
# 6. Encode categorical variables
# Identifier columns first, followed by the descriptive categorical columns.
categorical_cols = ['Item_ID', 'Store_ID', 'Item_Store_ID'] + [
    'Item_Sugar_Content',
    'Item_Type',
    'Store_Size',
    'Store_Location_Type',
    'Store_Type',
]

In [14]:
# Label encode high-cardinality IDs, one-hot for others
label_enc = LabelEncoder()
for col in ['Item_ID', 'Store_ID', 'Item_Store_ID']:
    # fit_transform refits the encoder for each column, so every column gets
    # its own integer code space.
    data[col] = label_enc.fit_transform(data[col].astype(str))

# 'Item_Weight_Category' (created with pd.cut) holds string labels. It has to
# be one-hot encoded as well, otherwise a non-numeric column reaches the
# estimators and fitting fails.
data = pd.get_dummies(data, columns=[
    'Item_Sugar_Content', 'Item_Type', 'Store_Size', 'Store_Location_Type', 'Store_Type',
    'Item_Weight_Category'
])

In [15]:
# 7. Separate back to train/test
# Row masks built from the 'source' tag column.
from_train = data['source'] == 'train'
from_test = data['source'] == 'test'
# The tag column is dropped from both parts; the target column is dropped
# from the test part only.
train_final = data.loc[from_train].drop(columns=['source'])
test_final = data.loc[from_test].drop(columns=['source', 'Item_Store_Returns'])

In [16]:
# Define X and y
target_col = 'Item_Store_Returns'
# Features: every training column except the target.
X = train_final.drop(columns=target_col)
# Target: log1p transform of the returns cast to float.
y = np.log1p(train_final[target_col].astype(float))

In [None]:
# Fixed Task Type (hardcoded to regression)
task_type = 'regression'
task_label = task_type.upper()
print(f"Detected Task Type: {task_label}")

In [27]:
# 10. Train/Test split for local validation
# 20% of the rows are held out for validation; the seed is fixed so the split
# is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [28]:
# Sanity check: report the dtype of the training target.
print("y_train dtype:", y_train.dtype)


y_train dtype: float64


In [29]:
# Sanity check: total count of missing values across all training features.
print("Total missing values in X_train:", X_train.isnull().sum().sum())

Total missing values in X_train: 0


In [None]:
#GridSearchCV setup for regression only
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters: 2 x 3 x 2 = 12 combinations.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}
model = RandomForestRegressor(random_state=42)

# Every combination is scored with 3-fold CV on negated MSE.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
)
grid.fit(X_train, y_train)

# Keep the estimator for the best-scoring parameter combination.
best_model = grid.best_estimator_
print(f"Best Params: {grid.best_params_}")

In [60]:
# 12. Validation score
y_pred_val = best_model.predict(X_val)
if task_type == 'regression':
    # The target was log1p-transformed when y was defined, so both the truth
    # and the predictions are mapped back with expm1 before scoring. This
    # reports RMSE in the original units of Item_Store_Returns, consistent
    # with how the XGBoost model is evaluated in this notebook.
    rmse = np.sqrt(mean_squared_error(np.expm1(y_val), np.expm1(y_pred_val)))
    print(f"Validation RMSE: {rmse:.3f}")
else:
    acc = accuracy_score(y_val, y_pred_val)
    print(f"Validation Accuracy: {acc:.3f}")


Validation RMSE: 2983.948


In [61]:
# 13. Cross-validation (optional)
is_regression = task_type == 'regression'
cv_metric = 'neg_mean_squared_error' if is_regression else 'accuracy'
# 5-fold CV of the tuned estimator on the full training features and target.
cv_scores = cross_val_score(best_model, X, y, cv=5, scoring=cv_metric)
mean_score = cv_scores.mean()
# The regression scorer is a negated MSE, so the sign is flipped for display.
print("CV Score:", (-mean_score) if is_regression else mean_score)


CV Score: 9290647.220463531


In [62]:
# 14. Final prediction on test
# The model is trained on a log1p-transformed target, so its raw output is in
# log space. expm1 maps the predictions back to the original scale before they
# are written to the submission file.
predictions = np.expm1(best_model.predict(test_final))


In [63]:
# 15. Output to CSV
# Start from the ID column of the untouched test frame and attach the
# predictions as a second column.
output = test[['Item_Store_ID']].assign(Predicted_Returns=predictions)
output.to_csv('submission.csv', index=False)
print("Predictions saved to 'submission.csv'")

Predictions saved to 'submission.csv'


In [None]:








# 5. Separate train/test
# NOTE(review): this cell repeats the train/test separation, target definition
# and validation split that already appear earlier in the notebook.
from_train = data['source'] == 'train'
from_test = data['source'] == 'test'
train_final = data.loc[from_train].drop(columns=['source'])
test_final = data.loc[from_test].drop(columns=['source', 'Item_Store_Returns'])

target_col = 'Item_Store_Returns'
X = train_final.drop(columns=target_col)
y = np.log1p(train_final[target_col].astype(float))  # log1p transform

# 6. Train-test split for validation
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)



In [None]:
# 7. XGBoost model
# early_stopping_rounds is passed to the constructor: supplying it to fit()
# was deprecated in XGBoost 1.6 and removed in 2.0, where it raises TypeError.
model = xgb.XGBRegressor(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    early_stopping_rounds=20,
)
# Training stops once the validation metric has not improved for 20 rounds.
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
# Clear the setting so that a later fit() without an eval_set (the full-data
# refit) does not fail for lack of a validation set. This does not alter the
# booster that was just trained.
model.set_params(early_stopping_rounds=None);


In [None]:

# 8. Predict and evaluate
val_preds = model.predict(X_val)
# RMSE is reported on the original scale (expm1 undoes the log1p target
# transform). np.sqrt(MSE) replaces `squared=False`, which was deprecated in
# scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(np.expm1(y_val), np.expm1(val_preds)))
print("Improved RMSE on validation set:", round(rmse, 4))

# 9. Retrain on full data and predict test
model.fit(X, y)
test_preds = np.expm1(model.predict(test_final))  # Inverse log1p


In [None]:
# 10. Save predictions
# IDs come from the untouched test frame; predictions are attached as a
# second column before writing.
submission = pd.DataFrame({'Item_Store_ID': test['Item_Store_ID']})
submission['Predicted_Returns'] = test_preds
submission.to_csv('submission_xgboost.csv', index=False)

print("Submission saved as 'submission_xgboost.csv'")


In [None]:
#GridSearchCV setup for regression only
# NOTE(review): this cell repeats the RandomForest grid search that appears
# earlier in the notebook; running it rebinds `model`, `grid` and `best_model`.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters: 2 x 3 x 2 = 12 combinations.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}
model = RandomForestRegressor(random_state=42)

# Every combination is scored with 3-fold CV on negated MSE.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
)
grid.fit(X_train, y_train)

# Keep the estimator for the best-scoring parameter combination.
best_model = grid.best_estimator_
print(f"Best Params: {grid.best_params_}")