In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

/kaggle/input/fertilizer-prediction/Fertilizer Prediction.csv
/kaggle/input/playground-series-s5e6/sample_submission.csv
/kaggle/input/playground-series-s5e6/train.csv
/kaggle/input/playground-series-s5e6/test.csv


In [2]:
import pandas as pd
import numpy as np
import os
import time
import warnings
from xgboost import XGBClassifier
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
import optuna
from sklearn.model_selection import StratifiedKFold, train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
# Install a blanket "ignore" filter: no category, message or module is given,
# so every warning raised after this point is suppressed.
warnings.filterwarnings('ignore')

In [14]:
# Read the four CSV inputs in a fixed order: competition train/test, the extra
# "original" fertilizer dataset, and the sample submission.
train, test, original, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s5e6/train.csv',
        '/kaggle/input/playground-series-s5e6/test.csv',
        '/kaggle/input/fertilizer-prediction/Fertilizer Prediction.csv',
        '/kaggle/input/playground-series-s5e6/sample_submission.csv',
    )
)

In [4]:
# Remove the 'id' column from every frame that carries one (frames without it
# pass through unchanged).
train, test, original = (
    frame.drop(columns='id', errors='ignore')
    for frame in (train, test, original)
)

# Append the original dataset's rows to the training data with a fresh index.
# NOTE(review): concat aligns on column labels — confirm `original` uses exactly
# the same column names as `train`, otherwise NaN-filled columns appear.
train = pd.concat([train, original], ignore_index=True)

In [5]:
# Categorical feature columns: every object/category column except the target.
cat_cols = (
    train.select_dtypes(include=['object', 'category'])
    .columns
    .drop("Fertilizer Name", errors='ignore')
    .tolist()
)

# Fit one encoder per column on the training values, then apply the same
# mapping to both frames (a test value unseen in train raises in transform).
for col in cat_cols:
    le = LabelEncoder().fit(train[col])
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

In [6]:
# Encode the target as integers; the fitted encoder is kept so predictions can
# be mapped back to fertilizer names later.
f_label_enc = LabelEncoder().fit(train["Fertilizer Name"])
train["Fertilizer Name"] = f_label_enc.transform(train["Fertilizer Name"])

In [7]:
from scipy.stats import skew

# Log-transform skewed numeric features.
#
# By this point the target and the categorical columns have been label-encoded
# to integers, so a plain "all numeric columns" selection would also pick them
# up. They are excluded here because:
#   * log1p on the target would turn class ids into floats, and the target is
#     absent from `test`, so `test[high_skew]` would raise a KeyError;
#   * log1p on label codes has no meaning (the codes are arbitrary ids).
_skew_excluded = set(cat_cols) | {"Fertilizer Name"}
numerical_cols = pd.Index([
    col
    for col in train.select_dtypes(include=[np.number]).columns
    if col not in _skew_excluded and col in test.columns
])

# Sample skewness per candidate column (NaNs ignored), largest first.
skewed_features = pd.Series(
    {col: skew(train[col].dropna()) for col in numerical_cols},
    dtype=float,
).sort_values(ascending=False)

# Columns whose absolute skewness exceeds 0.5 get the same log1p transform in
# both frames; nothing is touched when no column qualifies.
high_skew = skewed_features[skewed_features.abs() > 0.5].index
if len(high_skew) > 0:
    train[high_skew] = np.log1p(train[high_skew])
    test[high_skew] = np.log1p(test[high_skew])

In [8]:
# Display the test frame after encoding/transforms (the rendered output below
# is truncated to the first and last rows).
test

Unnamed: 0,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous
0,31,70,52,4,10,34,11,24
1,27,62,45,3,8,30,14,15
2,28,72,28,1,2,14,15,4
3,37,53,57,0,2,18,17,36
4,31,55,32,3,7,13,19,14
...,...,...,...,...,...,...,...,...
249995,26,66,30,3,8,14,7,18
249996,33,62,55,3,7,28,14,7
249997,36,53,64,0,6,28,11,27
249998,36,67,26,1,6,33,0,10


In [9]:
# Target vector, feature matrix (everything except the target) and the test
# features used for inference.
y = train["Fertilizer Name"]
X = train.drop("Fertilizer Name", axis=1)
X_test = test

In [10]:
def mapk(actual, predicted, k=3):
    """Mean average precision at k.

    Parameters
    ----------
    actual : iterable
        One entry per sample. Each entry is either a single true label
        (scalar) or a list/tuple/set/array of true labels. Already-wrapped
        entries such as ``[label]`` are used as-is instead of being wrapped a
        second time.
    predicted : iterable of sequences
        One ranked sequence of predicted labels per sample, best first. Only
        the first ``k`` predictions of each row are scored.
    k : int, default 3
        Cut-off rank.

    Returns
    -------
    float
        Mean of the per-sample average precision at ``k``. A sample with no
        true labels contributes 0.0.
    """
    def as_label_list(a):
        # Normalise one "actual" entry to a flat list of labels.
        if isinstance(a, np.ndarray):
            return a.ravel().tolist()
        if isinstance(a, (list, tuple, set, frozenset)):
            return list(a)
        return [a]

    def apk(a, p, k):
        # Average precision at k for one sample.
        if not a:
            # No true labels: avoid dividing by zero below.
            return 0.0
        score = 0.0
        hits = 0.0
        seen = set()
        for i, pred in enumerate(p[:k]):
            # A repeated prediction is only credited the first time it appears.
            if pred in a and pred not in seen:
                hits += 1.0
                score += hits / (i + 1.0)
                seen.add(pred)
        return score / min(len(a), k)

    return np.mean([apk(as_label_list(a), p, k) for a, p in zip(actual, predicted)])

In [11]:
%%time

FOLDS = 5
SEED = 42
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

num_classes = y.nunique()
# Out-of-fold class probabilities (one row per training sample) and the
# fold-averaged class probabilities for the test set.
oof = np.zeros((len(X), num_classes))
pred_prob = np.zeros((len(X_test), num_classes))

xgb_params = {
    'objective': 'multi:softprob',
    'num_class': num_classes,
    'max_depth': 7,
    'learning_rate': 0.03,
    'subsample': 0.8,
    'max_bin': 128,
    'colsample_bytree': 0.3,
    'colsample_bylevel': 1,
    'colsample_bynode': 1,
    # The GPU is selected through `device`; 'gpu_hist' is deprecated in favour
    # of tree_method='hist' + device='cuda'. Set 'device' to 'cpu' to train
    # without CUDA.
    'tree_method': 'hist',
    'device': 'cuda',
    'random_state': SEED,
    'eval_metric': 'mlogloss',
    # Upper bound on boosting rounds; early stopping on the validation fold
    # decides the actual number of rounds.
    'n_estimators': 10000,
    'early_stopping_rounds': 50
}

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y), 1):
    print(f"{'='*10} Fold {fold} {'='*10}")

    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    model = XGBClassifier(**xgb_params)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=0)

    oof[valid_idx] = model.predict_proba(X_valid)
    pred_prob += model.predict_proba(X_test) / FOLDS  # Average predictions over folds

    # MAP@3 evaluation: three most probable classes per row, best first.
    # Labels are passed as plain scalars; mapk does the per-sample wrapping.
    top_3_preds = np.argsort(oof[valid_idx], axis=1)[:, -3:][:, ::-1]
    map3_score = mapk(y_valid, top_3_preds)
    print(f"✅ Fold {fold} MAP@3 Score: {map3_score:.5f}")

# Final evaluation on the entire set of out-of-fold predictions.
logloss_score = log_loss(y, oof)
print(f"\n📊 Overall Log Loss: {logloss_score:.5f}")

# Same MAP@3 as reported per fold, computed over all out-of-fold rows.
oof_top_3 = np.argsort(oof, axis=1)[:, -3:][:, ::-1]
oof_map3_score = mapk(y, oof_top_3)
print(f"📊 Overall OOF MAP@3: {oof_map3_score:.5f}")

✅ Fold 1 MAP@3 Score: 0.36304
✅ Fold 2 MAP@3 Score: 0.36274
✅ Fold 3 MAP@3 Score: 0.36266
✅ Fold 4 MAP@3 Score: 0.36195
✅ Fold 5 MAP@3 Score: 0.36161

📊 Overall Log Loss: 1.89402
CPU times: user 27min 33s, sys: 1min 4s, total: 28min 37s
Wall time: 28min 25s


In [15]:
# Rank classes by averaged probability and keep the three best per row
# (walking the ascending argsort backwards yields best-first order).
ranked_ascending = np.argsort(pred_prob, axis=1)
top_3_preds = ranked_ascending[:, :-4:-1]

# Map the encoded class ids back to fertilizer names, keeping the (rows, 3) layout.
flat_labels = f_label_enc.inverse_transform(top_3_preds.ravel())
top_3_labels = flat_labels.reshape(top_3_preds.shape)

# One space-separated string of three names per id.
joined_names = list(map(' '.join, top_3_labels))
submission = pd.DataFrame({
    'id': submission['id'],
    'Fertilizer Name': joined_names,
})

In [16]:
# Write the submission to the working directory without the DataFrame index.
submission.to_csv(path_or_buf='submission.csv', index=False)
print("✅ Submission file saved as 'submission.csv'")

✅ Submission file saved as 'submission.csv'
