In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amex-challenge/test_data.parquet
/kaggle/input/amex-challenge/add_event.parquet
/kaggle/input/amex-challenge/685404e30cfdb_submission_template.csv
/kaggle/input/amex-challenge/data_dictionary.csv
/kaggle/input/amex-challenge/offer_metadata.parquet
/kaggle/input/amex-challenge/add_trans.parquet
/kaggle/input/amex-challenge/train_data.parquet


In [2]:
# STEP 0: Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold, KFold
import lightgbm as lgb
import optuna
from sklearn.feature_extraction.text import TfidfVectorizer
import gc

In [3]:
# STEP 1: Load Data
base = '/kaggle/input/amex-challenge/'
train = pd.read_parquet(base + 'train_data.parquet')
test = pd.read_parquet(base + 'test_data.parquet')
events = pd.read_parquet(base + 'add_event.parquet')
trans = pd.read_parquet(base + 'add_trans.parquet')
offers = pd.read_parquet(base + 'offer_metadata.parquet')

In [4]:
# Standardize IDs
for df in [train, test, events, trans]:
    df['id2'] = df['id2'].astype(str)
    if 'id3' in df.columns:
        df['id3'] = df['id3'].astype(str)
offers['id3'] = offers['id3'].astype(str)

# Merge offer metadata
train = train.merge(offers, on='id3', how='left')
test = test.merge(offers, on='id3', how='left')

In [5]:
# Aggregate transaction features
agg_trans = trans.groupby('id2')['f367'].agg(['sum', 'mean', 'count']).reset_index()
agg_trans.columns = ['id2', 'total_spend', 'avg_spend', 'txn_count']
train = train.merge(agg_trans, on='id2', how='left')
test = test.merge(agg_trans, on='id2', how='left')
# Event interaction rate
events['click_flag'] = events['id7'].notnull().astype(int)
click_rate = events.groupby('id2')['click_flag'].mean().reset_index(name='click_rate')
train = train.merge(click_rate, on='id2', how='left')
test = test.merge(click_rate, on='id2', how='left')

In [6]:
# Temporal features
for df in [train, test]:
    df['id4'] = pd.to_datetime(df['id4'], errors='coerce')
    df['day'] = df['id4'].dt.dayofweek
    df['hour'] = df['id4'].dt.hour
    df['recency'] = (df['id4'].max() - df['id4']).dt.days
    df['id5'] = df['id4'].dt.strftime('%m/%d/%y')  # MM/DD/YY for submission
# TF-IDF on offer body (f378)
tfidf = TfidfVectorizer(max_features=50)
tfidf_train = tfidf.fit_transform(train['f378'].fillna(''))
tfidf_test = tfidf.transform(test['f378'].fillna(''))
tfidf_train_df = pd.DataFrame(tfidf_train.toarray(), columns=[f'tfidf_{i}' for i in range(tfidf_train.shape[1])])
tfidf_test_df = pd.DataFrame(tfidf_test.toarray(), columns=[f'tfidf_{i}' for i in range(tfidf_test.shape[1])])
train = pd.concat([train.reset_index(drop=True), tfidf_train_df], axis=1)
test = pd.concat([test.reset_index(drop=True), tfidf_test_df], axis=1)

In [7]:
import pandas as pd
from sklearn.model_selection import KFold

# Convert y to numeric and drop any rows where conversion fails
train['y'] = pd.to_numeric(train['y'], errors='coerce')
train = train.dropna(subset=['y'])
train['y'] = train['y'].astype(int)


In [8]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
train['id3_te'] = 0.0  # Use float for encoding

for tr_idx, val_idx in kf.split(train):
    means = train.iloc[tr_idx].groupby('id3')['y'].mean()
    train.loc[val_idx, 'id3_te'] = train.loc[val_idx, 'id3'].map(means)

# For test set, use global means from full train
test['id3_te'] = test['id3'].map(train.groupby('id3')['y'].mean())

# Fill missing encodings with global mean
train['id3_te'].fillna(train['y'].mean(), inplace=True)
test['id3_te'].fillna(train['y'].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['id3_te'].fillna(train['y'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['id3_te'].fillna(train['y'].mean(), inplace=True)


In [9]:
for df in [train, test]:
    # Spend × click rate
    df['spend_click'] = df['total_spend'] * df['click_rate']
    # Offer popularity: number of customers per offer
    df['offer_pop'] = df.groupby('id3')['id2'].transform('count')
    # Customer frequency: number of offers per customer
    df['cust_freq'] = df.groupby('id2')['id3'].transform('count')


In [10]:
# Exclude non-feature columns
exclude = ['id1', 'id2', 'id3', 'id4', 'id5', 'id12', 'id13', 'f378', 'start', 'end']
features = [col for col in train.columns if col not in exclude + ['y']]

# Ensure all features are numeric
for col in features:
    train[col] = pd.to_numeric(train[col], errors='coerce')
    test[col] = pd.to_numeric(test[col], errors='coerce')

train.fillna(0, inplace=True)
test.fillna(0, inplace=True)

X = train[features]
y = train['y']
X_test = test[features]


In [11]:
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
import numpy as np
import gc

gkf = GroupKFold(n_splits=5)
preds = np.zeros(len(test))
train['pred'] = 0

params = {
    'objective': 'lambdarank',
    'metric': 'map',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 64,
    'min_data_in_leaf': 50,
    'max_depth': 8,
    'verbosity': -1,
    'device': 'gpu'
}

for fold, (tr_idx, val_idx) in enumerate(gkf.split(X, y, groups=train['id2'])):
    X_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx].astype(float)
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx].astype(float)
    group_tr = train.iloc[tr_idx].groupby('id2').size().values
    group_val = train.iloc[val_idx].groupby('id2').size().values

    dtrain = lgb.Dataset(X_tr, y_tr, group=group_tr)
    dval = lgb.Dataset(X_val, y_val, group=group_val)
    model = lgb.train(params, dtrain, valid_sets=[dval], num_boost_round=1000,
                      callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)])
    train.loc[val_idx, 'pred'] = model.predict(X_val)
    preds += model.predict(X_test) / gkf.get_n_splits()
    gc.collect()


  train['pred'] = 0


Training until validation scores don't improve for 50 rounds
[100]	valid_0's map@1: 0.927927	valid_0's map@2: 0.927095	valid_0's map@3: 0.929144	valid_0's map@4: 0.930311	valid_0's map@5: 0.931352
Early stopping, best iteration is:
[108]	valid_0's map@1: 0.928142	valid_0's map@2: 0.927175	valid_0's map@3: 0.929341	valid_0's map@4: 0.930594	valid_0's map@5: 0.93164


 -2.37773633]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  train.loc[val_idx, 'pred'] = model.predict(X_val)


Training until validation scores don't improve for 50 rounds
[100]	valid_0's map@1: 0.936412	valid_0's map@2: 0.933351	valid_0's map@3: 0.934178	valid_0's map@4: 0.934584	valid_0's map@5: 0.935279
Early stopping, best iteration is:
[97]	valid_0's map@1: 0.93609	valid_0's map@2: 0.933512	valid_0's map@3: 0.934225	valid_0's map@4: 0.93462	valid_0's map@5: 0.935226
Training until validation scores don't improve for 50 rounds
[100]	valid_0's map@1: 0.934801	valid_0's map@2: 0.933002	valid_0's map@3: 0.934243	valid_0's map@4: 0.935456	valid_0's map@5: 0.935885
[200]	valid_0's map@1: 0.935553	valid_0's map@2: 0.933405	valid_0's map@3: 0.934661	valid_0's map@4: 0.935572	valid_0's map@5: 0.935944
Early stopping, best iteration is:
[158]	valid_0's map@1: 0.935768	valid_0's map@2: 0.933566	valid_0's map@3: 0.934697	valid_0's map@4: 0.935835	valid_0's map@5: 0.936317
Training until validation scores don't improve for 50 rounds
[100]	valid_0's map@1: 0.931257	valid_0's map@2: 0.931579	valid_0's ma

In [12]:
# Prepare submission DataFrame
submission = test[['id1', 'id2', 'id3', 'id5']].copy()
submission['pred'] = preds

# Normalize predictions per group (id2) for ranking
submission['pred'] = submission.groupby('id2')['pred'].transform(
    lambda x: (x - x.min()) / (x.max() - x.min() + 1e-9)
)

# Ensure id5 is in MM/DD/YY format (if not already)
submission['id5'] = pd.to_datetime(submission['id5'], errors='coerce').dt.strftime('%m/%d/%y')

# Save CSV
submission.to_csv('final_submission.csv', index=False)
print("✅ Final submission file saved with normalized prediction scores.")


  submission['id5'] = pd.to_datetime(submission['id5'], errors='coerce').dt.strftime('%m/%d/%y')


✅ Final submission file saved with normalized prediction scores.
