In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

# Read the raw competition CSVs: transaction and identity tables for both
# splits, plus the submission template.
train_txn = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')
train_ident = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv')
test_txn = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_transaction.csv')
test_ident = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_identity.csv')
sample_submission = pd.read_csv('/kaggle/input/ieee-fraud-detection/sample_submission.csv')

# Left-join identity onto transactions so every transaction row is kept,
# including those with no matching identity record.
train_df = pd.merge(train_txn, train_ident, on='TransactionID', how='left')
test_df = pd.merge(test_txn, test_ident, on='TransactionID', how='left')

# The per-table frames are not needed once merged; drop the references so
# their memory can be reclaimed.
del train_txn, train_ident, test_txn, test_ident

# Separate the target from the features and drop the row identifier, which
# is not used as a model input.
y = train_df['isFraud']
X = train_df.drop(['isFraud', 'TransactionID'], axis=1)
test_df = test_df.drop(['TransactionID'], axis=1)

# Align the test columns with the training features.
# NOTE(review): the IEEE-CIS test identity file is reported to name its
# columns 'id-01', 'id-02', ... while the train file uses 'id_01', 'id_02',
# ...; without this rename the categorical loop below fails with a KeyError.
# Confirm against your copy of the data.
test_df = test_df.rename(columns=lambda name: name.replace('-', '_'))
# Selecting by X.columns fixes the column order and raises a KeyError right
# here (rather than at predict time) if a training feature is still missing.
test_df = test_df[X.columns]

# Handle missing values by filling with a constant sentinel.
X.fillna(-999, inplace=True)
test_df = test_df.fillna(-999)

# Encode categorical (object-dtype) features as integer labels. Each encoder
# is fit on the union of train and test values so that transform() never
# meets a label it has not seen.
cat_cols = [col for col in X.columns if X[col].dtype == 'object']
for col in cat_cols:
    train_values = X[col].astype(str)
    test_values = test_df[col].astype(str)

    le = LabelEncoder()
    le.fit(pd.concat([train_values, test_values], ignore_index=True))
    X[col] = le.transform(train_values)
    test_df[col] = le.transform(test_values)

# LightGBM parameters (scikit-learn style aliases are accepted by lgb.train).
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    # Upper bound on boosting rounds; early stopping normally ends training
    # well before this.
    'n_estimators': 10000,
    'learning_rate': 0.05,
    'num_leaves': 256,
    'max_depth': -1,
    'subsample': 0.8,
    # Row subsampling is only performed when the bagging frequency is
    # positive; without this entry 'subsample' is silently ignored.
    'subsample_freq': 1,
    'colsample_bytree': 0.4,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'min_split_gain': 0.01,
    'min_child_weight': 2,
    'verbose': -1,
    'is_unbalance': True,
}

# Stratified K-fold cross-validation: train one LightGBM booster per fold,
# report its validation AUC, and average the folds' test-set predictions.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
predictions = np.zeros(len(test_df))
# NOTE(review): the two names below are not read anywhere in the visible
# part of the script; they are kept in case later cells rely on them.
feature_importance_df = pd.DataFrame()
features = X.columns

for fold, (train_idx, val_idx) in enumerate(folds.split(X, y)):
    print(f"Training fold {fold + 1}")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # LightGBM Dataset formatting
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val)

    # Train the model. Early stopping and periodic logging are passed as
    # callbacks because the `early_stopping_rounds=` / `verbose_eval=`
    # keyword arguments were removed from lgb.train in LightGBM 4.0.
    # The callbacks are created inside the loop so each fold starts with
    # fresh early-stopping state.
    clf = lgb.train(
        params,
        train_data,
        valid_sets=[train_data, val_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=500),
        ],
    )

    # Score the held-out fold at the best iteration found by early stopping.
    val_pred = clf.predict(X_val, num_iteration=clf.best_iteration)
    print(f"Fold {fold + 1} AUC: {roc_auc_score(y_val, val_pred)}")

    # Each fold contributes an equal share to the final test prediction.
    predictions += clf.predict(test_df, num_iteration=clf.best_iteration) / folds.n_splits

# Put the fold-averaged predictions into the submission template and write
# it to disk without the index column.
sample_submission = sample_submission.assign(isFraud=predictions)
sample_submission.to_csv('submission.csv', index=False)
print("Submission file created successfully.")
