In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, PowerTransformer
from sklearn.metrics import classification_report, f1_score
import xgboost as xgb
from datetime import datetime
from sklearn.decomposition import PCA
import optuna

# Define data preparation function (same as above)
def prepare_data(df):
    """
    Enhanced data preparation with advanced feature engineering
    """
    data = df.copy()
    numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = data.select_dtypes(include=['object']).columns

    for col in numeric_cols:
        data[col] = data[col].fillna(data[col].median())
    for col in categorical_cols:
        data[col] = data[col].fillna(data[col].mode()[0])

    date_cols = ['Date_Registered', 'payment_datetime', 'purchased_datetime', 
                 'released_date', 'estimated_delivery_date', 'received_date']

    for col in date_cols:
        if col in data.columns:
            data[col] = pd.to_datetime(data[col], errors='coerce')
            data[f'{col}_year'] = data[col].dt.year.fillna(-1).astype(int)
            data[f'{col}_month'] = data[col].dt.month.fillna(-1).astype(int)
            data[f'{col}_day'] = data[col].dt.day.fillna(-1).astype(int)

    for col in date_cols:
        if col in data.columns:
            data = data.drop(columns=[col])

    if 'Product_value' in data.columns and 'final_payment' in data.columns:
        data['discount_amount'] = data['Product_value'] - data['final_payment']
        data['discount_percentage'] = (data['discount_amount'] / data['Product_value'] * 100).clip(0, 100)

    if 'loyalty_points_redeemed' in data.columns:
        data['has_redeemed_points'] = (data['loyalty_points_redeemed'] > 0).astype(int)

    le_dict = {}
    categorical_cols = ['Gender', 'Is_current_loyalty_program_member', 'loyalty_tier',
                        'payment_method', 'purchase_medium', 'shipping_method',
                        'product_category']
    existing_cat_cols = [col for col in categorical_cols if col in data.columns]
    for col in existing_cat_cols:
        le_dict[col] = LabelEncoder()
        data[col] = le_dict[col].fit_transform(data[col].astype(str))

    return data

# Load datasets
print("Loading datasets...")
train_data = pd.read_csv('train_dataset.csv')
test_data = pd.read_csv('test_dataset.csv')

X_train_full = prepare_data(train_data)
X_test = prepare_data(test_data)
cols_to_drop = ['customer_experience', 'user_id', 'transaction_id', 'order_id', 'tracking_number']
feature_cols = [col for col in X_train_full.columns if col not in cols_to_drop]
X = X_train_full[feature_cols]
y = train_data['customer_experience']

scaler = StandardScaler()
power = PowerTransformer(method='yeo-johnson')
numeric_features = X.select_dtypes(include=['float64', 'int64']).columns
X_numeric = X[numeric_features]
X_numeric_scaled = scaler.fit_transform(X_numeric)
X_numeric_transformed = power.fit_transform(X_numeric_scaled)
X[numeric_features] = X_numeric_transformed

pca = PCA(n_components=10)
pca_features = pca.fit_transform(X_numeric_transformed)
for i in range(pca_features.shape[1]):
    X[f'pca_feature_{i}'] = pca_features[:, i]

le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
X_test_numeric = X_test[numeric_features]
X_test_numeric_scaled = scaler.transform(X_test_numeric)
X_test_numeric_transformed = power.transform(X_test_numeric_scaled)
X_test[numeric_features] = X_test_numeric_transformed
test_pca_features = pca.transform(X_test_numeric_transformed)
for i in range(test_pca_features.shape[1]):
    X_test[f'pca_feature_{i}'] = test_pca_features[:, i]

# Define objective function for Optuna
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.0, 1.0),
        'alpha': trial.suggest_float('alpha', 0.0, 1.0),
        'lambda': trial.suggest_float('lambda', 0.0, 1.0),
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'random_state': 42,
        'n_jobs': -1,
        'tree_method': 'hist',
        'early_stopping_rounds':50
    }

    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False )
    y_pred = model.predict(X_val)
    return f1_score(y_val, y_pred, average='weighted')

# Run Optuna optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Best parameters
print("\nBest parameters found:")
print(study.best_params)

# Train final model with best parameters
best_params = study.best_params
best_params.update({'objective': 'multi:softprob', 'eval_metric': 'mlogloss', 'random_state': 42, 'n_jobs': -1})
final_model = xgb.XGBClassifier(**best_params)
final_model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

# Make predictions
print("\nMaking predictions on validation set...")
y_pred = final_model.predict(X_val)
print("\nClassification Report:")
print(classification_report(y_val, y_pred, target_names=le.classes_))

weighted_f1 = f1_score(y_val, y_pred, average='weighted')
print(f"\nWeighted F1 Score: {weighted_f1:.4f}")

print("\nMaking predictions on test set...")
test_predictions = final_model.predict(X_test)
test_predictions_labels = le.inverse_transform(test_predictions)

submission = pd.DataFrame({
    'id': range(len(test_predictions_labels)),
    'customer_experience': test_predictions_labels
})

submission.to_csv('submission8.csv', index=False)
print("\nSubmission file created successfully!")

Loading datasets...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numeric_features] = X_numeric_transformed
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[f'pca_feature_{i}'] = pca_features[:, i]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[f'pca_feature_{i}'] = pca_features[:, i]
A value is trying to be set on a copy of a slice from a DataFrame.
Try usin


Best parameters found:
{'n_estimators': 589, 'max_depth': 10, 'learning_rate': 0.0685688572146235, 'subsample': 0.737748940258184, 'colsample_bytree': 0.9777895708940965, 'min_child_weight': 5, 'gamma': 0.45782275945876516, 'alpha': 0.11009033325257964, 'lambda': 0.2869917733819016}
[0]	validation_0-mlogloss:1.08352
[1]	validation_0-mlogloss:1.06995
[2]	validation_0-mlogloss:1.05581
[3]	validation_0-mlogloss:1.04357
[4]	validation_0-mlogloss:1.03184
[5]	validation_0-mlogloss:1.02103
[6]	validation_0-mlogloss:1.01136
[7]	validation_0-mlogloss:1.00216
[8]	validation_0-mlogloss:0.99357
[9]	validation_0-mlogloss:0.98561
[10]	validation_0-mlogloss:0.97850
[11]	validation_0-mlogloss:0.97177
[12]	validation_0-mlogloss:0.96597
[13]	validation_0-mlogloss:0.96222
[14]	validation_0-mlogloss:0.95658
[15]	validation_0-mlogloss:0.95129
[16]	validation_0-mlogloss:0.94654
[17]	validation_0-mlogloss:0.94167
[18]	validation_0-mlogloss:0.93719
[19]	validation_0-mlogloss:0.93332
[20]	validation_0-mloglos