In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, f1_score
import xgboost as xgb
from datetime import datetime

def prepare_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build an engineered feature frame from a raw transactions frame.

    Steps, in order:
      1. Impute missing values: median for int64/float64 columns, mode for
         object columns. Known date columns are skipped here so that a
         missing date stays missing instead of becoming the most common date.
      2. Parse the known date columns and expand each into year / month /
         day / dayofweek / quarter (-1 when the date is missing or
         unparseable) plus an is_weekend flag (0 when missing).
      3. Add hour/day differences between selected date pairs (NaN when
         either side is missing), then drop the raw date columns.
      4. Add discount, price-tier, loyalty and aggregated discount features
         when their source columns are present.
      5. Label-encode the known categorical columns that are present.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input frame. It is not modified; a copy is processed.

    Returns
    -------
    pandas.DataFrame
        The copy with engineered columns added and raw date columns removed.
        Columns not mentioned above are passed through unchanged.
    """
    # Create a copy to avoid modifying original data
    data = df.copy()

    date_cols = ['Date_Registered', 'payment_datetime', 'purchased_datetime',
                 'released_date', 'estimated_delivery_date', 'received_date']
    present_date_cols = [col for col in date_cols if col in data.columns]

    # Fill missing values with appropriate strategies
    numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = data.select_dtypes(include=['object']).columns

    # Fill numeric columns with median
    for col in numeric_cols:
        data[col] = data[col].fillna(data[col].median())

    # Fill categorical columns with mode
    for col in categorical_cols:
        if col in present_date_cols:
            # Raw dates are usually object dtype; imputing them with the
            # modal date would fabricate timestamps and defeat the -1
            # "missing" sentinel used below.
            continue
        modes = data[col].mode()
        if not modes.empty:  # an all-missing column has no mode
            data[col] = data[col].fillna(modes.iloc[0])

    # Convert date columns to datetime and extract features
    for col in present_date_cols:
        data[col] = pd.to_datetime(data[col], errors='coerce')
        parts = data[col].dt
        # Extract numerical features from dates (-1 marks a missing date)
        data[f'{col}_year'] = parts.year.fillna(-1).astype(int)
        data[f'{col}_month'] = parts.month.fillna(-1).astype(int)
        data[f'{col}_day'] = parts.day.fillna(-1).astype(int)
        data[f'{col}_dayofweek'] = parts.dayofweek.fillna(-1).astype(int)
        data[f'{col}_quarter'] = parts.quarter.fillna(-1).astype(int)
        # NaN >= 5 evaluates to False, so missing dates get 0 here
        data[f'{col}_is_weekend'] = (parts.dayofweek >= 5).astype(int)

    # Calculate time differences between dates
    if 'payment_datetime' in data.columns and 'purchased_datetime' in data.columns:
        # Hours between purchase and payment
        data['payment_purchase_diff'] = (
            data['payment_datetime'] - data['purchased_datetime']
        ).dt.total_seconds() / 3600

    if 'estimated_delivery_date' in data.columns and 'received_date' in data.columns:
        # Days between the estimated and the actual delivery
        data['delivery_delay'] = (
            data['received_date'] - data['estimated_delivery_date']
        ).dt.total_seconds() / (24 * 3600)

    # Drop original date columns
    data = data.drop(columns=present_date_cols)

    # Create price-related features
    if 'Product_value' in data.columns and 'final_payment' in data.columns:
        data['discount_amount'] = data['Product_value'] - data['final_payment']
        data['discount_percentage'] = (
            data['discount_amount'] / data['Product_value'] * 100
        ).clip(0, 100)

        product_value = data['Product_value']
        if product_value.nunique() < 2:
            # Quantile binning is undefined for a constant (or empty) column
            data['price_tier'] = 1
        else:
            # labels=False yields 0-based bin codes, so the tier count
            # follows however many bins survive duplicates='drop'. A fixed
            # five-label list raises ValueError whenever edges collapse.
            data['price_tier'] = pd.qcut(
                product_value, q=5, labels=False, duplicates='drop'
            ) + 1

    # Create loyalty-related features
    if 'loyalty_points_redeemed' in data.columns:
        data['has_redeemed_points'] = (data['loyalty_points_redeemed'] > 0).astype(int)

    # Aggregate discount features (includes 'discount_percentage' created above)
    discount_cols = [col for col in data.columns if 'discount_percentage' in col]
    if discount_cols:
        data['total_discount'] = data[discount_cols].sum(axis=1)

    # Encode categorical variables
    categorical_cols = ['Gender', 'Is_current_loyalty_program_member', 'loyalty_tier',
                        'payment_method', 'purchase_medium', 'shipping_method',
                        'product_category']

    # Only encode categorical columns that exist in the dataset
    existing_cat_cols = [col for col in categorical_cols if col in data.columns]

    # A fresh LabelEncoder is fitted per column on every call, so integer
    # codes are only comparable between two frames when both contain the
    # same set of category values.
    for col in existing_cat_cols:
        data[col] = LabelEncoder().fit_transform(data[col].astype(str))

    return data

# End-to-end pipeline: load the CSVs, engineer features, train and validate an
# XGBoost classifier, then write test-set predictions to a submission file.

# Load datasets
print("Loading datasets...")
train_data = pd.read_csv('train_dataset.csv')
test_data = pd.read_csv('test_dataset.csv')

print("\nTrain data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

# Prepare features
# NOTE(review): prepare_data is called separately on each frame, so any
# statistics or encoders it fits are fitted per frame — confirm that train and
# test end up with comparable encodings.
print("\nPreparing features...")
X_train_full = prepare_data(train_data)
X_test = prepare_data(test_data)

# Remove non-feature columns (target and identifiers) from training data
cols_to_drop = ['customer_experience', 'user_id', 'transaction_id',
                'order_id', 'tracking_number']
feature_cols = [col for col in X_train_full.columns
                if col not in cols_to_drop]

X = X_train_full[feature_cols]
# The target is read from the raw frame, not from the prepared copy
y = train_data['customer_experience']

# Scale numerical features
# NOTE(review): the scaler is fitted on all training rows before the
# train/validation split below, so validation rows contribute to the
# mean/variance estimates.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

print("\nNumber of features:", len(feature_cols))

# Encode target variable
le = LabelEncoder()
y = le.fit_transform(y)

# Split into training and validation sets (stratified on the encoded target)
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# Ensure test data has same columns as training data; columns missing from the
# test frame are created and filled with 0, extra columns are discarded
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
# Keep the scaled test matrix as a DataFrame with the training column names so
# the model sees the same named features at predict time as it did at fit
# time, rather than an anonymous ndarray.
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X_train.columns, index=X_test.index
)

# Optimized XGBoost parameters
params = {
    'n_estimators': 500,
    'max_depth': 8,
    'learning_rate': 0.01,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 3,
    'gamma': 0.1,
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'random_state': 42,
    'n_jobs': -1
}

# Initialize and train model
print("\nTraining XGBoost model...")
model = xgb.XGBClassifier(**params)

# Perform cross-validation on the training split only
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_weighted')
print(f"\nCross-validation F1 scores: {cv_scores}")
print(f"Mean CV F1 score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# Train final model; the validation split is passed as eval_set for evaluation
model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

# Make predictions on validation set
y_pred = model.predict(X_val)

# Print classification report
print("\nClassification Report:")
print(classification_report(y_val, y_pred, target_names=le.classes_))

# Calculate weighted F1 score
weighted_f1 = f1_score(y_val, y_pred, average='weighted')
print(f"\nWeighted F1 Score: {weighted_f1:.4f}")

# Feature importance analysis
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_
})
print("\nTop 10 Most Important Features:")
print(feature_importance.sort_values('importance', ascending=False).head(10))

# Make predictions on test set and map encoded classes back to original labels
print("\nMaking predictions on test set...")
test_predictions = model.predict(X_test_scaled)
test_predictions_labels = le.inverse_transform(test_predictions)

# Create submission file
# NOTE(review): 'id' is the positional row number of the test frame, not an
# identifier column from the data — confirm this matches the expected format.
submission = pd.DataFrame({
    'id': range(len(test_predictions_labels)),
    'customer_experience': test_predictions_labels
})

# Save submission file
submission.to_csv('submission4.csv', index=False)
print("\nSubmission file created successfully!")

# Verify file contents by reading the file back from disk
print("\nVerifying saved submission file:")
saved_submission = pd.read_csv('submission4.csv')
print(saved_submission.head())
print("\nShape of saved submission:", saved_submission.shape)

Loading datasets...

Train data shape: (206969, 26)
Test data shape: (137971, 25)

Preparing features...

Number of features: 58

Training XGBoost model...

Cross-validation F1 scores: [0.65360323 0.64754644 0.64469681 0.63932008 0.64904806]
Mean CV F1 score: 0.6468 (+/- 0.0095)
[0]	validation_0-mlogloss:1.09588
[1]	validation_0-mlogloss:1.09318
[2]	validation_0-mlogloss:1.09016
[3]	validation_0-mlogloss:1.08735
[4]	validation_0-mlogloss:1.08469
[5]	validation_0-mlogloss:1.08164
[6]	validation_0-mlogloss:1.07888
[7]	validation_0-mlogloss:1.07615
[8]	validation_0-mlogloss:1.07333
[9]	validation_0-mlogloss:1.07043
[10]	validation_0-mlogloss:1.06756
[11]	validation_0-mlogloss:1.06481
[12]	validation_0-mlogloss:1.06256
[13]	validation_0-mlogloss:1.06026
[14]	validation_0-mlogloss:1.05801
[15]	validation_0-mlogloss:1.05556
[16]	validation_0-mlogloss:1.05298
[17]	validation_0-mlogloss:1.05050
[18]	validation_0-mlogloss:1.04839
[19]	validation_0-mlogloss:1.04607
[20]	validation_0-mlogloss:1.0