In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, PowerTransformer
from sklearn.metrics import classification_report, f1_score
import xgboost as xgb
from datetime import datetime
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
import optuna
from sklearn.preprocessing import RobustScaler
from sklearn.impute import KNNImputer

def _impute_missing(data):
    """Fill missing values in place and return ``data``.

    int64/float64 columns are imputed with a 5-neighbour KNNImputer; object
    columns are filled with their most frequent value.
    """
    numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = data.select_dtypes(include=['object']).columns

    # KNNImputer discards columns that have no observed value at all, which
    # would make the column-wise assignment below fail on a shape mismatch.
    # Only hand it columns that contain data, and skip it when there are none.
    imputable = [col for col in numeric_cols if data[col].notna().any()]
    if imputable:
        imputer = KNNImputer(n_neighbors=5)
        data[imputable] = imputer.fit_transform(data[imputable])

    for col in categorical_cols:
        modes = data[col].mode()
        # mode() is empty for an all-missing column; leave that column as is
        # instead of raising on modes[0].
        if not modes.empty:
            data[col] = data[col].fillna(modes.iloc[0])

    return data


def _add_calendar_features(data, col):
    """Parse ``data[col]`` as datetimes and add calendar features in place.

    Unparseable values become NaT. Integer-valued features use -1 for NaT;
    boolean flags are 0 for NaT.
    """
    dates = pd.to_datetime(data[col], errors='coerce')
    data[col] = dates
    month = dates.dt.month
    day_of_week = dates.dt.dayofweek

    # Original date features
    data[f'{col}_year'] = dates.dt.year.fillna(-1).astype(int)
    data[f'{col}_month'] = month.fillna(-1).astype(int)
    data[f'{col}_day'] = dates.dt.day.fillna(-1).astype(int)
    data[f'{col}_dayofweek'] = day_of_week.fillna(-1).astype(int)
    data[f'{col}_quarter'] = dates.dt.quarter.fillna(-1).astype(int)
    data[f'{col}_is_weekend'] = (day_of_week >= 5).astype(int)
    data[f'{col}_is_month_end'] = dates.dt.is_month_end.astype(int)
    data[f'{col}_is_month_start'] = dates.dt.is_month_start.astype(int)
    data[f'{col}_hour'] = dates.dt.hour.fillna(-1).astype(int)

    # New date features
    # The ISO week comes back as an unsigned nullable integer, which cannot
    # hold the -1 sentinel; widen to signed Int64 before filling.
    iso_week = dates.dt.isocalendar().week
    data[f'{col}_week'] = iso_week.astype('Int64').fillna(-1).astype(int)
    data[f'{col}_is_holiday'] = ((month == 12) & (dates.dt.day >= 20)).astype(int)
    # labels=False yields the bin index 0..3 (same values as the former
    # labels=[0, 1, 2, 3]) but leaves NaN for NaT rows, so they can be mapped
    # to -1 instead of crashing the integer cast.
    season = pd.cut(month, bins=[-np.inf, 3, 6, 9, np.inf], labels=False)
    data[f'{col}_season'] = season.fillna(-1).astype(int)


def _add_date_diff_features(data, date_cols):
    """Add day / hour / business-day differences for every pair of date columns.

    ``date_cols`` must already be datetime columns of ``data``. Feature names
    use the part of each column name before the first underscore.
    """
    for i, col1 in enumerate(date_cols):
        for col2 in date_cols[i + 1:]:
            prefix = f'{col1.split("_")[0]}_{col2.split("_")[0]}'
            diff_seconds = (data[col2] - data[col1]).dt.total_seconds()

            # Days difference
            data[f'{prefix}_diff_days'] = diff_seconds / (24 * 3600)
            # Hours difference
            data[f'{prefix}_diff_hours'] = diff_seconds / 3600

            # Business days difference. np.busday_count rejects NaT, so it is
            # only evaluated on rows where both dates are present; the other
            # rows get NaN, matching the diff_days / diff_hours columns.
            valid = (data[col1].notna() & data[col2].notna()).to_numpy()
            counts = np.busday_count(
                data[col1][valid].dt.date.values.astype('datetime64[D]'),
                data[col2][valid].dt.date.values.astype('datetime64[D]'),
            )
            if valid.all():
                business_days = counts
            else:
                business_days = np.full(len(data), np.nan)
                business_days[valid] = counts
            data[f'{prefix}_business_days'] = business_days


def _add_price_features(data):
    """Add discount, price-tier and transformed price features in place.

    Does nothing unless both 'Product_value' and 'final_payment' exist.
    """
    if 'Product_value' not in data.columns or 'final_payment' not in data.columns:
        return

    product_value = data['Product_value']
    data['discount_amount'] = product_value - data['final_payment']
    data['discount_percentage'] = (data['discount_amount'] / product_value * 100).clip(0, 100)
    data['price_tier'] = pd.qcut(product_value, q=20, labels=False, duplicates='drop')
    # The loyalty column is optional; previously its absence raised KeyError.
    if 'loyalty_points_redeemed' in data.columns:
        data['price_to_loyalty_ratio'] = product_value / (data['loyalty_points_redeemed'] + 1)

    # Advanced price transformations
    data['log_product_value'] = np.log1p(product_value)
    data['log_final_payment'] = np.log1p(data['final_payment'])
    data['sqrt_product_value'] = np.sqrt(product_value)
    data['cubic_product_value'] = np.cbrt(product_value)


def _encode_categoricals(data, categorical_cols):
    """Label-encode the given columns in place and add pairwise products."""
    # Encode every column first. The interaction step multiplies columns, so
    # all operands must already be integer codes; doing both in one loop
    # multiplied codes by not-yet-encoded strings.
    for col in categorical_cols:
        data[col] = LabelEncoder().fit_transform(data[col].astype(str))

    # Both orderings (A_B and B_A) are emitted, as before, so the output
    # column set is unchanged; the two hold the same product.
    for col in categorical_cols:
        for col2 in categorical_cols:
            if col != col2:
                data[f'{col}_{col2}_interaction'] = data[col] * data[col2]


def prepare_data(df):
    """
    Build the model feature frame from a raw dataset.

    Works on a copy of ``df`` and, in order: imputes missing values, expands
    the known date columns into calendar and pairwise time-difference
    features (then drops the date columns), derives price features, and
    label-encodes the known categorical columns together with their pairwise
    interaction products. Columns that are absent from ``df`` are skipped.

    NOTE(review): imputation, quantile price tiers and label encoders are
    fitted on whatever frame is passed in, so calling this separately on
    train and test data fits them separately - confirm that is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input data; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered features.
    """
    data = df.copy()

    data = _impute_missing(data)

    # Date processing
    date_cols = ['Date_Registered', 'payment_datetime', 'purchased_datetime',
                 'released_date', 'estimated_delivery_date', 'received_date']
    present_date_cols = [col for col in date_cols if col in data.columns]

    for col in present_date_cols:
        _add_calendar_features(data, col)

    _add_date_diff_features(data, present_date_cols)

    # Drop original date columns
    if present_date_cols:
        data = data.drop(columns=present_date_cols)

    _add_price_features(data)

    # Categorical label encoding plus interaction features
    categorical_cols = ['Gender', 'Is_current_loyalty_program_member', 'loyalty_tier',
                        'payment_method', 'purchase_medium', 'shipping_method',
                        'product_category']
    existing_cat_cols = [col for col in categorical_cols if col in data.columns]
    _encode_categoricals(data, existing_cat_cols)

    return data

def objective(trial):
    """
    Optuna objective: mean weighted F1 of an XGBoost classifier over 5 folds.

    Samples one hyperparameter set from ``trial``, trains a fresh
    ``XGBClassifier`` on each stratified fold of the module-level
    ``X_train`` / ``y_train`` (with early stopping on the held-out fold), and
    returns the average weighted F1 score across the folds.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean weighted F1 score over the folds (to be maximised).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 3000),
        'max_depth': trial.suggest_int('max_depth', 4, 12),
        # suggest_float replaces the deprecated suggest_uniform /
        # suggest_loguniform; log=True keeps the log-uniform sampling.
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'random_state': 42,
        'n_jobs': -1,
        'tree_method': 'hist',
        # Early stopping is configured on the estimator; recent XGBoost
        # releases no longer accept it as a fit() argument.
        'early_stopping_rounds': 50,
    }

    # The fold indices are positional, so index a plain array rather than
    # relying on y_train's own indexing semantics.
    labels = np.asarray(y_train)

    cv_scores = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for train_idx, val_idx in skf.split(X_train, labels):
        X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_fold_train, y_fold_val = labels[train_idx], labels[val_idx]

        model = xgb.XGBClassifier(**params)
        model.fit(X_fold_train, y_fold_train,
                  eval_set=[(X_fold_val, y_fold_val)],
                  verbose=False)

        fold_pred = model.predict(X_fold_val)
        cv_scores.append(f1_score(y_fold_val, fold_pred, average='weighted'))

    return np.mean(cv_scores)

# End-to-end training script: load the CSVs, engineer features, select and
# scale them, tune XGBoost with Optuna, evaluate on a hold-out split and write
# the test-set predictions to a submission file.

# Load and prepare data
print("Loading datasets...")
train_data = pd.read_csv('train_dataset.csv')
test_data = pd.read_csv('test_dataset.csv')

print("\nPreparing features...")
# NOTE(review): train and test are prepared independently, so anything
# prepare_data fits (imputer, encoders, quantile bins) is fitted per frame -
# confirm the resulting features are comparable between the two.
X_train_full = prepare_data(train_data)
X_test = prepare_data(test_data)

# Remove non-feature columns (target and identifiers)
cols_to_drop = ['customer_experience', 'user_id', 'transaction_id',
                'order_id', 'tracking_number']
feature_cols = [col for col in X_train_full.columns if col not in cols_to_drop]

X = X_train_full[feature_cols]
y = train_data['customer_experience']

# Feature selection using Random Forest
# NOTE(review): the selector and the scaler below are fitted on all training
# rows before the train/validation split, so the validation score is
# computed on rows they have already seen.
rf_selector = RandomForestClassifier(n_estimators=100, random_state=42)
selector = SelectFromModel(rf_selector, prefit=False)
selector.fit(X, y)
selected_features = X.columns[selector.get_support()].tolist()
# Take explicit copies: the scaled values are assigned back into these frames
# and must not target a selection of another DataFrame.
X = X[selected_features].copy()
X_test = X_test[selected_features].copy()

# Robust scaling
numeric_features = X.select_dtypes(include=['float64', 'int64']).columns
scaler = RobustScaler()
X[numeric_features] = scaler.fit_transform(X[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

# Encode the target labels as integers
le = LabelEncoder()
y = le.fit_transform(y)

# Split data (X_train / y_train are read by objective() as globals)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Hyperparameter optimization
print("\nOptimizing hyperparameters with Optuna...")
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

best_params = study.best_params
best_params.update({
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'random_state': 42,
    'n_jobs': -1,
    'tree_method': 'hist'
})

print("\nBest parameters:", best_params)

# Train final model with best parameters
print("\nTraining final model...")
# Early stopping is set on the estimator; recent XGBoost releases no longer
# accept early_stopping_rounds as a fit() argument.
final_model = xgb.XGBClassifier(**best_params, early_stopping_rounds=50)
final_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)]
)

# Evaluate model
y_pred = final_model.predict(X_val)
print("\nClassification Report:")
print(classification_report(y_val, y_pred, target_names=le.classes_))

weighted_f1 = f1_score(y_val, y_pred, average='weighted')
print(f"\nWeighted F1 Score: {weighted_f1:.4f}")

# Feature importance
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': final_model.feature_importances_
})
print("\nTop 10 Most Important Features:")
print(feature_importance.sort_values('importance', ascending=False).head(10))

# Make predictions on test set
print("\nMaking predictions on test set...")
test_predictions = final_model.predict(X_test)
test_predictions_labels = le.inverse_transform(test_predictions)

# Create submission file; 'id' is the row position in the test set
submission = pd.DataFrame({
    'id': range(len(test_predictions_labels)),
    'customer_experience': test_predictions_labels
})
submission.to_csv('submission11.csv', index=False)
print("\nSubmission file created successfully!")

Loading datasets...

Preparing features...


KeyboardInterrupt: 