# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
from sklearn.metrics import classification_report, f1_score
import xgboost as xgb
from datetime import datetime
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
import category_encoders as ce

def prepare_data(df):
    """
    Build an engineered feature frame from a raw DataFrame.

    The input is copied, never modified. Steps, in order:

    1. Missing values: int64/float64 columns are linearly interpolated (in
       row order) and any remainder is filled with the column median; object
       columns get the literal ``'MISSING'``.
    2. Every object column with fewer than 10 distinct values gets a
       ``<col>_freq`` relative-frequency column.
    3. Known date columns are parsed (unparseable values become NaT),
       expanded into calendar features, used for time-difference features,
       and then dropped.
    4. Price and loyalty features are derived when their source columns
       are present.
    5. Known categorical columns are label-encoded in place and their
       ``<col>_freq`` column is (re)computed.

    Args:
        df: Raw input frame. All feature groups are optional; a group is
            only produced when its source columns exist.

    Returns:
        A new DataFrame containing the original non-date columns plus the
        engineered features. ``price_tier`` is an integer starting at 1; it
        has fewer than five distinct levels when ``Product_value`` has too
        few distinct quantile edges.

    Note:
        Label encoders and frequency tables are fitted on the frame passed
        in, so two separate calls (e.g. train and test) are encoded
        independently of each other.
    """
    data = df.copy()

    # Column groups are captured before any engineered column is added.
    numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = data.select_dtypes(include=['object']).columns

    # Numeric gaps: interpolate first, median as a fallback for anything the
    # interpolation could not fill.
    for col in numeric_cols:
        if data[col].isnull().any():
            data[col] = data[col].interpolate(method='linear', limit_direction='both')
            data[col] = data[col].fillna(data[col].median())

    # Object columns: explicit missing marker, plus a frequency encoding for
    # low-cardinality columns.
    for col in categorical_cols:
        data[col] = data[col].fillna('MISSING')
        if data[col].nunique() < 10:
            data[f'{col}_freq'] = data[col].map(data[col].value_counts(normalize=True))

    date_cols = ['Date_Registered', 'payment_datetime', 'purchased_datetime',
                 'released_date', 'estimated_delivery_date', 'received_date']
    present_date_cols = [col for col in date_cols if col in data.columns]

    for col in present_date_cols:
        data[col] = pd.to_datetime(data[col], errors='coerce')
        parts = data[col].dt
        # Calendar parts; -1 marks rows whose date could not be parsed.
        data[f'{col}_year'] = parts.year.fillna(-1).astype(int)
        data[f'{col}_month'] = parts.month.fillna(-1).astype(int)
        data[f'{col}_day'] = parts.day.fillna(-1).astype(int)
        data[f'{col}_dayofweek'] = parts.dayofweek.fillna(-1).astype(int)
        data[f'{col}_quarter'] = parts.quarter.fillna(-1).astype(int)
        data[f'{col}_is_weekend'] = (parts.dayofweek >= 5).astype(int)
        data[f'{col}_is_month_start'] = parts.is_month_start.astype(int)
        data[f'{col}_is_month_end'] = parts.is_month_end.astype(int)
        # Cyclical month encoding; stays NaN for unparseable dates.
        data[f'{col}_sin_month'] = np.sin(2 * np.pi * parts.month / 12)
        data[f'{col}_cos_month'] = np.cos(2 * np.pi * parts.month / 12)

    # Time differences must be computed before the date columns are dropped.
    if 'payment_datetime' in data.columns and 'purchased_datetime' in data.columns:
        elapsed = data['payment_datetime'] - data['purchased_datetime']
        data['payment_purchase_diff'] = elapsed.dt.total_seconds() / 3600  # hours
        data['payment_purchase_diff_days'] = data['payment_purchase_diff'] / 24
        data['payment_purchase_same_day'] = (data['payment_purchase_diff_days'].abs() < 1).astype(int)

    if 'estimated_delivery_date' in data.columns and 'received_date' in data.columns:
        lateness = data['received_date'] - data['estimated_delivery_date']
        data['delivery_delay'] = lateness.dt.total_seconds() / (24 * 3600)  # days
        data['delivery_on_time'] = (data['delivery_delay'] <= 0).astype(int)
        data['delivery_delay_squared'] = data['delivery_delay'] ** 2

    data = data.drop(columns=present_date_cols)

    if 'Product_value' in data.columns and 'final_payment' in data.columns:
        product_value = data['Product_value']
        data['discount_amount'] = product_value - data['final_payment']

        # A zero price makes the ratio undefined (0/0 -> NaN, x/0 -> inf);
        # report such rows as "no discount" instead.
        pct = (data['discount_amount'] / product_value * 100).clip(0, 100)
        data['discount_percentage'] = pct.mask(product_value == 0, 0.0)

        # qcut with a fixed 5-label list raises as soon as duplicate quantile
        # edges are dropped (skewed prices). Integer codes adapt to however
        # many bins survive; a constant/empty column is a single tier.
        if product_value.nunique() > 1:
            data['price_tier'] = pd.qcut(product_value, q=5, labels=False, duplicates='drop') + 1
        else:
            data['price_tier'] = 1

        data['price_to_discount_ratio'] = product_value / (data['discount_amount'] + 1)
        data['final_payment_log'] = np.log1p(data['final_payment'])

    if 'loyalty_points_redeemed' in data.columns:
        data['has_redeemed_points'] = (data['loyalty_points_redeemed'] > 0).astype(int)
        data['loyalty_points_log'] = np.log1p(data['loyalty_points_redeemed'])

    # Known categoricals: label-encode in place, then frequency-encode the
    # encoded values (this overwrites any <col>_freq created earlier).
    known_categoricals = ['Gender', 'Is_current_loyalty_program_member', 'loyalty_tier',
                          'payment_method', 'purchase_medium', 'shipping_method',
                          'product_category']

    for col in known_categoricals:
        if col not in data.columns:
            continue
        data[col] = LabelEncoder().fit_transform(data[col].astype(str))
        data[f'{col}_freq'] = data[col].map(data[col].value_counts(normalize=True))

    return data

def train_model(X_train, X_val, y_train, y_val):
    """
    Fit a multi-class XGBoost classifier with early stopping.

    Args:
        X_train: Training features.
        X_val: Validation features, used only for monitoring/early stopping.
        y_train: Integer-encoded training labels.
        y_val: Integer-encoded validation labels.

    Returns:
        The fitted ``xgb.XGBClassifier``. Training stops once the validation
        ``mlogloss`` has not improved for 50 rounds (at most 1000 trees).
    """
    early_stopping_rounds = 50

    params = {
        'n_estimators': 1000,
        'max_depth': 7,
        'learning_rate': 0.01,
        'subsample': 0.85,
        'colsample_bytree': 0.85,
        'min_child_weight': 2,
        'gamma': 0.05,
        # L1/L2 regularisation under the names the sklearn wrapper documents.
        'reg_alpha': 0.1,
        'reg_lambda': 1,
        'objective': 'multi:softprob',
        # Early stopping monitors the LAST metric on the LAST eval set, so
        # the order matters: validation mlogloss drives the stop.
        'eval_metric': ['merror', 'mlogloss'],
        # Configured on the estimator: passing eval_metric /
        # early_stopping_rounds to fit() is deprecated since XGBoost 1.6 and
        # rejected by 2.0+.
        'early_stopping_rounds': early_stopping_rounds,
        'random_state': 42,
        'n_jobs': -1,
        'tree_method': 'hist'  # For faster training
    }

    model = xgb.XGBClassifier(**params)

    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        verbose=100
    )

    # The best iteration is already stored on the fitted model. Clearing the
    # setting lets clones of this estimator (e.g. inside cross_val_score) be
    # fitted without an eval_set, which early stopping would otherwise require.
    model.set_params(early_stopping_rounds=None)

    return model

# Main execution
# Pipeline: load CSVs -> engineer features -> scale -> select features ->
# train XGBoost -> cross-validate -> report -> predict test set -> write CSV.
print("Loading datasets...")
train_data = pd.read_csv('train_dataset.csv')
test_data = pd.read_csv('test_dataset.csv')

print("\nPreparing features...")
# The target must not pass through prepare_data: as a low-cardinality text
# column it would get a 'customer_experience_freq' feature, which encodes the
# label itself and does not exist at prediction time.
y = train_data['customer_experience']
X_train_full = prepare_data(train_data.drop(columns=['customer_experience']))
X_test = prepare_data(test_data.drop(columns=['customer_experience'], errors='ignore'))

# Remove non-feature columns
cols_to_drop = ['customer_experience', 'user_id', 'transaction_id',
                'order_id', 'tracking_number']
feature_cols = [col for col in X_train_full.columns
                if col not in cols_to_drop]

X = X_train_full[feature_cols]

# Use RobustScaler for better handling of outliers
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# Encode target variable
le = LabelEncoder()
y = le.fit_transform(y)

# Stratified split
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# Feature selection using Random Forest: keep features whose importance is
# at least the median importance.
rf_selector = RandomForestClassifier(n_estimators=100, random_state=42)
selector = SelectFromModel(rf_selector, prefit=False, threshold='median')
selector.fit(X_train, y_train)
selected_features = X_train.columns[selector.get_support()].tolist()

# Use selected features
X_train = X_train[selected_features]
X_val = X_val[selected_features]

print(f"\nSelected {len(selected_features)} features out of {X_scaled.shape[1]}")

# Train model
print("\nTraining XGBoost model...")
model = train_model(X_train, X_val, y_train, y_val)

# Cross-validation with stratification (refits clones of the estimator)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='f1_weighted')
print(f"\nCross-validation F1 scores: {cv_scores}")
print(f"Mean CV F1 score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# Validation predictions
y_pred = model.predict(X_val)
print("\nClassification Report:")
# classification_report formats target names as strings.
print(classification_report(y_val, y_pred, target_names=[str(c) for c in le.classes_]))

# Feature importance analysis
feature_importance = pd.DataFrame({
    'feature': selected_features,
    'importance': model.feature_importances_
})
print("\nTop 10 Most Important Features:")
print(feature_importance.sort_values('importance', ascending=False).head(10))

# Prepare test data. The scaler was fitted on ALL feature columns, so the
# test frame is aligned to that full set first, scaled, and only then cut
# down to the selected features the model was trained on.
X_test = X_test.reindex(columns=X.columns, fill_value=0)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X.columns)
X_test_scaled = X_test_scaled[selected_features]

# Make predictions
print("\nMaking predictions on test set...")
test_predictions = model.predict(X_test_scaled)
test_predictions_labels = le.inverse_transform(test_predictions)

# Create submission
submission = pd.DataFrame({
    'id': range(len(test_predictions_labels)),
    'customer_experience': test_predictions_labels
})

submission.to_csv('submission7.csv', index=False)
print("\nSubmission file created successfully!")

# Verify submission
print("\nVerifying saved submission file:")
saved_submission = pd.read_csv('submission7.csv')
print(saved_submission.head())
print("\nShape of saved submission:", saved_submission.shape)