In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, f1_score
import xgboost as xgb

def prepare_data(df, is_train=True):
    """
    Optimized data preparation with better performance
    """
    # Create a dictionary to store all new features
    new_features = {}
    
    # Remove ID columns first
    id_columns = ['user_id', 'transaction_id', 'order_id', 'tracking_number']
    data = df.drop(columns=id_columns, errors='ignore')
    
    if is_train and 'customer_experience' in data.columns:
        data = data.drop('customer_experience', axis=1)
    
    # Handle numeric and categorical columns
    numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = data.select_dtypes(include=['object']).columns
    
    # Fill missing values
    for col in numeric_cols:
        data[col] = data[col].fillna(data[col].median())
    
    for col in categorical_cols:
        data[col] = data[col].fillna('missing')
    
    # Process date columns
    date_cols = ['Date_Registered', 'payment_datetime', 'purchased_datetime', 
                 'released_date', 'estimated_delivery_date', 'received_date']
    
    for col in date_cols:
        if col in data.columns:
            dt_series = pd.to_datetime(data[col], errors='coerce')
            
            # Basic date features
            new_features[f'{col}_year'] = dt_series.dt.year.fillna(-1).astype(int)
            new_features[f'{col}_month'] = dt_series.dt.month.fillna(-1).astype(int)
            new_features[f'{col}_day'] = dt_series.dt.day.fillna(-1).astype(int)
            new_features[f'{col}_dayofweek'] = dt_series.dt.dayofweek.fillna(-1).astype(int)
            new_features[f'{col}_hour'] = dt_series.dt.hour.fillna(-1).astype(int)
            new_features[f'{col}_is_weekend'] = (dt_series.dt.dayofweek >= 5).astype(int)
            new_features[f'{col}_is_month_end'] = dt_series.dt.is_month_end.astype(int)
            new_features[f'{col}_quarter'] = dt_series.dt.quarter.fillna(-1).astype(int)
    
    # Calculate time differences
    date_cols_present = [col for col in date_cols if col in data.columns]
    for i in range(len(date_cols_present)):
        for j in range(i + 1, len(date_cols_present)):
            col1, col2 = date_cols_present[i], date_cols_present[j]
            time_diff = (pd.to_datetime(data[col2], errors='coerce') - 
                        pd.to_datetime(data[col1], errors='coerce')).dt.total_seconds() / (24*3600)
            diff_name = f'{col1.split("_")[0]}_{col2.split("_")[0]}_diff_days'
            new_features[diff_name] = time_diff
            new_features[f'{diff_name}_abs'] = abs(time_diff)
    
    # Price features
    if 'Product_value' in data.columns and 'final_payment' in data.columns:
        new_features['discount_amount'] = data['Product_value'] - data['final_payment']
        new_features['discount_percentage'] = ((data['Product_value'] - data['final_payment']) / 
                                            data['Product_value'] * 100).clip(0, 100)
        new_features['payment_ratio'] = (data['final_payment'] / data['Product_value']).clip(0, 1)
        new_features['log_product_value'] = np.log1p(data['Product_value'])
        new_features['log_final_payment'] = np.log1p(data['final_payment'])
        
        # Price tiers using qcut
        new_features['price_tier'] = pd.qcut(data['Product_value'], 
                                           q=10, 
                                           labels=False, 
                                           duplicates='drop').astype(int)
    
    # Loyalty features
    if 'loyalty_points_redeemed' in data.columns:
        new_features['has_loyalty_points'] = (data['loyalty_points_redeemed'] > 0).astype(int)
        new_features['log_loyalty_points'] = np.log1p(data['loyalty_points_redeemed'])
    
    # Encode categorical variables
    categorical_cols = data.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col].astype(str))
    
    # Drop original date columns
    data = data.drop(columns=[col for col in date_cols if col in data.columns])
    
    # Add all new features at once
    new_features_df = pd.DataFrame(new_features, index=data.index)
    data = pd.concat([data, new_features_df], axis=1)
    
    return data

# Load datasets
print("Loading datasets...")
train_data = pd.read_csv('train_dataset.csv')
test_data = pd.read_csv('test_dataset.csv')

# Prepare target variable before feature engineering
y = train_data['customer_experience']
le = LabelEncoder()
y = le.fit_transform(y)

print("\nPreparing features...")
X_train = prepare_data(train_data, is_train=True)
X_test = prepare_data(test_data, is_train=False)

# Ensure X_train and X_test have the same columns
common_columns = X_train.columns.intersection(X_test.columns)
X_train = X_train[common_columns]
X_test = X_test[common_columns]

# Scale features
numeric_features = X_train.select_dtypes(include=['float64', 'int64']).columns
scaler = StandardScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

# Split data
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y, test_size=0.2, random_state=42, stratify=y
)

# XGBoost parameters
params = {
    'n_estimators': 1000,
    'max_depth': 8,
    'learning_rate': 0.01,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 3,
    'gamma': 0.1,
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'random_state': 42,
    'n_jobs': -1,
    'tree_method': 'hist',
    'enable_categorical': True,
    'early_stopping_rounds':50
}

# Train model
print("\nTraining model...")
model = xgb.XGBClassifier(**params)
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    
    verbose=100
)

# Evaluate model
y_pred = model.predict(X_val)
print("\nClassification Report:")
print(classification_report(y_val, y_pred, target_names=le.classes_))

weighted_f1 = f1_score(y_val, y_pred, average='weighted')
print(f"\nWeighted F1 Score: {weighted_f1:.4f}")

# Feature importance
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_
})
print("\nTop 10 Most Important Features:")
print(feature_importance.sort_values('importance', ascending=False).head(10))

# Make predictions on test set
print("\nMaking predictions on test set...")
test_predictions = model.predict(X_test)
test_predictions_labels = le.inverse_transform(test_predictions)

# Create submission file
submission = pd.DataFrame({
    'id': range(len(test_predictions_labels)),
    'customer_experience': test_predictions_labels
})
submission.to_csv('submission12.csv', index=False)
print("\nSubmission file created successfully!")

Loading datasets...

Preparing features...

Training model...
[0]	validation_0-mlogloss:1.09495
[100]	validation_0-mlogloss:0.84505
[200]	validation_0-mlogloss:0.76715
[300]	validation_0-mlogloss:0.73963
[400]	validation_0-mlogloss:0.72753
[500]	validation_0-mlogloss:0.72212
[600]	validation_0-mlogloss:0.71978
[700]	validation_0-mlogloss:0.71851
[800]	validation_0-mlogloss:0.71792
[900]	validation_0-mlogloss:0.71770
[999]	validation_0-mlogloss:0.71759

Classification Report:
              precision    recall  f1-score   support

         bad       0.60      0.71      0.65     13708
        good       0.72      0.67      0.70     18278
     neutral       0.82      0.69      0.75      9408

    accuracy                           0.69     41394
   macro avg       0.71      0.69      0.70     41394
weighted avg       0.70      0.69      0.69     41394


Weighted F1 Score: 0.6946

Top 10 Most Important Features:
                           feature  importance
60        received_date_is_weeke