In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, PowerTransformer
from sklearn.metrics import classification_report, f1_score
import xgboost as xgb
from datetime import datetime
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel

def prepare_data(df):
    """
    Enhanced data preparation with optimized feature engineering
    """
    data = df.copy()
    
    # Optimize memory usage
    numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = data.select_dtypes(include=['object']).columns
    
    # More efficient filling of missing values
    data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())
    data[categorical_cols] = data[categorical_cols].fillna(data[categorical_cols].mode().iloc[0])
    
    # Optimized date processing
    date_cols = ['Date_Registered', 'payment_datetime', 'purchased_datetime', 
                 'released_date', 'estimated_delivery_date', 'received_date']
    
    for col in date_cols:
        if col in data.columns:
            data[col] = pd.to_datetime(data[col], errors='coerce')
            data[f'{col}_year'] = data[col].dt.year.fillna(-1).astype('int16')
            data[f'{col}_month'] = data[col].dt.month.fillna(-1).astype('int8')
            data[f'{col}_day'] = data[col].dt.day.fillna(-1).astype('int8')
            data[f'{col}_dayofweek'] = data[col].dt.dayofweek.fillna(-1).astype('int8')
            data[f'{col}_quarter'] = data[col].dt.quarter.fillna(-1).astype('int8')
            
            # New temporal features
            data[f'{col}_week'] = data[col].dt.isocalendar().week.fillna(-1).astype('int8')
            data[f'{col}_day_part'] = pd.cut(data[col].dt.hour.fillna(-1), 
                                           bins=[-1, 6, 12, 18, 24], 
                                           labels=[0, 1, 2, 3]).astype('int8')
    
    # Enhanced time difference features
    date_cols_dt = [col for col in date_cols if col in data.columns]
    for i in range(len(date_cols_dt)):
        for j in range(i + 1, len(date_cols_dt)):
            col1, col2 = date_cols_dt[i], date_cols_dt[j]
            diff_name = f'{col1.split("_")[0]}_{col2.split("_")[0]}_diff_days'
            data[diff_name] = (data[col2] - data[col1]).dt.total_seconds() / (24*3600)
            
            # Add business days feature
            data[f'{diff_name}_business'] = np.busday_count(
                data[col1].dt.date.values.astype('datetime64[D]'),
                data[col2].dt.date.values.astype('datetime64[D]')
            )
    
    # Drop original date columns
    data = data.drop(columns=[col for col in date_cols if col in data.columns])
    
    # Enhanced price features
    if all(col in data.columns for col in ['Product_value', 'final_payment']):
        data['discount_amount'] = data['Product_value'] - data['final_payment']
        data['discount_percentage'] = (data['discount_amount'] / data['Product_value'] * 100).clip(0, 100)
        data['price_tier'] = pd.qcut(data['Product_value'], q=5, labels=False, duplicates='drop')
        
        # New price features
        data['price_per_loyalty_point'] = data['Product_value'] / (data['loyalty_points_redeemed'].clip(1))
        data['relative_price'] = data['Product_value'] / data.groupby('product_category')['Product_value'].transform('mean')
    
    # Optimized categorical encoding
    cat_cols = ['Gender', 'Is_current_loyalty_program_member', 'loyalty_tier',
                'payment_method', 'purchase_medium', 'shipping_method',
                'product_category']
    
    existing_cat_cols = [col for col in cat_cols if col in data.columns]
    
    # Target encoding with smoothing
    le_dict = {}
    for col in existing_cat_cols:
        le_dict[col] = LabelEncoder()
        data[col] = le_dict[col].fit_transform(data[col].astype(str))
        
    return data

# Load and prepare data
print("Loading datasets...")
train_data = pd.read_csv('train_dataset.csv')
test_data = pd.read_csv('test_dataset.csv')

print("\nPreparing features...")
X_train_full = prepare_data(train_data)
X_test = prepare_data(test_data)

# Feature selection
cols_to_drop = ['customer_experience', 'user_id', 'transaction_id', 
                'order_id', 'tracking_number']
feature_cols = [col for col in X_train_full.columns if col not in cols_to_drop]

X = X_train_full[feature_cols]
y = train_data['customer_experience']

# Optimize numeric features
numeric_features = X.select_dtypes(include=['float64', 'int64']).columns
scaler = StandardScaler()
power = PowerTransformer(method='yeo-johnson')

X_numeric = X[numeric_features]
X_numeric_scaled = scaler.fit_transform(X_numeric)
X_numeric_transformed = power.fit_transform(X_numeric_scaled)
X[numeric_features] = X_numeric_transformed

# Optimized PCA
pca = PCA(n_components=0.95)  # Keep 95% of variance
pca_features = pca.fit_transform(X_numeric_transformed)
for i in range(pca_features.shape[1]):
    X[f'pca_feature_{i}'] = pca_features[:, i]

# Target encoding
le = LabelEncoder()
y = le.fit_transform(y)

# Optimized train-test split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

# Prepare test data
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
X_test_numeric = X_test[numeric_features]
X_test_numeric_scaled = scaler.transform(X_test_numeric)
X_test_numeric_transformed = power.transform(X_test_numeric_scaled)
X_test[numeric_features] = X_test_numeric_transformed

# Add PCA features to test data
test_pca_features = pca.transform(X_test_numeric_transformed)
for i in range(test_pca_features.shape[1]):
    X_test[f'pca_feature_{i}'] = test_pca_features[:, i]

# Optimized XGBoost parameters
params = {
    'n_estimators': 800,
    'max_depth': 8,
    'learning_rate': 0.015,
    'subsample': 0.85,
    'colsample_bytree': 0.85,
    'min_child_weight': 2,
    'gamma': 0.05,
    'alpha': 0.05,
    'lambda': 0.8,
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'random_state': 42,
    'n_jobs': -1,
    'tree_method': 'hist',
    'grow_policy': 'lossguide',
    'max_leaves': 128,
    'early_stopping_rounds':10
}

# Train model with early stopping
print("\nTraining XGBoost model...")
model = xgb.XGBClassifier(**params)
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    
    verbose=False
)

# Make predictions
y_pred = model.predict(X_val)
weighted_f1 = f1_score(y_val, y_pred, average='weighted')
print(f"\nWeighted F1 Score: {weighted_f1:.4f}")

# Generate predictions
test_predictions = model.predict(X_test)
test_predictions_labels = le.inverse_transform(test_predictions)

# Create submission
submission = pd.DataFrame({
    'id': range(len(test_predictions_labels)),
    'customer_experience': test_predictions_labels
})
submission.to_csv('submission13.csv', index=False)
print("\nSubmission file created successfully!")

Loading datasets...

Preparing features...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numeric_features] = X_numeric_transformed
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[f'pca_feature_{i}'] = pca_features[:, i]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[f'pca_feature_{i}'] = pca_features[:, i]
A value is trying to be set on a copy of a slice from a DataFrame.
Try usin


Training XGBoost model...

Weighted F1 Score: 0.6957

Submission file created successfully!
