In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, PowerTransformer
from sklearn.metrics import classification_report, f1_score
import lightgbm as lgb
from datetime import datetime
from sklearn.decomposition import PCA

def prepare_data(df):
    """
    Enhanced data preparation with advanced feature engineering
    """
    data = df.copy()
    
    # Basic preprocessing
    numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = data.select_dtypes(include=['object']).columns
    
    for col in numeric_cols:
        data[col] = data[col].fillna(data[col].median())
    
    for col in categorical_cols:
        data[col] = data[col].fillna(data[col].mode()[0])
    
    # Date features
    date_cols = ['Date_Registered', 'payment_datetime', 'purchased_datetime', 
                 'released_date', 'estimated_delivery_date', 'received_date']
    
    for col in date_cols:
        if col in data.columns:
            data[col] = pd.to_datetime(data[col], errors='coerce')
            # Basic date features
            data[f'{col}_year'] = data[col].dt.year.fillna(-1).astype(int)
            data[f'{col}_month'] = data[col].dt.month.fillna(-1).astype(int)
            data[f'{col}_day'] = data[col].dt.day.fillna(-1).astype(int)
            data[f'{col}_dayofweek'] = data[col].dt.dayofweek.fillna(-1).astype(int)
            data[f'{col}_quarter'] = data[col].dt.quarter.fillna(-1).astype(int)
            data[f'{col}_hour'] = data[col].dt.hour.fillna(-1).astype(int)
            
            # Advanced date features
            data[f'{col}_is_weekend'] = (data[col].dt.dayofweek >= 5).astype(int)
            data[f'{col}_is_month_end'] = (data[col].dt.is_month_end).astype(int)
            data[f'{col}_is_month_start'] = (data[col].dt.is_month_start).astype(int)
            data[f'{col}_week'] = data[col].dt.isocalendar().week.fillna(-1).astype(int)
            data[f'{col}_is_holiday'] = ((data[col].dt.month == 12) & (data[col].dt.day >= 20)).astype(int)
            data[f'{col}_part_of_day'] = pd.cut(data[col].dt.hour, 
                                               bins=[-np.inf, 6, 12, 18, np.inf], 
                                               labels=[0, 1, 2, 3]).astype(float).fillna(-1).astype(int)
    
    # Time differences and derived features
    date_cols_dt = [col for col in date_cols if col in data.columns]
    for i in range(len(date_cols_dt)):
        for j in range(i + 1, len(date_cols_dt)):
            col1, col2 = date_cols_dt[i], date_cols_dt[j]
            diff_days = f'{col1.split("_")[0]}_{col2.split("_")[0]}_diff_days'
            data[diff_days] = (data[col2] - data[col1]).dt.total_seconds() / (24*3600)
            
            # Add derived time difference features
            data[f'{diff_days}_squared'] = data[diff_days] ** 2
            data[f'{diff_days}_log'] = np.log1p(np.abs(data[diff_days]))
            data[f'{diff_days}_is_delayed'] = (data[diff_days] > 0).astype(int)
    
    for col in date_cols:
        if col in data.columns:
            data = data.drop(columns=[col])
    
    # Enhanced price features
    if 'Product_value' in data.columns and 'final_payment' in data.columns:
        data['discount_amount'] = data['Product_value'] - data['final_payment']
        data['discount_percentage'] = (data['discount_amount'] / data['Product_value'] * 100).clip(0, 100)
        data['price_tier'] = pd.qcut(data['Product_value'], q=20, labels=False, duplicates='drop')
        data['final_payment_tier'] = pd.qcut(data['final_payment'], q=20, labels=False, duplicates='drop')
        
        # Advanced price features
        data['log_product_value'] = np.log1p(data['Product_value'])
        data['log_final_payment'] = np.log1p(data['final_payment'])
        data['price_to_discount_ratio'] = data['Product_value'] / (data['discount_amount'] + 1)
        data['high_value_purchase'] = (data['price_tier'] >= 15).astype(int)
        data['low_value_purchase'] = (data['price_tier'] <= 5).astype(int)
        
        # Binned features
        data['discount_level'] = pd.qcut(data['discount_percentage'], q=5, labels=False, duplicates='drop')
    
    # Enhanced loyalty features
    if 'loyalty_points_redeemed' in data.columns:
        data['has_redeemed_points'] = (data['loyalty_points_redeemed'] > 0).astype(int)
        data['log_loyalty_points'] = np.log1p(data['loyalty_points_redeemed'])
        data['loyalty_tier_numeric'] = pd.qcut(data['loyalty_points_redeemed'], q=10, labels=False, duplicates='drop')
        data['high_loyalty'] = (data['loyalty_tier_numeric'] >= 7).astype(int)
        
        # Interaction features
        if 'Product_value' in data.columns:
            data['loyalty_price_ratio'] = data['loyalty_points_redeemed'] / (data['Product_value'] + 1)
            data['loyalty_discount_interaction'] = data['loyalty_points_redeemed'] * data['discount_percentage']
    
    # Categorical encoding
    categorical_cols = ['Gender', 'Is_current_loyalty_program_member', 'loyalty_tier',
                       'payment_method', 'purchase_medium', 'shipping_method',
                       'product_category']
    
    existing_cat_cols = [col for col in categorical_cols if col in data.columns]
    
    for col in existing_cat_cols:
        data[col] = LabelEncoder().fit_transform(data[col].astype(str))
        
        # Create frequency encoding
        freq_map = data[col].value_counts(normalize=True).to_dict()
        data[f'{col}_freq'] = data[col].map(freq_map)
    
    return data

# Main execution code
print("Loading datasets...")
train_data = pd.read_csv('train_dataset.csv')
test_data = pd.read_csv('test_dataset.csv')

print("\nTrain data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

print("\nPreparing features...")
X_train_full = prepare_data(train_data)
X_test = prepare_data(test_data)

cols_to_drop = ['customer_experience', 'user_id', 'transaction_id', 
                'order_id', 'tracking_number']
feature_cols = [col for col in X_train_full.columns 
                if col not in cols_to_drop]

X = X_train_full[feature_cols]
y = train_data['customer_experience']

# Feature processing
numeric_features = X.select_dtypes(include=['float64', 'int64']).columns
scaler = StandardScaler()
power = PowerTransformer(method='yeo-johnson')

X_numeric = X[numeric_features]
X_numeric_scaled = scaler.fit_transform(X_numeric)
X_numeric_transformed = power.fit_transform(X_numeric_scaled)
X[numeric_features] = X_numeric_transformed

# PCA features
pca = PCA(n_components=15)  # Increased from 10
pca_features = pca.fit_transform(X_numeric_transformed)
for i in range(pca_features.shape[1]):
    X[f'pca_feature_{i}'] = pca_features[:, i]

print("\nNumber of features:", len(X.columns))

# Target encoding
le = LabelEncoder()
y = le.fit_transform(y)

# Data split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Prepare test data
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
X_test_numeric = X_test[numeric_features]
X_test_numeric_scaled = scaler.transform(X_test_numeric)
X_test_numeric_transformed = power.transform(X_test_numeric_scaled)
X_test[numeric_features] = X_test_numeric_transformed

# Add PCA features to test data
test_pca_features = pca.transform(X_test_numeric_transformed)
for i in range(test_pca_features.shape[1]):
    X_test[f'pca_feature_{i}'] = test_pca_features[:, i]

# Optimized LightGBM parameters
params = {
    'objective': 'multiclass',
    'num_class': len(np.unique(y)),
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 63,  # Increased from default
    'learning_rate': 0.005,  # Decreased for better generalization
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'max_depth': 8,
    'min_child_samples': 20,
    'min_child_weight': 0.001,
    'reg_alpha': 0.1,
    'reg_lambda': 1,
    'random_state': 42,
    'n_jobs': -1,
    'force_col_wise': True
}

print("\nTraining LightGBM model with cross-validation...")
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
cv_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
    X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_fold_train, y_fold_val = y_train[train_idx], y_train[val_idx]
    
    train_data = lgb.Dataset(X_fold_train, label=y_fold_train)
    val_data = lgb.Dataset(X_fold_val, label=y_fold_val, reference=train_data)
    
    fold_model = lgb.train(
        params,
        train_data,
        num_boost_round=2000,  # Increased from 1000
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(100, verbose=False)]  # Increased patience
    )
    
    fold_pred = np.argmax(fold_model.predict(X_fold_val), axis=1)
    fold_score = f1_score(y_fold_val, fold_pred, average='weighted')
    cv_scores.append(fold_score)
    print(f"Fold {fold} F1 Score: {fold_score:.4f}")

print(f"\nMean CV F1 score: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores) * 2:.4f})")

# Train final model
print("\nTraining final model...")
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

final_model = lgb.train(
    params,
    train_data,
    num_boost_round=2000,
    valid_sets=[val_data],
    callbacks=[lgb.early_stopping(100, verbose=False)]
)

y_pred = np.argmax(final_model.predict(X_val), axis=1)

print("\nClassification Report:")
print(classification_report(y_val, y_pred, target_names=le.classes_))

weighted_f1 = f1_score(y_val, y_pred, average='weighted')
print(f"\nWeighted F1 Score: {weighted_f1:.4f}")

feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': final_model.feature_importance()
})
print("\nTop 10 Most Important Features:")
print(feature_importance.sort_values('importance', ascending=False).head(10))

print("\nMaking predictions on test set...")
test_predictions = np.argmax(final_model.predict(X_test), axis=1)
test_predictions_labels = le.inverse_transform(test_predictions)

submission = pd.DataFrame({
    'id': range(len(test_predictions_labels)),
    'customer_experience': test_predictions_labels
})

submission.to_csv('submission10.csv', index=False)
print("\nSubmission file created successfully!")

Loading datasets...

Train data shape: (206969, 26)
Test data shape: (137971, 25)

Preparing features...


  data[f'{diff_days}_log'] = np.log1p(np.abs(data[diff_days]))
  data[f'{diff_days}_is_delayed'] = (data[diff_days] > 0).astype(int)
  data[diff_days] = (data[col2] - data[col1]).dt.total_seconds() / (24*3600)
  data[f'{diff_days}_squared'] = data[diff_days] ** 2
  data[f'{diff_days}_log'] = np.log1p(np.abs(data[diff_days]))
  data[f'{diff_days}_is_delayed'] = (data[diff_days] > 0).astype(int)
  data[diff_days] = (data[col2] - data[col1]).dt.total_seconds() / (24*3600)
  data[f'{diff_days}_squared'] = data[diff_days] ** 2
  data[f'{diff_days}_log'] = np.log1p(np.abs(data[diff_days]))
  data[f'{diff_days}_is_delayed'] = (data[diff_days] > 0).astype(int)
  data[diff_days] = (data[col2] - data[col1]).dt.total_seconds() / (24*3600)
  data[f'{diff_days}_squared'] = data[diff_days] ** 2
  data[f'{diff_days}_log'] = np.log1p(np.abs(data[diff_days]))
  data[f'{diff_days}_is_delayed'] = (data[diff_days] > 0).astype(int)
  data[diff_days] = (data[col2] - data[col1]).dt.total_seconds() / (24*3600


Number of features: 185

Training LightGBM model with cross-validation...
Fold 1 F1 Score: 0.6952
Fold 2 F1 Score: 0.6922
Fold 3 F1 Score: 0.6892
Fold 4 F1 Score: 0.6920
Fold 5 F1 Score: 0.6901

Mean CV F1 score: 0.6917 (+/- 0.0041)

Training final model...

Classification Report:
              precision    recall  f1-score   support

         bad       0.60      0.71      0.65     13708
        good       0.73      0.67      0.70     18278
     neutral       0.82      0.70      0.75      9408

    accuracy                           0.69     41394
   macro avg       0.71      0.69      0.70     41394
weighted avg       0.70      0.69      0.69     41394


Weighted F1 Score: 0.6942

Top 10 Most Important Features:
                                feature  importance
147                     discount_amount       32280
120  payment_received_diff_days_squared       11781
153             price_to_discount_ratio       11764
119          payment_received_diff_days       11575
180             