In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, PowerTransformer
from sklearn.metrics import classification_report, f1_score
import lightgbm as lgb
from datetime import datetime
from sklearn.decomposition import PCA

def prepare_data(df):
    """
    Enhanced data preparation with advanced feature engineering
    """
    # Create a copy to avoid modifying original data
    data = df.copy()
    
    # Fill missing values with appropriate strategies
    numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = data.select_dtypes(include=['object']).columns
    
    # Fill numeric columns with median
    for col in numeric_cols:
        data[col] = data[col].fillna(data[col].median())
    
    # Fill categorical columns with mode
    for col in categorical_cols:
        data[col] = data[col].fillna(data[col].mode()[0])
    
    # Convert date columns to datetime and extract features
    date_cols = ['Date_Registered', 'payment_datetime', 'purchased_datetime', 
                 'released_date', 'estimated_delivery_date', 'received_date']
    
    for col in date_cols:
        if col in data.columns:
            data[col] = pd.to_datetime(data[col], errors='coerce')
            # Extract numerical features from dates
            data[f'{col}_year'] = data[col].dt.year.fillna(-1).astype(int)
            data[f'{col}_month'] = data[col].dt.month.fillna(-1).astype(int)
            data[f'{col}_day'] = data[col].dt.day.fillna(-1).astype(int)
            data[f'{col}_dayofweek'] = data[col].dt.dayofweek.fillna(-1).astype(int)
            data[f'{col}_quarter'] = data[col].dt.quarter.fillna(-1).astype(int)
            data[f'{col}_is_weekend'] = (data[col].dt.dayofweek >= 5).astype(int)
            data[f'{col}_is_month_end'] = (data[col].dt.is_month_end).astype(int)
            data[f'{col}_is_month_start'] = (data[col].dt.is_month_start).astype(int)
            data[f'{col}_hour'] = data[col].dt.hour.fillna(-1).astype(int)
    
    # Calculate time differences between all date pairs
    date_cols_dt = [col for col in date_cols if col in data.columns]
    for i in range(len(date_cols_dt)):
        for j in range(i + 1, len(date_cols_dt)):
            col1, col2 = date_cols_dt[i], date_cols_dt[j]
            diff_name = f'{col1.split("_")[0]}_{col2.split("_")[0]}_diff_days'
            data[diff_name] = (data[col2] - data[col1]).dt.total_seconds() / (24*3600)
    
    # Drop original date columns
    for col in date_cols:
        if col in data.columns:
            data = data.drop(columns=[col])
    
    # Create price-related features
    if 'Product_value' in data.columns and 'final_payment' in data.columns:
        data['discount_amount'] = data['Product_value'] - data['final_payment']
        data['discount_percentage'] = (data['discount_amount'] / data['Product_value'] * 100).clip(0, 100)
        data['price_tier'] = pd.qcut(data['Product_value'], q=10, labels=False, duplicates='drop')
        data['price_to_loyalty_ratio'] = data['Product_value'] / (data['loyalty_points_redeemed'] + 1)
        
        # Log transform price features
        data['log_product_value'] = np.log1p(data['Product_value'])
        data['log_final_payment'] = np.log1p(data['final_payment'])
    
    # Enhanced loyalty features
    if 'loyalty_points_redeemed' in data.columns:
        data['has_redeemed_points'] = (data['loyalty_points_redeemed'] > 0).astype(int)
        data['log_loyalty_points'] = np.log1p(data['loyalty_points_redeemed'])
        data['points_to_value_ratio'] = data['loyalty_points_redeemed'] / (data['Product_value'] + 1)
    
    # Aggregate discount features
    discount_cols = [col for col in data.columns if 'discount_percentage' in col]
    if discount_cols:
        data['total_discount'] = data[discount_cols].sum(axis=1)
        data['max_discount'] = data[discount_cols].max(axis=1)
        data['min_discount'] = data[discount_cols].min(axis=1)
        data['discount_range'] = data['max_discount'] - data['min_discount']
    
    # Interaction features
    if 'loyalty_tier' in data.columns:
        data['loyalty_tier'] = data['loyalty_tier'].astype(str)
        data['loyalty_tier_num'] = LabelEncoder().fit_transform(data['loyalty_tier'])
        data['tier_price_interaction'] = data['loyalty_tier_num'] * data['Product_value']
    
    # Encode categorical variables
    categorical_cols = ['Gender', 'Is_current_loyalty_program_member', 'loyalty_tier',
                       'payment_method', 'purchase_medium', 'shipping_method',
                       'product_category']
    
    # Only encode categorical columns that exist in the dataset
    existing_cat_cols = [col for col in categorical_cols if col in data.columns]
    
    # Use LabelEncoder for categorical variables
    le_dict = {}
    for col in existing_cat_cols:
        le_dict[col] = LabelEncoder()
        data[col] = le_dict[col].fit_transform(data[col].astype(str))
        
    return data

# Load datasets
print("Loading datasets...")
train_data = pd.read_csv('train_dataset.csv')
test_data = pd.read_csv('test_dataset.csv')

print("\nTrain data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

# Prepare features
print("\nPreparing features...")
X_train_full = prepare_data(train_data)
X_test = prepare_data(test_data)

# Remove non-feature columns from training data
cols_to_drop = ['customer_experience', 'user_id', 'transaction_id', 
                'order_id', 'tracking_number']
feature_cols = [col for col in X_train_full.columns 
                if col not in cols_to_drop]

X = X_train_full[feature_cols]
y = train_data['customer_experience']

# Scale numerical features and apply power transform
numeric_features = X.select_dtypes(include=['float64', 'int64']).columns
scaler = StandardScaler()
power = PowerTransformer(method='yeo-johnson')

X_numeric = X[numeric_features]
X_numeric_scaled = scaler.fit_transform(X_numeric)
X_numeric_transformed = power.fit_transform(X_numeric_scaled)
X[numeric_features] = X_numeric_transformed

# Add PCA features
pca = PCA(n_components=10)
pca_features = pca.fit_transform(X_numeric_transformed)
for i in range(pca_features.shape[1]):
    X[f'pca_feature_{i}'] = pca_features[:, i]

print("\nNumber of features:", len(X.columns))

# Encode target variable
le = LabelEncoder()
y = le.fit_transform(y)

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Prepare test data
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
X_test_numeric = X_test[numeric_features]
X_test_numeric_scaled = scaler.transform(X_test_numeric)
X_test_numeric_transformed = power.transform(X_test_numeric_scaled)
X_test[numeric_features] = X_test_numeric_transformed

# Add PCA features to test data
test_pca_features = pca.transform(X_test_numeric_transformed)
for i in range(test_pca_features.shape[1]):
    X_test[f'pca_feature_{i}'] = test_pca_features[:, i]

# LightGBM parameters
params = {
    'objective': 'multiclass',
    'num_class': len(np.unique(y)),
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.01,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'max_depth': 7,
    'min_child_samples': 20,
    'min_child_weight': 0.001,
    'reg_alpha': 0.1,
    'reg_lambda': 1,
    'random_state': 42,
    'n_jobs': -1
}

# Initialize and train model with StratifiedKFold
print("\nTraining LightGBM model with cross-validation...")
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
cv_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
    X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_fold_train, y_fold_val = y_train[train_idx], y_train[val_idx]
    
    # Create LightGBM datasets
    train_data = lgb.Dataset(X_fold_train, label=y_fold_train)
    val_data = lgb.Dataset(X_fold_val, label=y_fold_val, reference=train_data)
    
    # Train model
    fold_model = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(50, verbose=False)]
    )
    
    # Make predictions
    fold_pred = np.argmax(fold_model.predict(X_fold_val), axis=1)
    fold_score = f1_score(y_fold_val, fold_pred, average='weighted')
    cv_scores.append(fold_score)
    print(f"Fold {fold} F1 Score: {fold_score:.4f}")

print(f"\nMean CV F1 score: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores) * 2:.4f})")

# Train final model on full training data
print("\nTraining final model...")
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

final_model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[val_data],
    callbacks=[lgb.early_stopping(50, verbose=False)]
)

# Make predictions on validation set
y_pred = np.argmax(final_model.predict(X_val), axis=1)

# Print classification report
print("\nClassification Report:")
print(classification_report(y_val, y_pred, target_names=le.classes_))

# Calculate weighted F1 score
weighted_f1 = f1_score(y_val, y_pred, average='weighted')
print(f"\nWeighted F1 Score: {weighted_f1:.4f}")

# Feature importance analysis
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': final_model.feature_importance()
})
print("\nTop 10 Most Important Features:")
print(feature_importance.sort_values('importance', ascending=False).head(10))

# Make predictions on test set
print("\nMaking predictions on test set...")
test_predictions = np.argmax(final_model.predict(X_test), axis=1)
test_predictions_labels = le.inverse_transform(test_predictions)

# Create submission file
submission = pd.DataFrame({
    'id': range(len(test_predictions_labels)),
    'customer_experience': test_predictions_labels
})

# Save submission file
submission.to_csv('submission9.csv', index=False)
print("\nSubmission file created successfully!")

Loading datasets...

Train data shape: (206969, 26)
Test data shape: (137971, 25)

Preparing features...


  data['min_discount'] = data[discount_cols].min(axis=1)
  data['discount_range'] = data['max_discount'] - data['min_discount']
  data['loyalty_tier_num'] = LabelEncoder().fit_transform(data['loyalty_tier'])
  data['tier_price_interaction'] = data['loyalty_tier_num'] * data['Product_value']
  data['discount_range'] = data['max_discount'] - data['min_discount']
  data['loyalty_tier_num'] = LabelEncoder().fit_transform(data['loyalty_tier'])
  data['tier_price_interaction'] = data['loyalty_tier_num'] * data['Product_value']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numeric_features] = X_numeric_transformed
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: ht


Number of features: 109

Training LightGBM model with cross-validation...
Fold 1 F1 Score: 0.6961
Fold 2 F1 Score: 0.6929
Fold 3 F1 Score: 0.6904
Fold 4 F1 Score: 0.6911
Fold 5 F1 Score: 0.6905

Mean CV F1 score: 0.6922 (+/- 0.0043)

Training final model...

Classification Report:
              precision    recall  f1-score   support

         bad       0.60      0.71      0.65     13708
        good       0.73      0.67      0.70     18278
     neutral       0.81      0.70      0.75      9408

    accuracy                           0.69     41394
   macro avg       0.71      0.69      0.70     41394
weighted avg       0.70      0.69      0.69     41394


Weighted F1 Score: 0.6933

Top 10 Most Important Features:
                          feature  importance
84                discount_amount       11638
77     payment_received_diff_days        9905
96                 discount_range        2701
63        received_date_dayofweek        2464
108                 pca_feature_9        2410
