In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import LabelEncoder

def prepare_data(df):
    """
    Prepare dataset by handling dates and categorical variables
    """
    # Create a copy to avoid modifying original data
    data = df.copy()
    
    # Fill missing values
    data.fillna(0, inplace=True)
    
    # Convert date columns to datetime and extract features
    date_cols = ['Date_Registered', 'payment_datetime', 'purchased_datetime', 
                 'released_date', 'estimated_delivery_date', 'received_date']
    
    for col in date_cols:
        if col in data.columns:
            data[col] = pd.to_datetime(data[col], errors='coerce')
            # Extract numerical features from dates
            data[f'{col}_year'] = data[col].dt.year.fillna(-1).astype(int)
            data[f'{col}_month'] = data[col].dt.month.fillna(-1).astype(int)
            data[f'{col}_day'] = data[col].dt.day.fillna(-1).astype(int)
            # Drop original date column
            data = data.drop(columns=[col])
    
    # Encode categorical variables
    categorical_cols = ['Gender', 'Is_current_loyalty_program_member', 'loyalty_tier',
                       'payment_method', 'purchase_medium', 'shipping_method',
                       'product_category']
    
    # Only encode categorical columns that exist in the dataset
    existing_cat_cols = [col for col in categorical_cols if col in data.columns]
    data = pd.get_dummies(data, columns=existing_cat_cols, drop_first=True)
    
    return data

# Load datasets
train_data = pd.read_csv('train_dataset.csv')
test_data = pd.read_csv('test_dataset.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# Prepare features
X_train_full = prepare_data(train_data)
X_test = prepare_data(test_data)

# Remove non-feature columns from training data
cols_to_drop = ['customer_experience', 'user_id', 'transaction_id', 
                'order_id', 'tracking_number']
feature_cols = [col for col in X_train_full.columns 
                if col not in cols_to_drop]

X = X_train_full[feature_cols]
y = train_data['customer_experience']

# Encode target variable
le = LabelEncoder()
y = le.fit_transform(y)

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Ensure test data has same columns as training data
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Initialize and train the model
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

# Train the model
model.fit(X_train, y_train)

# Make predictions on validation set
y_pred = model.predict(X_val)

# Print classification report
print("\nClassification Report:")
print(classification_report(y_val, y_pred, 
                          target_names=le.classes_))

# Calculate weighted F1 score
weighted_f1 = f1_score(y_val, y_pred, average='weighted')
print(f"\nWeighted F1 Score: {weighted_f1:.4f}")

# Feature importance analysis
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_
})
print("\nTop 10 Most Important Features:")
print(feature_importance.sort_values('importance', 
                                   ascending=False).head(10))

# Make predictions on test set
test_predictions = model.predict(X_test)
test_predictions_labels = le.inverse_transform(test_predictions)

# Create submission file using the same format as sample_submission.csv
submission = pd.DataFrame({
    'id': range(len(test_predictions_labels)),  # Sequential IDs starting from 0
    'customer_experience': test_predictions_labels
})

# Verify submission format matches sample
print("\nVerifying submission format:")
print(f"Number of rows in submission: {len(submission)}")
print(f"Number of rows in sample submission: {len(sample_submission)}")
print("\nSubmission head:")
print(submission.head())
print("\nSample submission head:")
print(sample_submission.head())

# Verify value counts
print("\nValue counts in predictions:")
print(submission['customer_experience'].value_counts())

# Save submission file
submission.to_csv('submission2.csv', index=False)
print("\nSubmission file created successfully!")

FileNotFoundError: [Errno 2] No such file or directory: 'sample_submission.csv'