In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, f1_score

# Load datasets
orders_path = '/content/drive/MyDrive/Affinity Task 3/orders.csv'  # Replace with your file path
submission_path = '/content/drive/MyDrive/Affinity Task 3/c3_submission_form.csv'  # Replace with your file path
orders_data = pd.read_csv(orders_path)
submission_data = pd.read_csv(submission_path)

# Drop irrelevant columns
columns_to_drop = [
    'dim_order_key', 'dim_organization_key', 'dim_product_key',
    'dim_status_key', 'dim_team_key', 'dim_user_key', 'dsp',
    'src_modified_by', 'src_modified_on', 'src_record_updated_time',
    'src_record_updated_time_utc', 'stage_id', 'status_key',
    'primary_manager_user_key', 'secondry_manager_user_key'
]
orders_data_cleaned = orders_data.drop(columns=columns_to_drop, errors='ignore')

# Process date/time columns conditionally
date_columns = ['customer_intime', 'startdate', 'order_due_time', 'order_in_time']
for col in date_columns:
    if col in orders_data_cleaned.columns:
        orders_data_cleaned[col] = pd.to_datetime(orders_data_cleaned[col], errors='coerce')
        orders_data_cleaned[f'{col}_dayofweek'] = orders_data_cleaned[col].dt.dayofweek
        orders_data_cleaned[f'{col}_hourofday'] = orders_data_cleaned[col].dt.hour
        orders_data_cleaned = orders_data_cleaned.drop(columns=[col])

# Frequency encode categorical columns
categorical_columns = orders_data_cleaned.select_dtypes(include=['object']).columns
for col in categorical_columns:
    freq_encoding = orders_data_cleaned[col].value_counts(normalize=True)
    orders_data_cleaned[col] = orders_data_cleaned[col].map(freq_encoding)

# Impute missing values
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(orders_data_cleaned.drop(columns=['is_late'], errors='ignore'))
y_train = orders_data_cleaned['is_late']

# Train the Random Forest model
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

# Preprocess test data
submission_data_cleaned = submission_data.drop(columns=columns_to_drop, errors='ignore')
for col in date_columns:
    if col in submission_data_cleaned.columns:
        submission_data_cleaned[col] = pd.to_datetime(submission_data_cleaned[col], errors='coerce')
        submission_data_cleaned[f'{col}_dayofweek'] = submission_data_cleaned[col].dt.dayofweek
        submission_data_cleaned[f'{col}_hourofday'] = submission_data_cleaned[col].dt.hour
        submission_data_cleaned = submission_data_cleaned.drop(columns=[col])

# Frequency encoding for categorical columns in the test dataset
for col in categorical_columns:
    if col in submission_data_cleaned.columns:  # Only process if the column exists
        freq_encoding = orders_data_cleaned[col].value_counts(normalize=True)
        submission_data_cleaned[col] = submission_data_cleaned[col].map(freq_encoding)

# Align columns between training and test datasets
missing_cols = set(orders_data_cleaned.columns) - set(submission_data_cleaned.columns)
for col in missing_cols:
    submission_data_cleaned[col] = 0  # Add missing columns with default value

extra_cols = set(submission_data_cleaned.columns) - set(orders_data_cleaned.columns)
submission_data_cleaned = submission_data_cleaned.drop(columns=extra_cols, errors='ignore')

# Match feature names explicitly
train_features = orders_data_cleaned.drop(columns=['is_late'], errors='ignore').columns
submission_data_cleaned = submission_data_cleaned[train_features]

# Impute missing values in the test dataset
X_test_submission = imputer.transform(submission_data_cleaned)

# Predict outcomes for the test dataset
submission_predictions = rf_model.predict(X_test_submission)

# Add predictions to the submission dataset
submission_data['is_late'] = submission_predictions

# Save predictions to a CSV file
output_path = 'submission_predictions.csv'  # Output file path
submission_data[['dim_order_key', 'is_late']].to_csv(output_path, index=False)

print(f"Predictions saved to {output_path}")


Predictions saved to submission_predictions.csv


In [None]:
from sklearn.metrics import f1_score

# For training data
y_pred_train = rf_model.predict(X_train)  # Predict on the training set
f1_training_score = f1_score(y_train, y_pred_train)  # Calculate F1 Score for training data
print(f"F1 Score on Training Data: {f1_training_score}")

# For test data
y_pred_test = rf_model.predict(X_test_submission)  # Predict on the test set
# NOTE: Test data doesn't have ground truth labels; this is just for consistency.
# You need true labels (if available) for test F1 score calculation.


F1 Score on Training Data: 0.9972013551333039


In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

# Cost Parameters
cost_fp = 300  # Cost of assigning extra resources for predicted late orders
cost_fn = 700  # Penalty for failing to predict late deliveries

# Predicted probabilities for the test dataset
submission_predictions_proba = rf_model.predict_proba(X_test_submission)[:, 1]  # Probability of being late

# Define thresholds to evaluate
thresholds = np.linspace(0.1, 0.9, 9)  # Evaluate thresholds from 0.1 to 0.9
total_costs = []

# Simulate predictions and calculate total cost for each threshold
for threshold in thresholds:
    # Adjust predictions based on the threshold
    adjusted_predictions = (submission_predictions_proba >= threshold).astype(int)

    # Calculate confusion matrix components
    tn, fp, fn, tp = confusion_matrix(submission_data['is_late'], adjusted_predictions).ravel()

    # Compute total cost
    total_cost = (fp * cost_fp) + (fn * cost_fn)
    total_costs.append((threshold, total_cost))

# Find the optimal threshold with minimum cost
optimal_threshold, min_cost = min(total_costs, key=lambda x: x[1])

# Output results
print(f"Optimal Threshold: {optimal_threshold}")
print(f"Minimum Total Cost: ${min_cost}")

# Final predictions using the optimal threshold
final_predictions = (submission_predictions_proba >= optimal_threshold).astype(int)

# Evaluate the model at the optimal threshold
print("\nClassification Report at Optimal Threshold:")
print(classification_report(submission_data['is_late'], final_predictions))

# Cost breakdown
predicted_late = sum(final_predictions)
actual_late = sum(submission_data['is_late'])
cost_late_orders = predicted_late * cost_fp
cost_failed_predictions = sum((submission_data['is_late'] != final_predictions) & (submission_data['is_late'] == 1)) * cost_fn

print(f"Predicted Late Orders: {predicted_late}")
print(f"Actual Late Orders: {actual_late}")
print(f"Cost of Assigning Extra Resources: ${cost_late_orders}")
print(f"Cost of Failed Predictions (Penalties): ${cost_failed_predictions}")
print(f"Total Cost at Optimal Threshold: ${cost_late_orders + cost_failed_predictions}")


Optimal Threshold: 0.5
Minimum Total Cost: $36600

Classification Report at Optimal Threshold:
              precision    recall  f1-score   support

       False       1.00      0.93      0.96      1698
        True       0.93      1.00      0.97      1700

    accuracy                           0.96      3398
   macro avg       0.97      0.96      0.96      3398
weighted avg       0.97      0.96      0.96      3398

Predicted Late Orders: 1822
Actual Late Orders: 1700
Cost of Assigning Extra Resources: $546600
Cost of Failed Predictions (Penalties): $0
Total Cost at Optimal Threshold: $546600
