In [ ]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Load the prepared data written by the data-preparation step.
# NOTE(review): this is an absolute, machine-specific path; confirm it before
# running the notebook on another machine.
prepared_data_path = 'C:/Users/ADMIN/ET6-CDSP-group-17-repo/2_data_preparation/ASOS_GraphReturns/prepared_asos_data.csv'
try:
    df = pd.read_csv(prepared_data_path)
except FileNotFoundError:
    # Every later cell needs `df`, so stop here. SystemExit with a message
    # reports the failure on stderr with a non-zero status and does not depend
    # on the interactive-only `exit()` helper injected by the `site` module.
    raise SystemExit(
        f"Error: {prepared_data_path} not found. Please ensure the data preparation step was completed."
    ) from None
else:
    print(f"Successfully loaded prepared data from {prepared_data_path}")

# The "\n" escape keeps the blank separator line inside a valid single-line literal.
print("\n--- Initial Data Overview for Analysis ---")
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())


In [ ]:
# Define features (X) and target (y).
# The features exclude the identifier columns, the target itself, and any
# column still stored with object dtype (i.e. not numerically encoded).
object_cols = df.select_dtypes(include='object').columns.tolist()
cols_to_drop = ['customer_id', 'variant_id', 'product_id', 'supplier_ref_id', 'isReturned'] + object_cols

# Keep only the names that are actually present in the DataFrame.
cols_to_drop = [col for col in cols_to_drop if col in df.columns]

X = df.drop(columns=cols_to_drop, errors='ignore')
y = df['isReturned']

print("\n--- Features and Target Variable Setup ---")
print("Features (X) shape:", X.shape)
print("Target (y) shape:", y.shape)
print("X columns (first 10):", X.columns.tolist()[:10])
print("y value counts:\n", y.value_counts())

# Hold out 20% of the rows for testing. `stratify=y` keeps the class
# proportions of the target the same in both partitions, and the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("\n--- Train-Test Split ---")
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
# Normalized counts make it easy to confirm the stratification worked.
print("y_train value counts:\n", y_train.value_counts(normalize=True))
print("y_test value counts:\n", y_test.value_counts(normalize=True))


In [ ]:
# Initialize and train the RandomForestClassifier model.
# Using parameters from the original notebook for consistency, but can be tuned.
# n_jobs=-1 uses all available cores; random_state fixes the forest's randomness
# so repeated runs give the same model.
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

print("\n--- Training RandomForestClassifier Model ---")
model.fit(X_train, y_train)
print("Model training complete.")


In [ ]:
# Make predictions on the test set: hard labels for the report and confusion
# matrix, and scores for the ROC curve.
y_pred = model.predict(X_test)
# Column 1 of predict_proba is taken as the positive class (isReturned = 1).
# NOTE(review): this assumes the target is binary with classes ordered [0, 1];
# confirm against model.classes_.
y_proba = model.predict_proba(X_test)[:, 1]

print("\n--- Model Evaluation ---")
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix (rows are true labels, columns are predicted labels).
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Predicted 0', 'Predicted 1'],
            yticklabels=['Actual 0', 'Actual 1'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# ROC Curve and AUC, computed from the positive-class scores.
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:0.2f})')
# Diagonal reference line drawn for comparison with the model's curve.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
# Slight headroom above 1.0 so the curve is not clipped at the top edge.
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()


In [ ]:
# Feature Importance: pair each importance score from the fitted forest with
# its feature name and keep the 15 largest, sorted from most to least important.
importances = pd.Series(model.feature_importances_, index=X.columns)
top_features = importances.sort_values(ascending=False).head(15)

plt.figure(figsize=(10, 7))
# For a horizontal categorical bar plot seaborn already places the first
# category at the top, so the descending order above puts the most important
# feature first. Inverting the y-axis afterwards would move it to the bottom,
# so no inversion is applied.
sns.barplot(x=top_features.values, y=top_features.index, palette='viridis')
plt.title('Top 15 Features Influencing Product Returns')
plt.xlabel('Feature Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()
