In [1]:
# ==========================================================
# Feature Correlation and Visualization
# ==========================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression


# Load the cleaned dataset (Phase 2 output).
# The path is relative, so it is resolved against the current working
# directory of the running kernel/interpreter.
DATA_PATH = "phase2_features_baseline_models.csv"

try:
    combined_df = pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
    # Re-raise the same exception type (callers catching FileNotFoundError
    # still work) but keep the errno and add guidance on how to recover.
    raise FileNotFoundError(
        err.errno,
        "Dataset not found in the current working directory; export it from "
        "Phase 2 first or point DATA_PATH at its location",
        DATA_PATH,
    ) from err

print("Data loaded successfully!")
print("Shape:", combined_df.shape)
print(combined_df.head())

# Step 1: Correlation Heatmap
# Pairwise correlations, restricted to the numeric columns of the frame.
corr = combined_df.corr(numeric_only=True)

# Draw on an explicitly created Axes instead of relying on pyplot's
# implicit "current axes" state.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corr,
    annot=True,  # write each coefficient inside its cell
    fmt=".2f",
    cmap="YlGnBu",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Feature Correlation with Reimbursement")
fig.tight_layout()
plt.show()

# Step 2: Identify Top Correlations with Reimbursement
# Take the target's column from the correlation matrix and order it from the
# most positive to the most negative coefficient. The target's own entry is
# still part of this series.
corr_target = corr["reimbursement"].sort_values(ascending=False)
# The heading's leading symbol was mojibake (UTF-8 bytes decoded as cp1252);
# the intended character is restored here.
print("\n🔹 Correlation of Each Feature with Reimbursement:")
print(corr_target)

# Step 3: Plot Top Drivers
# Exclude the target's self-correlation, then pick the five features with the
# largest *absolute* correlation. Ranking on signed values would drop a
# strongly negative driver in favour of a weak positive one. The signed
# values are kept for plotting so the direction of each relationship stays
# visible; when every correlation is positive the result is unchanged.
feature_corr = corr_target.drop("reimbursement")
top_features = feature_corr.loc[feature_corr.abs().nlargest(5).index]

plt.figure(figsize=(6,4))
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` — confirm against the installed version before changing the call.
sns.barplot(x=top_features.values, y=top_features.index, palette="Greens_r")
plt.title("Top 5 Key Drivers of Reimbursement")
plt.xlabel("Correlation Strength")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()

# Define features and target
# Response column and the predictor columns fed to the regression.
target = "reimbursement"
features = [
    "trip_duration_days",
    "miles_traveled",
    "total_receipts_amount",
    "cost_per_day",
    "cost_per_mile",
]

# Design matrix and response vector, selected straight from the loaded frame.
X, y = combined_df[features], combined_df[target]

# Fit model
# fit() returns the estimator itself, so construction and fitting can chain.
lin_reg = LinearRegression().fit(X, y)

print(" Linear Regression model trained successfully!")


# Step 4: Coefficient Importance from Linear Regression
# Label each fitted coefficient with its feature name, smallest value first.
# NOTE(review): the model is fit on X as selected (no scaling is visible in
# this script), so these are per-unit slopes; comparing bar lengths across
# features measured in different units may mislead — consider standardising.
coefs = pd.Series(lin_reg.coef_, index=features).sort_values()

fig, ax = plt.subplots(figsize=(7, 5))
sns.barplot(x=coefs.values, y=coefs.index, palette="viridis", ax=ax)
ax.set(
    title="Feature Impact Based on Linear Regression Coefficients",
    xlabel="Coefficient Value",
    ylabel="Feature",
)
fig.tight_layout()
plt.show()

# Step 5: Key Insight Summary
# Mis-encoded characters in the messages (UTF-8 bytes decoded as cp1252) are
# restored to the intended compass emoji, em dash and right single quote.
# NOTE(review): these statements are fixed text, not derived from the values
# computed in this script, and 'cost_ratio' is not one of the modelled
# feature columns — confirm they still match the data before publishing.
print("\n🧭 Insights from Feature Analysis:")
print("- 'total_receipts_amount' and 'miles_traveled' are the strongest predictors of reimbursement.")
print("- 'trip_duration_days' has a moderate but consistent effect.")
print("- Derived features like 'cost_per_mile' and 'cost_ratio' add context but lower direct correlation, showing indirect impact.")
print("- These findings confirm that the reimbursement logic is primarily driven by trip cost and travel distance — aligning with ACME’s legacy payout behavior.")


FileNotFoundError: [Errno 2] No such file or directory: 'phase2_features_baseline_models.csv'