# Exploratory Data Analysis (EDA) Notebook

This notebook explores the vendor performance data to identify patterns and insights.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Read the cleaned vendor dataset into `df`, the frame every later cell uses.
# The path is relative, so it resolves against the notebook's working directory.
CLEANED_DATA_PATH = '../data/processed/cleaned_vendor_data.csv'
df = pd.read_csv(CLEANED_DATA_PATH)

In [None]:
# Distribution of key metrics: one histogram per metric on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Column -> panel title. Insertion order is the panel order; `axes.flat`
# walks the grid row by row (top-left, top-right, bottom-left, bottom-right).
panel_titles = {
    'delivery_performance': 'Distribution of Delivery Performance',
    'quality_score': 'Distribution of Quality Score',
    'cost_efficiency': 'Distribution of Cost Efficiency',
    'responsiveness_score': 'Distribution of Responsiveness Score',
}

for panel, (column, title) in zip(axes.flat, panel_titles.items()):
    sns.histplot(data=df, x=column, ax=panel)
    panel.set_title(title)

fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlation between the four performance metrics, drawn as an
# annotated heatmap. `center=0` puts the midpoint of the diverging colormap
# at zero correlation.
metric_columns = [
    'delivery_performance',
    'quality_score',
    'cost_efficiency',
    'responsiveness_score',
]
correlation_matrix = df[metric_columns].corr()

corr_fig, corr_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=corr_ax)
corr_ax.set_title('Correlation Matrix of Vendor Performance Metrics')
plt.show()

In [None]:
# Spread of quality scores within each vendor category, one box per category.
category_fig, category_ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='category', y='quality_score', ax=category_ax)
category_ax.set_title('Quality Score by Vendor Category')

# Tilt the category labels so long names do not overlap.
plt.setp(category_ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Performance trends over time: quality-score history of the top vendors.

# Parse the dates so sorting and the x-axis are chronological rather than
# lexicographic on the raw strings. This is idempotent, so re-running the
# cell is safe.
df['metric_date'] = pd.to_datetime(df['metric_date'])
df_sorted = df.sort_values('metric_date')

# "Top" is taken to mean the highest mean quality score over all of a vendor's
# evaluations (NOTE(review): confirm this is the intended ranking). Slicing
# `unique()[:5]` would only return the first five names in row order, which
# says nothing about performance and would not match the plot title.
N_TOP_VENDORS = 5
top_vendors = (
    df_sorted.groupby('vendor_name')['quality_score']
    .mean()
    .nlargest(N_TOP_VENDORS)
    .index
)

plt.figure(figsize=(14, 8))
for vendor in top_vendors:  # best-ranked vendor first, so the legend is ordered by rank
    vendor_data = df_sorted[df_sorted['vendor_name'] == vendor]
    plt.plot(vendor_data['metric_date'], vendor_data['quality_score'], label=vendor, marker='o')

plt.title(f'Quality Score Trends Over Time (Top {N_TOP_VENDORS} Vendors)')
plt.xlabel('Date')
plt.ylabel('Quality Score')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Summary statistics by vendor: the mean of each performance metric, plus
# `evaluation_count`, the number of non-null `metric_date` entries per vendor.
# Values are rounded to two decimals.
vendor_summary = (
    df.groupby('vendor_name')
    .agg(
        delivery_performance=('delivery_performance', 'mean'),
        quality_score=('quality_score', 'mean'),
        cost_efficiency=('cost_efficiency', 'mean'),
        responsiveness_score=('responsiveness_score', 'mean'),
        evaluation_count=('metric_date', 'count'),
    )
    .round(2)
)
vendor_summary.head(10)