# Exploratory Data Analysis (EDA)

This notebook performs comprehensive exploratory data analysis on the Customers dataset.

## Objectives:
1. Load and inspect the dataset
2. Analyze missing values and data quality
3. Generate summary statistics
4. Create visualizations
5. Perform data quality checks

## 1. Import Libraries and Setup

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path

# Silence every warning category so the cell outputs stay uncluttered
warnings.simplefilter('ignore')

# Plot defaults: seaborn's white grid theme and a wide default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Input CSV and output directory, both resolved relative to the current
# working directory (two levels up from it)
project_root = Path('../..')
data_path = project_root / 'data' / 'Customers.csv'
results_path = project_root / 'results'
results_path.mkdir(exist_ok=True)  # no error if the directory already exists

print("Libraries imported successfully!")
print(f"Data path: {data_path}")
print(f"Results path: {results_path}")

## 2. Load Data


In [None]:
# Read the customers CSV into a DataFrame
df = pd.read_csv(data_path)

banner = "=" * 50
print(banner, "DATASET LOADED SUCCESSFULLY", banner, sep="\n")

# Report the table dimensions and the column labels
n_rows, n_cols = df.shape
print(f"\nDataset Shape: {df.shape}")
print(f"Rows: {n_rows}, Columns: {n_cols}")
print(f"\nColumn Names: {df.columns.to_list()}")


## 3. Data Overview


In [None]:
# Print the frame's schema summary (df.info writes directly to stdout)
banner = "=" * 50
print(f"{banner}\nDATASET INFORMATION\n{banner}")
df.info()


In [None]:
# Preview the top of the table; the trailing expression is the cell's display value
print("\n=== First 5 Rows ===")
df.head(n=5)


## 4. Missing Values Analysis


In [None]:
# Count nulls per column and express each count as a percentage of all rows
missing_values = df.isna().sum()
missing_percentage = missing_values / len(df) * 100

# One row per column: its name, null count and null percentage
missing_df = pd.DataFrame({
    'Column': missing_values.index,
    'Missing Count': missing_values.values,
    'Missing Percentage': missing_percentage.values,
})

banner = "=" * 50
print(banner, "MISSING VALUES ANALYSIS", banner, sep="\n")

# Print only the columns that actually contain nulls
result_df = missing_df.loc[missing_df['Missing Count'] > 0]
if result_df.empty:
    print("✓ No missing values found in the dataset!")
else:
    print(result_df)

# Leave the full table as the cell's display value
missing_df


## 5. Duplicate Records Analysis


In [None]:
# Mark rows that repeat an earlier row across all columns, then count them
duplicate_mask = df.duplicated()
duplicate_count = duplicate_mask.sum()

banner = "=" * 50
print(banner, "DUPLICATE RECORDS ANALYSIS", banner, sep="\n")
print(f"Number of duplicate records: {duplicate_count}")

if duplicate_count == 0:
    print("✓ No duplicate records found!")
else:
    # Show the repeated rows themselves so they can be inspected
    print("\nDuplicate records:")
    print(df[duplicate_mask])


## 6. Summary Statistics


In [None]:
# Descriptive statistics; the trailing expression is the cell's display value
banner = "=" * 50
print(banner, "NUMERICAL COLUMNS SUMMARY", banner, sep="\n")
df.describe()


In [None]:
# Per-column summary of the object-dtype columns: distinct values, the modal
# value and how often the most frequent value occurs
banner = "=" * 50
print(banner, "CATEGORICAL COLUMNS SUMMARY", banner, sep="\n")

categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    modes = df[col].mode()
    counts = df[col].value_counts()

    # Both results are empty when the column has no non-null values
    most_frequent = modes[0] if len(modes) > 0 else 'N/A'
    frequency = counts.iloc[0] if len(counts) > 0 else 'N/A'

    print(f"\n{col}:")
    print(f"  Unique values: {df[col].nunique()}")
    print(f"  Most frequent: {most_frequent}")
    print(f"  Frequency: {frequency}")


## 7. Data Visualizations


In [None]:
# Histogram of the CustomerID values, written to the results directory as a PNG
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(df['CustomerID'], bins=20, edgecolor='black', alpha=0.7, color='steelblue')
ax.set(
    xlabel='Customer ID',
    ylabel='Frequency',
    title='Distribution of Customer IDs',
)
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig(results_path / 'customer_id_distribution.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Bar chart of customers per country, written to the results directory as a PNG
country_counts = df['Country'].value_counts()

fig, ax = plt.subplots(figsize=(14, 8))
country_counts.plot.bar(ax=ax, color='steelblue', edgecolor='black')
# Labels are set after plotting so they replace any defaults pandas applied
ax.set(
    xlabel='Country',
    ylabel='Number of Customers',
    title='Customer Distribution by Country',
)
plt.xticks(rotation=45, ha='right')  # slant the tick labels
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
fig.savefig(results_path / 'country_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nTop 5 Countries:")
print(country_counts.head(5))


In [None]:
# City distribution (top 20)
# value_counts() returns counts in descending order, so the ranking is
# computed once and sliced for both the chart and the printed table.
city_counts_all = df['City'].value_counts()
city_counts = city_counts_all.head(20)

fig, ax = plt.subplots(figsize=(14, 8))
city_counts.plot(kind='barh', ax=ax, color='coral', edgecolor='black')
# A horizontal bar chart places the first row at the bottom; flip the y-axis
# so the city with the most customers is at the top and the chart reads in
# rank order.
ax.invert_yaxis()
ax.set_xlabel('Number of Customers')
ax.set_ylabel('City')
ax.set_title('Top 20 Cities by Customer Count')
ax.grid(axis='x', alpha=0.3)
fig.tight_layout()
fig.savefig(results_path / 'city_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nTop 10 Cities:")
print(city_counts_all.head(10))


## 8. Data Quality Checks


In [None]:
# Data quality checks: blank text values and CustomerID integrity
banner = "=" * 50
print(banner)
print("DATA QUALITY CHECKS")
print(banner)

# With its default settings pd.read_csv parses empty fields as NaN, so an
# exact `== ''` comparison cannot match on a frame loaded that way. The blanks
# that do survive loading are whitespace-only strings, so count every string
# value that is empty once stripped. Non-string values (numbers, NaN) are
# skipped by the isinstance test.
print("\n=== Empty String Check ===")
empty_strings = {}
for col in df.columns:
    empty_count = sum(
        1 for value in df[col] if isinstance(value, str) and not value.strip()
    )
    if empty_count > 0:
        empty_strings[col] = empty_count
        print(f"{col}: {empty_count} empty or whitespace-only strings")
if not empty_strings:
    print("✓ No empty strings found")

# Check data consistency
print("\n=== Data Consistency Check ===")
customer_ids = df['CustomerID']
print(f"Unique Customer IDs: {customer_ids.nunique()}")
print(f"Total Records: {len(df)}")

# nunique() ignores NaN, so comparing it with len(df) would report rows with a
# missing ID as duplicates. Missing and duplicated IDs are tested separately.
missing_id_count = int(customer_ids.isnull().sum())
has_duplicate_ids = bool(customer_ids.dropna().duplicated().any())
if has_duplicate_ids:
    print("⚠ Warning: Duplicate Customer IDs found")
else:
    print("✓ All Customer IDs are unique")
if missing_id_count > 0:
    print(f"⚠ Warning: {missing_id_count} records have no Customer ID")


## 9. Summary and Key Insights


In [None]:
# EDA Summary: headline counts followed by insights derived from those counts
banner = "=" * 50
print(banner)
print("EDA SUMMARY")
print(banner)

# Compute each figure once so the numbered facts and the insights below
# cannot disagree with each other.
total_missing = int(df.isnull().sum().sum())
total_duplicates = int(df.duplicated().sum())
country_total = df['Country'].nunique()
city_total = df['City'].nunique()

print(f"\n1. Dataset contains {df.shape[0]} customers with {df.shape[1]} attributes")
print(f"2. Countries represented: {country_total}")
print(f"3. Cities represented: {city_total}")
print(f"4. Missing values: {total_missing}")
print(f"5. Duplicate records: {total_duplicates}")

# mode() is empty when a column holds no non-null values, so guard the lookup
country_modes = df['Country'].mode()
if len(country_modes) > 0:
    print(f"\n6. Most common country: {country_modes[0]} ({df['Country'].value_counts().iloc[0]} customers)")
city_modes = df['City'].mode()
if len(city_modes) > 0:
    print(f"7. Most common city: {city_modes[0]} ({df['City'].value_counts().iloc[0]} customers)")

# Each insight is printed only when the data supports it; the positive
# wording is unchanged for a clean dataset.
ids_are_unique = df['CustomerID'].dropna().is_unique

print("\n=== Key Insights ===")
if total_missing == 0:
    print("• The dataset appears to be clean with no missing values")
else:
    print(f"• The dataset has {total_missing} missing values to address")
if ids_are_unique:
    print("• All customer IDs are unique")
else:
    print("• Duplicate customer IDs are present")
# NOTE(review): "more than one country" is a judgement-call threshold for
# calling the data geographically diverse; adjust if a stricter bar is wanted.
if country_total > 1:
    print("• The dataset has good geographical diversity")
else:
    print("• The dataset has limited geographical diversity")
if total_missing == 0 and total_duplicates == 0 and ids_are_unique:
    print("• Ready for further statistical and ML analysis")
else:
    print("• Resolve the data quality issues above before further analysis")
print("\n✓ Visualizations saved to results/ directory")
