# 01 - Data Exploration

This notebook demonstrates basic data exploration with pandas, plus visualization with Matplotlib and seaborn.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*';
# older releases only ship the legacy 'seaborn' name. Pick whichever one this
# installation provides so the cell does not raise on an older Matplotlib.
style_name = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(style_name)
# Make seaborn's "husl" palette the default colour cycle for the plots below.
sns.set_palette("husl")

print("Libraries loaded successfully!")

## Load Customer Data

In [None]:
# Read the customer table; the path is resolved relative to the current
# working directory.
customers = pd.read_csv(filepath_or_buffer='../data/customers.csv')

# Report the (rows, columns) tuple as text first ...
print(f"Dataset shape: {customers.shape}")
print("\nFirst 5 rows:")
# ... then leave the preview as the final expression so Jupyter renders it
# as a rich table.
customers.head(n=5)

## Data Profiling

In [None]:
# Structural overview: column dtypes, non-null counts and memory usage.
print("=== DATASET INFO ===")
customers.info()  # writes its report to stdout and returns None

# Summary statistics. describe() covers the numeric columns by default when
# the frame has any; it stays the final expression so Jupyter renders the table.
print("\n=== NUMERIC SUMMARY ===")
numeric_summary = customers.describe()
numeric_summary

In [None]:
# Count the missing entries in every column of the customer table.
print("=== MISSING VALUES ===")
# isna() flags each missing cell; summing the boolean mask column-wise gives
# the per-column count, shown as the cell's output.
customers.isna().sum()

## Visualizations

In [None]:
# Histogram of customer ages, drawn on an explicitly created figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(customers['age'], bins=30, edgecolor='black', alpha=0.7)
ax.set(
    title='Age Distribution of Customers',
    xlabel='Age',
    ylabel='Count',
)
# Faint grid lines make bar heights easier to read without dominating the plot.
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Bar chart of the ten cities with the most customers.
fig, ax = plt.subplots(figsize=(12, 6))
# value_counts() sorts by descending frequency, so head(10) keeps the ten
# most common cities.
city_counts = customers['city'].value_counts().head(10)
city_counts.plot(kind='bar', ax=ax)
ax.set(
    title='Top 10 Cities by Customer Count',
    xlabel='City',
    ylabel='Number of Customers',
)
# Tilt the city names so neighbouring labels do not overlap.
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

## Load and Explore Sales Data

In [None]:
# Read the sales table; the path is resolved relative to the current
# working directory.
sales = pd.read_csv(filepath_or_buffer='../data/sales.csv')

# Report the (rows, columns) tuple as text first ...
print(f"Sales dataset shape: {sales.shape}")
print("\nFirst 5 rows:")
# ... then leave the preview as the final expression so Jupyter renders it
# as a rich table.
sales.head(n=5)

In [None]:
# Bar chart of summed sale amounts per category, largest total first.
fig, ax = plt.subplots(figsize=(10, 6))
category_sales = (
    sales.groupby('category')['amount']
    .sum()
    .sort_values(ascending=False)
)
category_sales.plot(kind='bar', ax=ax)
ax.set(
    title='Total Sales by Category',
    xlabel='Category',
    ylabel='Total Sales ($)',
)
# Tilt the category names so neighbouring labels do not overlap.
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Histogram of individual sale amounts, drawn on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(sales['amount'], bins=50, edgecolor='black', alpha=0.7)
ax.set(
    title='Distribution of Sale Amounts',
    xlabel='Amount ($)',
    ylabel='Frequency',
)
# Faint grid lines make bar heights easier to read without dominating the plot.
ax.grid(True, alpha=0.3)
plt.show()

## Key Insights

1. **Customer Demographics**: The age distribution covers a typical adult range.
2. **Geographic Distribution**: Customers are spread across major US cities.
3. **Sales Patterns**: Electronics is the leading category by total revenue.
4. **Transaction Size**: Most sales are under $500.