# Data Visualization - Making Charts

## Learning Objectives
By the end of this lesson, you will be able to:
- Create basic charts (line, bar, scatter plots)
- Customize chart colors, labels, and titles
- Make charts that tell a clear story
- Choose the right chart type for your data

## Core Concepts
- **Matplotlib**: Main library for creating charts
- **Line Plot**: Shows trends over time
- **Bar Chart**: Compares categories
- **Scatter Plot**: Shows relationships between two variables
- **Histogram**: Shows distribution of data

## 1. Basic Charts

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Line plot: sales for the first six months of the year
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun']
sales = [120, 135, 158, 142, 167, 189]

# Object-oriented API: create the figure and its single axes together
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(months, sales, marker='o', linewidth=2)  # circle on every data point
ax.set(title='Monthly Sales', xlabel='Month', ylabel='Sales ($)')
ax.grid(True)  # grid lines make individual values easier to read off
plt.show()

# Bar chart: quantity sold per product, one color per bar
products = ['A', 'B', 'C', 'D']
quantities = [23, 45, 56, 78]
bar_colors = ['red', 'green', 'blue', 'orange']

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(products, quantities, color=bar_colors)
ax.set(title='Product Sales', xlabel='Product', ylabel='Quantity Sold')
plt.show()

# Scatter plot: y follows x plus some random noise
np.random.seed(42)  # fixed seed so the same points are drawn on every run
x = np.random.randn(50)
noise = np.random.randn(50) * 0.5
y = x + noise

fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(x, y, alpha=0.6)  # partial transparency reveals overlapping points
ax.set(
    title='Relationship Between X and Y',
    xlabel='X values',
    ylabel='Y values',
)
plt.show()

# Histogram: distribution of 100 simulated test scores
test_scores = np.random.normal(loc=75, scale=10, size=100)
average_score = np.mean(test_scores)

fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(test_scores, bins=15, alpha=0.7, color='skyblue')
# Dashed vertical line marks where the average falls within the distribution
ax.axvline(average_score, color='red', linestyle='--', label='Average')
ax.set(
    title='Test Score Distribution',
    xlabel='Score',
    ylabel='Number of Students',
)
ax.legend()
plt.show()

print("Basic charts: line, bar, scatter, histogram")

## 2. Better Charts with Seaborn

In [None]:
import seaborn as sns

# Sample data: 100 employees, 25 in each of four departments
np.random.seed(42)  # fixed seed so every run builds the same table

# Salary (mean, standard deviation) per department, listed in draw order
salary_params = {
    'Sales': (65000, 10000),
    'Marketing': (58000, 8000),
    'IT': (75000, 12000),
    'HR': (55000, 7000),
}

# Draw 25 salaries per department, then join them into one 100-value array
salaries = np.concatenate([
    np.random.normal(mean, spread, 25)
    for mean, spread in salary_params.values()
])

employees = pd.DataFrame({
    'department': np.repeat(list(salary_params), 25),
    'salary': salaries,
    'experience': np.random.randint(0, 15, 100),  # integers from 0 to 14
})

# Box plot: salary spread of each department, side by side
plt.figure(figsize=(8, 5))
box_ax = sns.boxplot(x='department', y='salary', data=employees)
box_ax.set_title('Salary by Department')
plt.xticks(rotation=45)  # tilt the department names
plt.show()

# Heatmap of pairwise correlations between the numeric columns
numeric_data = employees.select_dtypes(include=[np.number])
correlation = numeric_data.corr()

plt.figure(figsize=(6, 5))
# annot=True writes each coefficient in its cell; center=0 pins the
# midpoint of the diverging colormap at zero correlation
heatmap_ax = sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0)
heatmap_ax.set_title('Correlation Matrix')
plt.show()

# Scatter plot where the point color encodes the department
plt.figure(figsize=(8, 5))
scatter_ax = sns.scatterplot(
    data=employees,
    x='experience',
    y='salary',
    hue='department',
)
scatter_ax.set_title('Salary vs Experience by Department')
plt.show()

# Histogram of salaries with a smoothed density curve (kde) on top
plt.figure(figsize=(8, 5))
dist_ax = sns.histplot(data=employees, x='salary', kde=True)
dist_ax.set_title('Salary Distribution')
plt.show()

print("Seaborn makes prettier charts with less code!")

## 3. Multiple Charts and Customization

In [None]:
# Four different chart types arranged on a 2x2 grid in a single figure
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
(ax_line, ax_bar), (ax_pie, ax_trend) = axes  # one name per grid cell

# Top-left: revenue per month as a line
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun']
revenue = [100, 120, 140, 110, 160, 180]
ax_line.plot(months, revenue, marker='o')
ax_line.set_title('Monthly Revenue')

# Top-right: sales per region as bars
regions = ['North', 'South', 'East', 'West']
sales = [250, 300, 200, 180]
ax_bar.bar(regions, sales, color='lightblue')
ax_bar.set_title('Sales by Region')

# Bottom-left: expense shares as a pie, each wedge labelled with its percentage
expenses = [40, 30, 20, 10]
labels = ['Salaries', 'Rent', 'Marketing', 'Other']
ax_pie.pie(expenses, labels=labels, autopct='%1.1f%%')
ax_pie.set_title('Expense Breakdown')

# Bottom-right: noisy points scattered around the line y = 2x
x = np.arange(10)
trend = x * 2
y = trend + np.random.randn(10) * 2
ax_trend.scatter(x, y)
ax_trend.plot(x, trend, 'r--', alpha=0.7)  # dashed red reference line
ax_trend.set_title('Sales vs Advertising')

plt.tight_layout()  # keep titles and tick labels from overlapping
plt.show()

# Time series with two y-axes: price as a line (left axis) and
# volume as bars (right axis), both plotted against the same dates
dates = pd.date_range('2023-01-01', periods=30, freq='D')
price = 100 + np.cumsum(np.random.randn(30) * 0.5)  # random walk starting near 100
volume = np.random.randint(1000, 5000, 30)

fig, ax1 = plt.subplots(figsize=(10, 5))
ax2 = ax1.twinx()  # second axes sharing ax1's x-axis, with its own y-axis

# Price on left axis
ax1.plot(dates, price, 'b-', label='Price')
ax1.set_ylabel('Price ($)', color='b')
ax1.tick_params(axis='y', labelcolor='b')

# Volume on right axis
ax2.bar(dates, volume, alpha=0.3, color='r', label='Volume')
ax2.set_ylabel('Volume', color='r')
ax2.tick_params(axis='y', labelcolor='r')

# Style ax1 explicitly: twinx() gives the twin an invisible x-axis, and
# pyplot-level calls such as plt.xticks()/plt.title() act on the current
# axes, which may be that twin, leaving the visible date labels unrotated.
ax1.set_title('Stock Price and Volume')
ax1.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

# Custom styling: hex color, square markers, shaded area, dotted grid
x = [1, 2, 3, 4, 5]
y = [10, 25, 30, 35, 40]
accent_color = '#FF6B6B'  # shared by the line and the shaded area

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(x, y, marker='s', markersize=8, linewidth=3, color=accent_color)
ax.fill_between(x, y, alpha=0.3, color=accent_color)  # shade under the line
ax.set_title('Custom Styled Chart', fontsize=16, fontweight='bold')
ax.set_xlabel('Time Period', fontsize=12)
ax.set_ylabel('Value', fontsize=12)
ax.grid(True, linestyle=':', alpha=0.6)
plt.show()

print("Advanced charts: subplots, dual axes, custom styling")

## 4. Practice Exercises

Complete these exercises to master data visualization concepts:

In [None]:
# Exercise 1: Sales dashboard
# Six months of online and in-store sales plus the customer count
sales_data = pd.DataFrame({
    'month': ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun'],
    'online': [120, 135, 158, 142, 167, 189],
    'store': [98, 112, 125, 108, 134, 145],
    'total_customers': [500, 520, 580, 520, 610, 670]
})

# 2x2 dashboard; unpack the grid so each panel has a descriptive name
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
(ax_trends, ax_totals), (ax_customers, ax_dist) = axes

# Top-left: monthly trend of both channels on the same axes
ax_trends.plot(sales_data['month'], sales_data['online'], 'o-', label='Online')
ax_trends.plot(sales_data['month'], sales_data['store'], 's-', label='Store')
ax_trends.set_title('Sales Trends')
ax_trends.legend()

# Top-right: six-month total per channel
total_online = sales_data['online'].sum()
total_store = sales_data['store'].sum()
ax_totals.bar(
    ['Online', 'Store'],
    [total_online, total_store],
    color=['blue', 'orange'],
)
ax_totals.set_title('Total Sales by Channel')

# Bottom-left: customer count per month
ax_customers.plot(
    sales_data['month'], sales_data['total_customers'], 'g-', marker='o'
)
ax_customers.set_title('Customer Growth')
ax_customers.set_ylabel('Number of Customers')

# Bottom-right: distribution of all monthly sales values, both channels pooled
all_sales = [*sales_data['online'], *sales_data['store']]
ax_dist.hist(all_sales, bins=8, alpha=0.7, color='purple')
ax_dist.set_title('Sales Distribution')

plt.tight_layout()
plt.show()

# Exercise 2: Student performance analysis
# Scores of six students in three subjects, plus their grade level
students = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve', 'Frank'],
    'math': [85, 78, 92, 88, 76, 95],
    'science': [88, 82, 89, 91, 79, 93],
    'english': [92, 85, 87, 89, 88, 90],
    'grade_level': [9, 10, 9, 10, 9, 10]
})

subjects = ['math', 'science', 'english']

# Per-student average across the three subjects (row-wise mean)
students['average'] = students[subjects].mean(axis=1)

# Class-wide average for each subject (column-wise mean)
averages = [students[column].mean() for column in subjects]

# Two panels side by side: subject averages and a multi-variable scatter
fig, (ax_subjects, ax_scores) = plt.subplots(1, 2, figsize=(10, 4))

ax_subjects.bar(subjects, averages, color=['red', 'green', 'blue'])
ax_subjects.set_title('Average Scores by Subject')
ax_subjects.set_ylabel('Average Score')

# Position shows math vs science; marker size encodes the English score
# and marker color encodes the grade level
sns.scatterplot(
    data=students,
    x='math',
    y='science',
    size='english',
    hue='grade_level',
    sizes=(50, 200),
    ax=ax_scores,
)
ax_scores.set_title('Math vs Science (size=English, color=Grade)')

plt.tight_layout()
plt.show()

# Exercise 3: Weather analysis
# Fourteen days of temperature, humidity and rainfall readings
weather = pd.DataFrame({
    'day': range(1, 15),
    'temperature': [22, 25, 23, 27, 24, 26, 21, 28, 25, 23, 29, 26, 24, 27],
    'humidity': [60, 55, 65, 50, 62, 48, 70, 45, 58, 67, 42, 52, 61, 49],
    'rainfall': [0, 0, 5, 0, 12, 0, 8, 0, 0, 15, 0, 0, 3, 0]
})

# Three panels in one row
fig, (ax_temp, ax_humid, ax_rain) = plt.subplots(1, 3, figsize=(12, 4))

# Left: temperature for each day as a red line with circle markers
ax_temp.plot(weather['day'], weather['temperature'], 'ro-', label='Temperature')
ax_temp.set(xlabel='Day', ylabel='Temperature (°C)', title='Daily Temperature')
ax_temp.grid(True)

# Middle: humidity plotted against temperature
ax_humid.scatter(weather['temperature'], weather['humidity'], alpha=0.7)
ax_humid.set(
    xlabel='Temperature (°C)',
    ylabel='Humidity (%)',
    title='Temperature vs Humidity',
)

# Right: rainfall amounts, keeping only the days on which it rained
rainy_days = weather.loc[weather['rainfall'].gt(0)]
ax_rain.bar(rainy_days['day'], rainy_days['rainfall'], color='lightblue')
ax_rain.set(xlabel='Day', ylabel='Rainfall (mm)', title='Rainy Days')

plt.tight_layout()
plt.show()

print("Practice exercises: dashboards, comparisons, relationships")