# Python for Data Science
## NumPy, Pandas, and Matplotlib Complete Guide

---

## Table of Contents
1. [NumPy Fundamentals](#numpy)
2. [Pandas Basics](#pandas-basics)
3. [Pandas Data Manipulation](#pandas-manipulation)
4. [Matplotlib Visualization](#matplotlib)
5. [Seaborn for Statistical Plots](#seaborn)
6. [Data Analysis Workflow](#workflow)
7. [Practice Problems](#practice)

---
# 1. NUMPY FUNDAMENTALS

## 1.1 Creating Arrays

In [None]:
import numpy as np

# From Python list
arr = np.array([1, 2, 3, 4, 5])
print(f"1D array: {arr}")
print(f"Type: {type(arr)}")
print(f"Dtype: {arr.dtype}")
print(f"Shape: {arr.shape}")

# 2D array (matrix)
matrix = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(f"\n2D array:\n{matrix}")
print(f"Shape: {matrix.shape}")

# Specify dtype
arr_float = np.array([1, 2, 3], dtype=np.float64)
print(f"\nFloat array: {arr_float}")

In [None]:
# Special arrays
zeros = np.zeros((3, 4))
print(f"Zeros (3x4):\n{zeros}")

ones = np.ones((2, 3))
print(f"\nOnes (2x3):\n{ones}")

identity = np.eye(3)
print(f"\nIdentity (3x3):\n{identity}")

full = np.full((2, 3), 7)
print(f"\nFull of 7s (2x3):\n{full}")

# Ranges
arange = np.arange(0, 10, 2)
print(f"\nArange (0 to 10, step 2): {arange}")

linspace = np.linspace(0, 1, 5)
print(f"Linspace (0 to 1, 5 points): {linspace}")

In [None]:
# Random arrays
np.random.seed(42)  # For reproducibility

rand = np.random.rand(3, 3)  # Uniform [0, 1)
print(f"Random uniform:\n{rand}")

randn = np.random.randn(3, 3)  # Standard normal
print(f"\nRandom normal:\n{randn}")

randint = np.random.randint(0, 10, size=(3, 3))
print(f"\nRandom integers 0-9:\n{randint}")

choice = np.random.choice([1, 2, 3, 4, 5], size=10)
print(f"\nRandom choice: {choice}")

## 1.2 Indexing and Slicing

In [None]:
arr = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
print(f"Array:\n{arr}")

# Indexing
print(f"\narr[0, 0] = {arr[0, 0]}")
print(f"arr[1, 2] = {arr[1, 2]}")
print(f"arr[-1, -1] = {arr[-1, -1]}")

# Slicing
print(f"\nFirst row: {arr[0, :]}")
print(f"First column: {arr[:, 0]}")
print(f"Top-left 2x2: \n{arr[:2, :2]}")

# Step slicing
print(f"\nEvery other element: {arr[0, ::2]}")

In [None]:
# Boolean indexing: select elements with a True/False mask instead of positions
arr = np.arange(1, 11)
print(f"Array: {arr}")

# Comparing an array with a scalar yields a boolean array of the same shape
mask = np.greater(arr, 5)
print(f"Mask (>5): {mask}")
print(f"Elements >5: {arr[mask]}")

# Conditions can be written inline; combine them with & (each side parenthesized)
is_even = (arr % 2) == 0
in_open_range = (arr > 3) & (arr < 8)
print(f"Even numbers: {arr[is_even]}")
print(f"3 < x < 8: {arr[in_open_range]}")

# Fancy indexing: a list of positions picks exactly those elements
indices = [0, 2, 4]
print(f"\nIndices [0, 2, 4]: {arr[indices]}")

## 1.3 Array Operations

In [None]:
a = np.array([1, 2, 3, 4])
b = np.array([5, 6, 7, 8])

# Element-wise operations
print(f"a = {a}")
print(f"b = {b}")
print(f"\na + b = {a + b}")
print(f"a - b = {a - b}")
print(f"a * b = {a * b}")
print(f"a / b = {a / b}")
print(f"a ** 2 = {a ** 2}")
print(f"np.sqrt(a) = {np.sqrt(a)}")

# Comparison
print(f"\na > 2: {a > 2}")
print(f"a == b: {a == b}")

In [None]:
# Aggregate functions
arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(f"Array:\n{arr}")

print(f"\nSum: {arr.sum()}")
print(f"Mean: {arr.mean()}")
print(f"Std: {arr.std():.2f}")
print(f"Min: {arr.min()}")
print(f"Max: {arr.max()}")

# Along axis
print(f"\nSum along rows (axis=1): {arr.sum(axis=1)}")
print(f"Sum along cols (axis=0): {arr.sum(axis=0)}")
print(f"Mean of each row: {arr.mean(axis=1)}")

## 1.4 Broadcasting

In [None]:
# Broadcasting: operations on arrays of different shapes

# Scalar and array
arr = np.array([1, 2, 3, 4])
print(f"arr * 2 = {arr * 2}")

# 1D and 2D
matrix = np.array([[1, 2, 3], [4, 5, 6]])
row = np.array([10, 20, 30])
print(f"\nMatrix:\n{matrix}")
print(f"Row: {row}")
print(f"Matrix + Row:\n{matrix + row}")

# Column broadcasting
col = np.array([[100], [200]])
print(f"\nColumn:\n{col}")
print(f"Matrix + Column:\n{matrix + col}")

## 1.5 Reshaping and Transposing

In [None]:
arr = np.arange(12)
print(f"Original: {arr}")

# Reshape
reshaped = arr.reshape(3, 4)
print(f"\nReshaped (3x4):\n{reshaped}")

reshaped = arr.reshape(2, 2, 3)
print(f"\nReshaped (2x2x3):\n{reshaped}")

# Use -1 for automatic dimension
reshaped = arr.reshape(-1, 3)  # 4 rows
print(f"\nReshaped (-1, 3):\n{reshaped}")

# Flatten
flat = reshaped.flatten()
print(f"\nFlattened: {flat}")

# Transpose
matrix = np.array([[1, 2, 3], [4, 5, 6]])
print(f"\nOriginal:\n{matrix}")
print(f"Transposed:\n{matrix.T}")

## 1.6 Linear Algebra

In [None]:
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])

print(f"A:\n{A}")
print(f"B:\n{B}")

# Matrix multiplication
print(f"\nA @ B:\n{A @ B}")
print(f"np.dot(A, B):\n{np.dot(A, B)}")

# Determinant
print(f"\nDeterminant of A: {np.linalg.det(A)}")

# Inverse
A_inv = np.linalg.inv(A)
print(f"\nInverse of A:\n{A_inv}")
print(f"A @ A_inv:\n{A @ A_inv}")

# Eigenvalues and eigenvectors
eigenvalues, eigenvectors = np.linalg.eig(A)
print(f"\nEigenvalues: {eigenvalues}")
print(f"Eigenvectors:\n{eigenvectors}")

---
# 2. PANDAS BASICS

## 2.1 Series and DataFrame

In [None]:
import pandas as pd

# Series - 1D labeled array
s = pd.Series([1, 2, 3, 4, 5])
print("Series:")
print(s)

# With custom index
s = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
print("\nSeries with index:")
print(s)
print(f"\ns['b'] = {s['b']}")

# From dictionary
d = {'name': 'Alice', 'age': 25, 'city': 'NYC'}
s = pd.Series(d)
print("\nSeries from dict:")
print(s)

In [None]:
# DataFrame - 2D labeled data
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, 30, 35, 28],
    'City': ['NYC', 'LA', 'Chicago', 'Houston'],
    'Salary': [50000, 60000, 70000, 55000]
}

df = pd.DataFrame(data)
print("DataFrame:")
print(df)

print(f"\nShape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print(f"Index: {list(df.index)}")
print(f"Dtypes:\n{df.dtypes}")

In [None]:
# Quick look at data
print("Head (first 2 rows):")
print(df.head(2))

print("\nTail (last 2 rows):")
print(df.tail(2))

print("\nInfo:")
df.info()

print("\nDescribe (numerical columns):")
print(df.describe())

## 2.2 Selecting Data

In [None]:
# Column selection
print("Single column (Series):")
print(df['Name'])

print("\nMultiple columns (DataFrame):")
print(df[['Name', 'Age']])

# Row selection by index
print("\nRows 1-2:")
print(df[1:3])

In [None]:
# loc - label-based selection
print("loc[0] - Row 0:")
print(df.loc[0])

print("\nloc[0:2, 'Name':'City']:")
print(df.loc[0:2, 'Name':'City'])

# iloc - integer-based selection
print("\niloc[0] - Row 0:")
print(df.iloc[0])

print("\niloc[0:2, 0:2]:")
print(df.iloc[0:2, 0:2])

In [None]:
# Boolean selection
print("Age > 28:")
print(df[df['Age'] > 28])

print("\nSalary >= 55000 AND City = NYC or LA:")
print(df[(df['Salary'] >= 55000) & (df['City'].isin(['NYC', 'LA']))])

# Query method
print("\nQuery: Age > 25 and Salary > 55000:")
print(df.query('Age > 25 and Salary > 55000'))

## 2.3 Adding and Modifying Data

In [None]:
df_copy = df.copy()

# Add new column
df_copy['Bonus'] = df_copy['Salary'] * 0.1
print("With Bonus column:")
print(df_copy)

# Modify existing column
df_copy['Salary'] = df_copy['Salary'] * 1.05
print("\nAfter 5% raise:")
print(df_copy)

# Add new row
new_row = {'Name': 'Eve', 'Age': 27, 'City': 'Miami', 'Salary': 57750, 'Bonus': 5500}
df_copy = pd.concat([df_copy, pd.DataFrame([new_row])], ignore_index=True)
print("\nAfter adding Eve:")
print(df_copy)

In [None]:
# Drop columns
df_dropped = df_copy.drop(columns=['Bonus'])
print("After dropping Bonus:")
print(df_dropped)

# Drop rows
df_dropped = df_copy.drop(index=[4])
print("\nAfter dropping row 4:")
print(df_dropped)

# Rename columns
df_renamed = df.rename(columns={'Name': 'Employee', 'Salary': 'Annual_Salary'})
print("\nRenamed columns:")
print(df_renamed)

---
# 3. PANDAS DATA MANIPULATION

## 3.1 Handling Missing Data

In [None]:
# Create data with missing values: column A has one NaN, B has two, C has none
data = {
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, np.nan, 8],
    'C': [9, 10, 11, 12]
}
df_missing = pd.DataFrame(data)
print("Data with missing values:")
print(df_missing)

# Check for missing values: isnull() gives a boolean frame, sum() counts per column
print("\nIs null:")
print(df_missing.isnull())
print(f"\nNull count per column:\n{df_missing.isnull().sum()}")

# Drop rows with missing values (any NaN in the row removes it)
print("\nDrop rows with any NaN:")
print(df_missing.dropna())

# Fill missing values with a constant
print("\nFill with 0:")
print(df_missing.fillna(0))

# Passing a Series of column means fills each column with its own mean
print("\nFill with mean:")
print(df_missing.fillna(df_missing.mean()))

# Use DataFrame.ffill() rather than fillna(method='ffill'): the `method`
# keyword of fillna is deprecated (FutureWarning since pandas 2.1).
print("\nForward fill:")
print(df_missing.ffill())

## 3.2 Groupby Operations

In [None]:
# Create sample data: two employees in each of three departments
data = {
    'Department': ['Sales', 'Sales', 'IT', 'IT', 'HR', 'HR'],
    'Employee': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank'],
    'Salary': [50000, 55000, 65000, 70000, 48000, 52000],
    'Years': [3, 5, 4, 7, 2, 4]
}
df = pd.DataFrame(data)
print("Data:")
print(df)

# Group by department
grouped = df.groupby('Department')

# numeric_only=True restricts the aggregation to Salary and Years. Since
# pandas 2.0 the default is numeric_only=False, so a plain mean() raises
# TypeError on the string column 'Employee'.
print("\nMean by department:")
print(grouped.mean(numeric_only=True))

# Without numeric_only=True, sum() would concatenate the employee names.
print("\nSum by department:")
print(grouped.sum(numeric_only=True))

# count() is meaningful for every column, so no restriction is needed
print("\nCount by department:")
print(grouped.count())

In [None]:
# Several statistics of one column at once: a list of function names
# produces one output column per name
salary_stats = grouped['Salary'].agg(['mean', 'min', 'max', 'std'])
print("Multiple aggregations:")
print(salary_stats)

# Named aggregations: each keyword becomes an output column, computed from
# the (source column, function) pair given as its value
named_specs = {
    'avg_salary': ('Salary', 'mean'),
    'max_salary': ('Salary', 'max'),
    'avg_years': ('Years', 'mean'),
}
print("\nNamed aggregations:")
result = grouped.agg(**named_specs)
print(result)

## 3.3 Merging and Joining

In [None]:
# Two frames sharing the key column 'ID': IDs 1-3 appear in both,
# ID 4 only in df1 and ID 5 only in df2
df1 = pd.DataFrame({
    'ID': [1, 2, 3, 4],
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
})
df2 = pd.DataFrame({
    'ID': [1, 2, 3, 5],
    'Score': [85, 90, 78, 92],
})

print("DataFrame 1:")
print(df1)
print("\nDataFrame 2:")
print(df2)

# Inner join (the default `how`): only IDs present in both frames
inner_joined = df1.merge(df2, on='ID')
print("\nInner join:")
print(inner_joined)

# Left join: every row of df1, with NaN where df2 has no matching ID
left_joined = df1.merge(df2, on='ID', how='left')
print("\nLeft join:")
print(left_joined)

# Outer join: the union of IDs from both frames
outer_joined = df1.merge(df2, on='ID', how='outer')
print("\nOuter join:")
print(outer_joined)

## 3.4 Pivot Tables

In [None]:
# Create sales data: 12 month-end dates, products and regions cycling in
# blocks of four.
# The frequency is given as an offset object instead of the 'M' alias: 'M' is
# deprecated since pandas 2.2 (renamed 'ME'), while MonthEnd() produces the
# same month-end dates on old and new pandas alike.
data = {
    'Date': pd.date_range('2024-01-01', periods=12, freq=pd.offsets.MonthEnd()),
    'Product': ['A', 'B', 'A', 'B'] * 3,
    'Region': ['North', 'North', 'South', 'South'] * 3,
    'Sales': [100, 150, 200, 180, 120, 160, 220, 190, 140, 170, 240, 200]
}
df_sales = pd.DataFrame(data)
print("Sales data:")
print(df_sales)

# Pivot table: one row per Product, one column per Region, cells hold summed Sales
pivot = pd.pivot_table(
    df_sales,
    values='Sales',
    index='Product',
    columns='Region',
    aggfunc='sum'
)
print("\nPivot table (sum of sales):")
print(pivot)

# Multiple aggregations: a list of functions yields two-level columns
# (function name, Region)
pivot = pd.pivot_table(
    df_sales,
    values='Sales',
    index='Product',
    columns='Region',
    aggfunc=['sum', 'mean']
)
print("\nPivot table (sum and mean):")
print(pivot)

## 3.5 Apply and Transform

In [None]:
people = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'Salary': [50000, 60000, 70000],
}
df = pd.DataFrame(people)
print("Original:")
print(df)

# Series.apply: the function is called once per value of the column
df['Age_Squared'] = df['Age'].apply(lambda age: age ** 2)
print("\nWith Age_Squared:")
print(df)

# DataFrame.apply with axis=1: the function receives one row at a time,
# so it can combine several columns of that row
df['Total'] = df.apply(lambda rec: rec['Age'] + rec['Salary'] / 1000, axis=1)
print("\nWith Total:")
print(df)

# Series.map with a dict: each value is replaced by its dictionary entry
age_labels = {25: 'Young', 30: 'Middle', 35: 'Senior'}
df['Age_Group'] = df['Age'].map(age_labels)
print("\nWith Age_Group:")
print(df)

---
# 4. MATPLOTLIB VISUALIZATION

## 4.1 Basic Plots

In [None]:
import matplotlib.pyplot as plt

# Line plot
x = np.linspace(0, 10, 100)
y = np.sin(x)

plt.figure(figsize=(10, 4))
plt.plot(x, y, 'b-', linewidth=2, label='sin(x)')
plt.plot(x, np.cos(x), 'r--', linewidth=2, label='cos(x)')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Sine and Cosine Functions')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# Scatter plot
np.random.seed(42)
x = np.random.randn(100)
y = x + np.random.randn(100) * 0.5
colors = np.random.rand(100)
sizes = np.random.rand(100) * 200

plt.figure(figsize=(8, 6))
plt.scatter(x, y, c=colors, s=sizes, alpha=0.6, cmap='viridis')
plt.colorbar(label='Color value')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Scatter Plot with Colors and Sizes')
plt.show()

In [None]:
# Bar plot
categories = ['A', 'B', 'C', 'D', 'E']
values = [23, 45, 56, 78, 32]

plt.figure(figsize=(8, 5))
bars = plt.bar(categories, values, color='steelblue', edgecolor='black')

# Add value labels on bars
for bar, value in zip(bars, values):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
             str(value), ha='center', va='bottom')

plt.xlabel('Category')
plt.ylabel('Value')
plt.title('Bar Chart')
plt.show()

In [None]:
# Histogram
data = np.random.randn(1000)

plt.figure(figsize=(10, 4))

plt.subplot(1, 2, 1)
plt.hist(data, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Histogram')

plt.subplot(1, 2, 2)
plt.hist(data, bins=30, density=True, color='steelblue', edgecolor='black', alpha=0.7)
plt.xlabel('Value')
plt.ylabel('Density')
plt.title('Normalized Histogram')

plt.tight_layout()
plt.show()

## 4.2 Subplots and Multiple Plots

In [None]:
# Create figure with subplots
fig, axes = plt.subplots(2, 2, figsize=(10, 8))

# Plot 1: Line
x = np.linspace(0, 10, 100)
axes[0, 0].plot(x, np.sin(x))
axes[0, 0].set_title('Sine Wave')

# Plot 2: Scatter
axes[0, 1].scatter(np.random.rand(50), np.random.rand(50))
axes[0, 1].set_title('Random Scatter')

# Plot 3: Bar
axes[1, 0].bar(['A', 'B', 'C'], [10, 20, 15])
axes[1, 0].set_title('Bar Chart')

# Plot 4: Pie
axes[1, 1].pie([30, 40, 30], labels=['X', 'Y', 'Z'], autopct='%1.1f%%')
axes[1, 1].set_title('Pie Chart')

plt.tight_layout()
plt.show()

## 4.3 Pandas Plotting

In [None]:
# Create sample data: three random walks (cumulative sums of normal draws)
# indexed by 30 consecutive days
dates = pd.date_range('2024-01-01', periods=30)
df = pd.DataFrame({
    'A': np.random.randn(30).cumsum(),
    'B': np.random.randn(30).cumsum(),
    'C': np.random.randn(30).cumsum()
}, index=dates)

# Line plot from DataFrame: one line per column, index on the x-axis
df.plot(figsize=(10, 5), title='Time Series')
plt.ylabel('Value')
plt.show()

# Other plot types, each drawn into its own cell of a 2x2 grid via `ax=`
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

df.plot(ax=axes[0, 0], title='Line Plot')
df.plot(kind='bar', ax=axes[0, 1], title='Bar Plot')
# A random walk can end below zero, and pandas raises ValueError for negative
# pie values, so the pie shows the magnitudes of the final row instead.
df.iloc[-1].abs().plot(kind='pie', ax=axes[1, 0], title='Pie Chart', autopct='%1.1f%%')
df.plot(kind='box', ax=axes[1, 1], title='Box Plot')

plt.tight_layout()
plt.show()

---
# 5. SEABORN FOR STATISTICAL PLOTS

In [None]:
import seaborn as sns

# Set style
sns.set_style('whitegrid')

# Load sample dataset
tips = sns.load_dataset('tips')
print("Tips dataset:")
print(tips.head())

In [None]:
# Distribution plots
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

sns.histplot(tips['total_bill'], kde=True, ax=axes[0])
axes[0].set_title('Histogram with KDE')

sns.boxplot(x='day', y='total_bill', data=tips, ax=axes[1])
axes[1].set_title('Box Plot by Day')

sns.violinplot(x='day', y='total_bill', data=tips, ax=axes[2])
axes[2].set_title('Violin Plot')

plt.tight_layout()
plt.show()

In [None]:
# Relationship plots
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Scatter with regression line
sns.regplot(x='total_bill', y='tip', data=tips, ax=axes[0])
axes[0].set_title('Scatter with Regression')

# Scatter with hue
sns.scatterplot(x='total_bill', y='tip', hue='smoker', size='size',
                data=tips, ax=axes[1])
axes[1].set_title('Scatter with Hue and Size')

plt.tight_layout()
plt.show()

In [None]:
# Categorical plots
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

sns.countplot(x='day', hue='sex', data=tips, ax=axes[0])
axes[0].set_title('Count Plot')

sns.barplot(x='day', y='total_bill', hue='sex', data=tips, ax=axes[1])
axes[1].set_title('Bar Plot (mean with CI)')

plt.tight_layout()
plt.show()

In [None]:
# Heatmap (correlation matrix)
numeric_cols = tips.select_dtypes(include=[np.number])
correlation = numeric_cols.corr()

plt.figure(figsize=(8, 6))
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Pair plot
sns.pairplot(tips, hue='smoker', diag_kind='kde')
plt.suptitle('Pair Plot', y=1.02)
plt.show()

---
# 6. DATA ANALYSIS WORKFLOW

## 6.1 Complete Example: Analyzing a Dataset

In [None]:
# Load the iris dataset
iris = sns.load_dataset('iris')

# Step 1: Explore the data
print("Dataset shape:", iris.shape)
print("\nFirst few rows:")
print(iris.head())
print("\nData types:")
print(iris.dtypes)
print("\nBasic statistics:")
print(iris.describe())

In [None]:
# Step 2: Check for missing values
print("Missing values:")
print(iris.isnull().sum())

# Step 3: Value counts for categorical
print("\nSpecies distribution:")
print(iris['species'].value_counts())

In [None]:
# Step 4: Visualize distributions
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

for i, column in enumerate(iris.columns[:-1]):
    ax = axes[i // 2, i % 2]
    sns.histplot(data=iris, x=column, hue='species', kde=True, ax=ax)
    ax.set_title(f'Distribution of {column}')

plt.tight_layout()
plt.show()

In [None]:
# Step 5: Correlation analysis
numeric_cols = iris.select_dtypes(include=[np.number])
correlation = numeric_cols.corr()

plt.figure(figsize=(8, 6))
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0)
plt.title('Feature Correlation')
plt.show()

In [None]:
# Step 6: Group analysis
print("\nStatistics by species:")
print(iris.groupby('species').agg(['mean', 'std']))

In [None]:
# Step 7: Relationships between features
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

sns.scatterplot(data=iris, x='sepal_length', y='sepal_width',
                hue='species', ax=axes[0])
axes[0].set_title('Sepal Length vs Width')

sns.scatterplot(data=iris, x='petal_length', y='petal_width',
                hue='species', ax=axes[1])
axes[1].set_title('Petal Length vs Width')

plt.tight_layout()
plt.show()

---
# 7. PRACTICE PROBLEMS

## Problem 1: Sales Data Analysis

In [None]:
# Build a reproducible synthetic sales table
np.random.seed(42)
n_records = 1000
dates = pd.date_range('2023-01-01', periods=365)
products = ['A', 'B', 'C']
regions = ['North', 'South', 'East', 'West']

# The columns are drawn one after another in a fixed order; the seeded global
# RNG is consumed sequentially, so this order determines the generated values.
sampled_dates = np.random.choice(dates, n_records)
sampled_products = np.random.choice(products, n_records)
sampled_regions = np.random.choice(regions, n_records)
quantities = np.random.randint(1, 50, n_records)  # upper bound is exclusive
prices = np.random.uniform(10, 100, n_records).round(2)

sales_data = pd.DataFrame({
    'Date': sampled_dates,
    'Product': sampled_products,
    'Region': sampled_regions,
    'Quantity': quantities,
    'Price': prices,
})
# Revenue per record = units sold times unit price
sales_data['Revenue'] = sales_data['Quantity'].mul(sales_data['Price'])

print("Sales data:")
print(sales_data.head())
print(f"\nShape: {sales_data.shape}")

In [None]:
# Analysis 1: Total revenue by product
revenue_by_product = sales_data.groupby('Product')['Revenue'].sum().sort_values(ascending=False)
print("Total Revenue by Product:")
print(revenue_by_product)

# Visualization
plt.figure(figsize=(8, 5))
revenue_by_product.plot(kind='bar', color='steelblue')
plt.title('Total Revenue by Product')
plt.ylabel('Revenue')
plt.xlabel('Product')
plt.xticks(rotation=0)
plt.show()

In [None]:
# Analysis 2: Monthly revenue trend
sales_data['Month'] = sales_data['Date'].dt.to_period('M')
monthly_revenue = sales_data.groupby('Month')['Revenue'].sum()

plt.figure(figsize=(12, 5))
monthly_revenue.plot(kind='line', marker='o')
plt.title('Monthly Revenue Trend')
plt.ylabel('Revenue')
plt.xlabel('Month')
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# Analysis 3: Revenue by product and region
pivot = pd.pivot_table(sales_data, values='Revenue',
                       index='Product', columns='Region', aggfunc='sum')
print("Revenue by Product and Region:")
print(pivot)

# Heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(pivot, annot=True, fmt='.0f', cmap='YlOrRd')
plt.title('Revenue Heatmap: Product vs Region')
plt.show()

## Problem 2: Data Cleaning Pipeline

In [None]:
# Create messy data
messy_data = pd.DataFrame({
    'Name': ['Alice', 'BOB', '  charlie  ', 'David', None, 'Eve'],
    'Age': [25, 30, 35, -5, 28, 150],
    'Email': ['alice@email.com', 'bob@', 'charlie@email.com', 'david@email.com', 'eve@email.com', 'eve@email.com'],
    'Salary': [50000, np.nan, 70000, 60000, 55000, 65000]
})

print("Messy data:")
print(messy_data)

# Clean the data
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of *df*; the input frame is left untouched.

    Steps, in order:
      1. Strip surrounding whitespace from ``Name`` and title-case it.
      2. Drop rows whose ``Name`` is missing.
      3. Replace ``Age`` values outside 0-120 (inclusive) with NaN.
      4. Add a boolean ``Valid_Email`` column: True when ``Email`` contains
         an '@' followed somewhere later by a '.'; missing emails are False.
      5. Fill missing ``Salary`` with the median salary of the remaining rows.

    Args:
        df: Frame with ``Name``, ``Age``, ``Email`` and ``Salary`` columns.

    Returns:
        A new DataFrame with the cleaned columns plus ``Valid_Email``.
    """
    df = df.copy()

    # 1. Clean names (missing names stay missing through the .str accessors)
    df['Name'] = df['Name'].str.strip().str.title()

    # 2. Handle missing names. The extra copy() makes the filtered frame
    # independent, so the column assignments below cannot trigger
    # SettingWithCopyWarning.
    df = df.dropna(subset=['Name']).copy()

    # 3. Fix invalid ages. where() keeps in-range values and puts NaN
    # elsewhere, upcasting to float only when needed; writing np.nan into an
    # integer column through .loc is a deprecated upcasting assignment.
    df['Age'] = df['Age'].where(df['Age'].between(0, 120))

    # 4. Validate email. Raw string: '\.' in a normal string literal is an
    # invalid escape sequence and warns on recent Python versions.
    df['Valid_Email'] = df['Email'].str.contains(r'@.*\.', na=False)

    # 5. Fill missing salary with median
    df['Salary'] = df['Salary'].fillna(df['Salary'].median())

    return df

clean_df = clean_data(messy_data)
print("\nCleaned data:")
print(clean_df)

---
## Summary

### Key Concepts:

**NumPy:**
- Array creation: zeros, ones, arange, linspace, random
- Indexing: slicing, boolean, fancy indexing
- Operations: element-wise, broadcasting, aggregations
- Linear algebra: dot, inv, eig

**Pandas:**
- Series and DataFrame creation
- Selection: loc, iloc, boolean indexing
- Data manipulation: merge, groupby, pivot_table
- Handling missing data: dropna, fillna
- Apply and transform functions

**Visualization:**
- Matplotlib: line, scatter, bar, histogram
- Subplots and customization
- Seaborn: distribution, categorical, relationship plots
- Heatmaps and pair plots

### Best Practices:

- Use vectorized operations instead of loops
- Always explore data before analysis
- Handle missing values appropriately
- Create meaningful visualizations
- Document your analysis

---

**Next Steps:**
1. Practice with real datasets (Kaggle, UCI)
2. Learn advanced Pandas (time series, multiindex)
3. Explore Plotly for interactive visualizations
4. Move on to Machine Learning with scikit-learn