# 📊 Data Processing Template

This notebook demonstrates common data processing tasks:
- Loading CSV/Excel files
- Data cleaning and transformation
- Basic analysis
- Exporting processed data

Customize this template for your own data processing needs!

## 1. Setup and Imports

In [None]:
# Standard imports
import sys
from pathlib import Path

# Add the parent of the current working directory to sys.path so the `src` package can be imported
sys.path.insert(0, str(Path.cwd().parent))

# Data processing
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Our modules
from src.config import settings
from src.utils import get_timestamp, Timer

# Set the seaborn grid style and the default figure size for plots
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Confirmation that the import cell ran to completion; the check-mark emoji
# was previously stored mis-encoded and printed as garbage characters.
print("✅ Imports successful!")
print(f"Timestamp: {get_timestamp()}")

## 2. Create Sample Data

For demonstration, let's create some sample data. Replace this with loading your actual data.

In [None]:
# Build a reproducible 100-row demo frame. The global NumPy seed is fixed
# first, and the random columns are drawn in a fixed order (age, score,
# category, active) so every fresh run yields the same values.
np.random.seed(42)

sample_data = pd.DataFrame(
    dict(
        id=range(1, 101),
        name=list(map('User_{}'.format, range(1, 101))),
        age=np.random.randint(18, 70, size=100),
        score=np.round(np.random.normal(loc=75, scale=15, size=100), 1),
        category=np.random.choice(['A', 'B', 'C', 'D'], size=100),
        active=np.random.choice([True, False], size=100),
        created_at=pd.date_range(start='2024-01-01', periods=100, freq='D'),
    )
)

# Inject gaps so the cleaning section has something to repair.
# .loc slices are label-based and inclusive: six rows are blanked per column.
sample_data.loc[slice(5, 10), 'score'] = np.nan
sample_data.loc[slice(15, 20), 'category'] = np.nan

print(f"Created sample dataset with {len(sample_data)} rows")
sample_data.head()

## 3. Load Your Data

Uncomment and modify the code below to load your actual data.

In [None]:
# Pick ONE of the loaders below for real data and remove the demo assignment.

# Option 1: Load CSV file
# df = pd.read_csv(settings.RAW_DATA_DIR / 'your_file.csv')

# Option 2: Load Excel file
# df = pd.read_excel(settings.RAW_DATA_DIR / 'your_file.xlsx', sheet_name='Sheet1')

# Option 3: Load from URL
# df = pd.read_csv('https://example.com/data.csv')

# For this demo, use sample data (copied so later cells never touch sample_data itself)
df = sample_data.copy()

# The check-mark emoji was previously stored mis-encoded and printed as garbage characters.
print(f"✅ Data loaded: {len(df)} rows, {len(df.columns)} columns")

## 4. Explore Data

In [None]:
# Show first few rows using the notebook's rich table rendering
print("First 5 rows:")
display(df.head())

# Basic info
print("\nDataset Info:")
print(f"  Rows: {len(df)}")
print(f"  Columns: {len(df.columns)}")
# deep=True makes pandas measure the actual contents of object (string)
# columns; without it only the per-row pointers are counted and the reported
# size is an underestimate for text-heavy data.
print(f"  Memory: {df.memory_usage(deep=True).sum() / 1024:.1f} KB")

In [None]:
# Per-column overview: dtype, populated count, missing count and missing share
print("Column Information:")

# Count the missing cells once and reuse the result for both null columns
null_counts = df.isnull().sum()
info_df = pd.DataFrame({
    'Type': df.dtypes,
    'Non-Null': df.count(),
    'Null': null_counts,
    'Null %': (null_counts / len(df) * 100).round(1),
})
display(info_df)

In [None]:
# Descriptive statistics for the columns that describe() selects by default
print("Statistical Summary:")
summary_stats = df.describe()
display(summary_stats)

## 5. Data Cleaning

In [None]:
# Make a copy for cleaning so the loaded frame `df` stays untouched
df_clean = df.copy()

print("Before cleaning:")
print(f"  Rows: {len(df_clean)}")
print(f"  Missing values: {df_clean.isnull().sum().sum()}")

# Handle missing values
# NOTE: results are assigned back to the column instead of calling
# `df_clean[col].fillna(..., inplace=True)`. The in-place form operates on an
# intermediate Series (chained assignment): it emits a FutureWarning on
# pandas 2.x and does not modify df_clean at all under Copy-on-Write.

# Option 1: Fill with mean/median
df_clean['score'] = df_clean['score'].fillna(df_clean['score'].median())

# Option 2: Fill with mode (most common value)
# mode() returns an empty Series when every value is missing, so only fill
# when there is a mode to fill with.
category_modes = df_clean['category'].mode()
if not category_modes.empty:
    df_clean['category'] = df_clean['category'].fillna(category_modes.iloc[0])

# Option 3: Drop rows with any missing values
# df_clean = df_clean.dropna()

print("\nAfter cleaning:")
print(f"  Rows: {len(df_clean)}")
print(f"  Missing values: {df_clean.isnull().sum().sum()}")

## 6. Data Transformation

In [None]:
# Add new columns

# Score bands. The outer edges are open-ended so that scores at or below 0 or
# above 100 (the sample data is drawn from an unbounded normal distribution)
# still land in 'Low' / 'Excellent' instead of silently becoming NaN.
df_clean['score_category'] = pd.cut(
    df_clean['score'],
    bins=[-np.inf, 60, 75, 90, np.inf],
    labels=['Low', 'Medium', 'High', 'Excellent'],
)

# Age bands. include_lowest keeps age 0 in the first band and the open upper
# edge keeps ages above 100 in 'Senior'.
df_clean['age_group'] = pd.cut(
    df_clean['age'],
    bins=[0, 25, 40, 60, np.inf],
    labels=['Young', 'Adult', 'Middle-aged', 'Senior'],
    include_lowest=True,
)

# Calculate derived fields
# Min-max scaling of score. When every score is identical the range is 0 and
# the division would give 0/0 = NaN, so the (all-zero) offset is used as is.
score_min = df_clean['score'].min()
score_range = df_clean['score'].max() - score_min
score_offset = df_clean['score'] - score_min
df_clean['score_normalized'] = score_offset / score_range if score_range != 0 else score_offset

# The check-mark emoji was previously stored mis-encoded and printed as garbage characters.
print("✅ New columns added:")
# display() renders the frame as a rich table, matching the earlier preview cells
display(df_clean[['score', 'score_category', 'score_normalized', 'age', 'age_group']].head())

## 7. Analysis & Visualization

In [None]:
# Per-category mean, standard deviation and row count of score, rounded to 2 decimals
print("Average score by category:")
category_stats = (
    df_clean
    .groupby('category')['score']
    .agg(['mean', 'std', 'count'])
    .round(2)
)
display(category_stats)

In [None]:
# Visualization 1: side-by-side histograms of score and age
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
score_ax, age_ax = axes

# Left panel: score histogram with a dashed red line marking the mean
score_ax.hist(df_clean['score'], bins=20, edgecolor='black', alpha=0.7)
score_ax.axvline(df_clean['score'].mean(), color='red', linestyle='--', label='Mean')
score_ax.set_title('Score Distribution', fontsize=14, fontweight='bold')
score_ax.set(xlabel='Score', ylabel='Frequency')
score_ax.legend()

# Right panel: age histogram
age_ax.hist(df_clean['age'], bins=15, edgecolor='black', alpha=0.7, color='green')
age_ax.set_title('Age Distribution', fontsize=14, fontweight='bold')
age_ax.set(xlabel='Age', ylabel='Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Visualization 2: score spread per category next to category frequencies
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
box_ax, bar_ax = axes

# Left panel: box plot of score grouped by category.
# The grouped boxplot adds its own figure-level title; it is blanked out and
# the axes title/labels are set explicitly afterwards.
df_clean.boxplot(column='score', by='category', ax=box_ax)
fig.suptitle('')
box_ax.set_title('Score by Category (Box Plot)', fontsize=14, fontweight='bold')
box_ax.set(xlabel='Category', ylabel='Score')

# Right panel: number of rows per category
category_counts = df_clean['category'].value_counts()
category_counts.plot.bar(ax=bar_ax, color='skyblue', edgecolor='black')
bar_ax.set_title('Category Distribution', fontsize=14, fontweight='bold')
bar_ax.set(xlabel='Category', ylabel='Count')
bar_ax.tick_params(axis='x', rotation=0)  # keep the category labels horizontal

plt.tight_layout()
plt.show()

## 8. Save Processed Data

In [None]:
# Ensure output directory exists (creates missing parents, no error if already present)
settings.PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

# Save to CSV
# ':' and ' ' in the timestamp are swapped for '-' and '_' before it is used in the file name
output_file = settings.PROCESSED_DATA_DIR / f'processed_data_{get_timestamp().replace(":", "-").replace(" ", "_")}.csv'
df_clean.to_csv(output_file, index=False)

# The check-mark emoji was previously stored mis-encoded and printed as garbage characters.
print(f"✅ Data saved to: {output_file}")
print(f"   Rows: {len(df_clean)}")
print(f"   Columns: {len(df_clean.columns)}")
print(f"   Size: {output_file.stat().st_size / 1024:.1f} KB")

## 9. Summary Report

In [None]:
# Final recap of the run: input vs. processed frame sizes, a few headline
# statistics and the location of the exported file.
# The emoji and bullet characters were previously stored mis-encoded and
# printed as garbage characters; they are restored here.
separator = "=" * 60

# Share of active rows; guarded so an empty frame reports 0.0% instead of
# dividing by zero.
active_count = df_clean['active'].sum()
active_pct = active_count / len(df_clean) * 100 if len(df_clean) else 0.0

print(separator)
print("📊 DATA PROCESSING SUMMARY")
print(separator)
print(f"\nTimestamp: {get_timestamp()}")
print("\nOriginal Data:")
print(f"  • Rows: {len(df)}")
print(f"  • Columns: {len(df.columns)}")
print(f"  • Missing values: {df.isnull().sum().sum()}")
print("\nProcessed Data:")
print(f"  • Rows: {len(df_clean)}")
print(f"  • Columns: {len(df_clean.columns)}")
print(f"  • Missing values: {df_clean.isnull().sum().sum()}")
print("\nKey Statistics:")
print(f"  • Average score: {df_clean['score'].mean():.2f}")
print(f"  • Average age: {df_clean['age'].mean():.1f}")
print(f"  • Active users: {active_count} ({active_pct:.1f}%)")
print(f"\nOutput: {output_file}")
print("\n" + separator)
print("✅ Processing complete!")
print(separator)

## 💡 Next Steps

1. **Replace sample data** with your actual dataset
2. **Customize cleaning** logic for your needs
3. **Add analysis** specific to your domain
4. **Create visualizations** that tell your story
5. **Save results** in formats you need (CSV, Excel, JSON)

Happy data processing! 🎉