In [None]:
# Setup: imports and display options
import pandas as pd
import numpy as np
import os

# Display settings: show more columns and wider rows, but cap printed rows
display_settings = {
    'display.max_columns': 50,
    'display.width': 120,
    'display.max_rows': 20,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

print("Setup complete!")
print("pandas version:", pd.__version__)

## 1. Load and Prepare Data

Let's recreate our cleaned, merged dataset:

In [None]:
# Read the two raw CSV extracts
media_df = pd.read_csv('../data/media_contacts.csv')
demo_df = pd.read_csv('../data/socio_demos.csv')

# Normalise headers on both frames: trim, lower-case, spaces -> underscores
for frame in (media_df, demo_df):
    frame.columns = frame.columns.str.strip().str.lower().str.replace(' ', '_')

for label, frame in (("Media", media_df), ("Demo", demo_df)):
    print(f"{label}: {frame.shape}")

In [None]:
# Merge datasets: keep only respondents present in both tables
merged_df = pd.merge(media_df, demo_df, on='person_id', how='inner')

# Basic cleaning
# Birthdays are stored as YYYYMMDD numbers. Missing / non-numeric entries are
# dropped before the integer cast (a plain .astype(int) raises on NaN); when the
# parsed dates are assigned back they align on the index, so those rows get NaT.
birthday_num = pd.to_numeric(merged_df['birthday'], errors='coerce').dropna()
merged_df['birthday_dt'] = pd.to_datetime(
    birthday_num.astype(int).astype(str),
    format='%Y%m%d',
    errors='coerce'
)

# Age is a calendar-year difference against a fixed reference year
REFERENCE_YEAR = 2025
merged_df['age'] = REFERENCE_YEAR - merged_df['birthday_dt'].dt.year

# Create age bands
# right=False makes every bin [lower, upper), which is what the labels say:
# 18 belongs to '18-24', 25 to '25-34', ... With the default right-closed bins
# each boundary age landed in the band below it. The open upper edge keeps
# ages of 100 and above in '65+'.
merged_df['age_band'] = pd.cut(
    merged_df['age'],
    bins=[0, 18, 25, 35, 45, 55, 65, np.inf],
    right=False,
    labels=['<18', '18-24', '25-34', '35-44', '45-54', '55-64', '65+']
)

print(f"Merged and cleaned: {merged_df.shape}")
merged_df.head()

## 2. Create Output Directory

Always create your output directory first:

In [None]:
# Make sure the export folder exists before anything is written into it
output_dir = '../outputs'
os.makedirs(output_dir, exist_ok=True)

absolute_output_dir = os.path.abspath(output_dir)
output_dir_present = os.path.exists(output_dir)
print(f"Output directory: {absolute_output_dir}")
print(f"Directory exists: {output_dir_present}")

## 3. Export to CSV

CSV is the most universal format - works everywhere.

### Basic CSV Export

In [None]:
# Write the full merged frame as a plain CSV (row index omitted)
csv_path = output_dir + '/merged_data.csv'
merged_df.to_csv(csv_path, index=False)

csv_size_mb = os.path.getsize(csv_path) / 1024 / 1024
print(f"Saved to: {csv_path}")
print(f"File size: {csv_size_mb:.2f} MB")

In [None]:
# Round-trip check: read the CSV back and compare columns / dtypes
verify_df = pd.read_csv(csv_path)

columns_match = list(merged_df.columns) == list(verify_df.columns)
dtype_matches = (merged_df.dtypes == verify_df.dtypes).sum()
dtype_total = len(merged_df.dtypes)

print(f"\nLoaded back: {verify_df.shape}")
print(f"Columns match: {columns_match}")
print(f"Data types preserved: {dtype_matches} / {dtype_total}")

### CSV with Compression

Compress CSVs to save space:

In [None]:
# Same CSV content, gzip-compressed on write
csv_gz_path = output_dir + '/merged_data.csv.gz'
merged_df.to_csv(csv_gz_path, index=False, compression='gzip')

# Measure both files once and reuse the byte counts below
plain_bytes = os.path.getsize(csv_path)
gzip_bytes = os.path.getsize(csv_gz_path)

print(f"Saved compressed CSV to: {csv_gz_path}")
print(f"Original CSV: {plain_bytes / 1024 / 1024:.2f} MB")
print(f"Compressed CSV: {gzip_bytes / 1024 / 1024:.2f} MB")
print(f"Compression ratio: {gzip_bytes / plain_bytes:.1%}")

In [None]:
# read_csv infers gzip from the .gz extension (compression='infer' is the default),
# so no extra argument is needed to read the compressed file back
verify_gz = pd.read_csv(csv_gz_path)

print(f"\nLoaded compressed CSV: {verify_gz.shape}")
print("✅ pandas automatically decompresses .csv.gz files!")

### CSV Encoding and Special Characters

In [None]:
# Write the same frame twice:
#   utf-8     -> default, best practice
#   utf-8-sig -> UTF-8 with BOM, for Excel compatibility
encoded_exports = (
    ('data_utf8.csv', 'utf-8'),
    ('data_utf8_bom.csv', 'utf-8-sig'),
)
for file_name, codec in encoded_exports:
    merged_df.to_csv(f'{output_dir}/{file_name}', index=False, encoding=codec)

print("Saved with different encodings:")
print("  - UTF-8: Best for most uses")
print("  - UTF-8-BOM: Best for opening in Excel")

## 4. Export to Parquet

Parquet is a columnar format: faster to read, smaller size, preserves dtypes.

### When to Use Parquet:
✅ Large datasets (>100MB)  
✅ Internal pipelines (Python to Python)  
✅ Need to preserve exact dtypes  
✅ Speed is important  

### When to Use CSV:
✅ Small datasets  
✅ Sharing with non-technical users  
✅ Need to open in Excel  
✅ Maximum compatibility  

In [None]:
# Write the merged frame as snappy-compressed Parquet
parquet_path = output_dir + '/merged_data.parquet'
merged_df.to_parquet(parquet_path, index=False, compression='snappy')

# On-disk sizes (MB) of the three exports written so far
csv_mb = os.path.getsize(csv_path) / 1024 / 1024
csv_gz_mb = os.path.getsize(csv_gz_path) / 1024 / 1024
parquet_mb = os.path.getsize(parquet_path) / 1024 / 1024

print(f"Saved to Parquet: {parquet_path}")
print("\nFile size comparison:")
print(f"  CSV:         {csv_mb:.2f} MB")
print(f"  CSV.GZ:      {csv_gz_mb:.2f} MB")
print(f"  Parquet:     {parquet_mb:.2f} MB")

In [None]:
# Read the Parquet file back and compare dtypes with the in-memory frame
verify_parquet = pd.read_parquet(parquet_path)
dtypes_identical = (merged_df.dtypes == verify_parquet.dtypes).all()

print(f"\nLoaded from Parquet: {verify_parquet.shape}")
print("\nData types preserved:")
print(f"Original dtypes == Parquet dtypes: {dtypes_identical}")

# Spot-check the datetime column specifically
original_birthday_dtype = merged_df['birthday_dt'].dtype
parquet_birthday_dtype = verify_parquet['birthday_dt'].dtype
print("\nbirthday_dt preserved as datetime:")
print(f"  Original: {original_birthday_dtype}")
print(f"  Parquet:  {parquet_birthday_dtype}")

In [None]:
# Write one Parquet file per codec and report each file's size
compression_types = ['snappy', 'gzip', 'brotli']

for comp in compression_types:
    path = output_dir + '/merged_' + comp + '.parquet'
    merged_df.to_parquet(path, index=False, compression=comp)
    size_mb = os.path.getsize(path) / 1024 / 1024
    print(f"{comp:10s}: {size_mb:.2f} MB")

print("\nRecommendation: Use 'snappy' for balanced speed/compression")

## 5. Export Subsets of Data

Often you need to save filtered data:

In [None]:
# Split respondents on the gender column and write one CSV per group
gender_values = merged_df['gender']
males_df = merged_df[gender_values == 'Male']
females_df = merged_df[gender_values == 'Female']

for frame, file_name in ((males_df, 'males.csv'), (females_df, 'females.csv')):
    frame.to_csv(f'{output_dir}/{file_name}', index=False)

print("Saved gender subsets:")
print(f"  Males:   {len(males_df):,} rows")
print(f"  Females: {len(females_df):,} rows")

In [None]:
# Export by age band: one CSV per category of the age_band column
for age_band in merged_df['age_band'].cat.categories:
    subset = merged_df[merged_df['age_band'] == age_band]
    # Make only the label filename-safe. Running .replace() over the whole path
    # would also rewrite any '<' or '+' that is part of output_dir itself.
    safe_label = age_band.replace('<', 'under').replace('+', 'plus')
    filename = f'{output_dir}/age_{safe_label}.csv'
    subset.to_csv(filename, index=False)
    print(f"Saved {age_band:8s}: {len(subset):,} rows -> {filename}")

In [None]:
# Keep rows where the purchase flag equals 1 and write them out
is_purchaser = merged_df['purchase'] == 1
purchasers_df = merged_df[is_purchaser]
purchasers_df.to_csv(output_dir + '/purchasers_only.csv', index=False)

purchaser_count = len(purchasers_df)
print(f"\nPurchasers only: {purchaser_count:,} rows")
print(f"Purchase rate: {purchaser_count / len(merged_df):.2%}")

## 6. Export Summary Tables

Save aggregated analysis results:

In [None]:
# Named aggregations: output column -> (source column, aggregation)
summary_aggregations = {
    'sample_size': ('person_id', 'count'),
    'purchases': ('purchase', 'sum'),
    'purchase_rate': ('purchase', 'mean'),
    'avg_tv': ('tv_total', 'mean'),
    'avg_online': ('online_total', 'mean'),
    'avg_print': ('print_total', 'mean'),
}

# One row per observed (gender, age_band) pair, rounded to 4 decimals
summary_table = (
    merged_df
    .groupby(['gender', 'age_band'], observed=True)
    .agg(**summary_aggregations)
    .round(4)
)

print("Summary table:")
print(summary_table.head(10))

In [None]:
# Write the summary with its (gender, age_band) index as leading columns
summary_csv_path = output_dir + '/purchase_summary.csv'
summary_table.to_csv(summary_csv_path)

print("Saved summary table with index")
print("Rows:", len(summary_table))

In [None]:
# Turn the group keys into ordinary columns, then save without an index
summary_table_reset = summary_table.reset_index()
flat_summary_path = output_dir + '/purchase_summary_flat.csv'
summary_table_reset.to_csv(flat_summary_path, index=False)

print("\nSaved flattened summary table (no index)")
print(summary_table_reset.head())

## 7. Export to Excel

Export to Excel with multiple sheets:

In [None]:
# Write the whole merged frame to a workbook with a single 'Data' sheet
excel_path = output_dir + '/merged_data.xlsx'
merged_df.to_excel(excel_path, index=False, sheet_name='Data')

excel_size_mb = os.path.getsize(excel_path) / 1024 / 1024
print(f"Saved to Excel: {excel_path}")
print(f"File size: {excel_size_mb:.2f} MB")

In [None]:
# Build the two breakdown tables first, then write all four sheets in one pass

# Gender breakdown
gender_summary = merged_df.groupby('gender').agg({
    'person_id': 'count',
    'purchase': ['sum', 'mean'],
    'tv_total': 'mean'
})

# Age breakdown
age_summary = merged_df.groupby('age_band', observed=True).agg({
    'person_id': 'count',
    'purchase': 'mean'
})

# (sheet name, frame, write the index?) in workbook order
report_sheets = [
    ('Sample Data', merged_df.head(1000), False),
    ('Purchase Summary', summary_table_reset, False),
    ('Gender Summary', gender_summary, True),
    ('Age Summary', age_summary, True),
]

with pd.ExcelWriter(f'{output_dir}/analysis_report.xlsx', engine='openpyxl') as writer:
    for sheet_name, sheet_frame, keep_index in report_sheets:
        sheet_frame.to_excel(writer, sheet_name=sheet_name, index=keep_index)

print("\nSaved multi-sheet Excel workbook:")
print("  - Sample Data (first 1000 rows)")
print("  - Purchase Summary")
print("  - Gender Summary")
print("  - Age Summary")

## 8. Export Selected Columns

Save only the columns you need:

In [None]:
# Columns worth sharing; everything else is left out of the export
key_columns = [
    'person_id', 'gender', 'age', 'age_band',
    'tv_total', 'online_total', 'print_total',
    'purchase', 'weight'
]

minimal_df = merged_df.loc[:, key_columns]

minimal_path = f'{output_dir}/minimal_dataset.csv'
minimal_df.to_csv(minimal_path, index=False)

full_size_mb = os.path.getsize(csv_path) / 1024 / 1024
minimal_size_mb = os.path.getsize(minimal_path) / 1024 / 1024

print("Saved minimal dataset:")
print(f"  Original columns: {len(merged_df.columns)}")
print(f"  Minimal columns:  {len(minimal_df.columns)}")
print(f"  Original size: {full_size_mb:.2f} MB")
print(f"  Minimal size:  {minimal_size_mb:.2f} MB")

## 9. Format Selection Guide

Choose the right format for your use case:

| Format | Best For | Pros | Cons |
|--------|----------|------|------|
| **CSV** | Sharing, Excel, small files | Universal, human-readable | Large size, loses dtypes |
| **CSV.GZ** | Archival, medium files | Compressed, universal | Slower to read |
| **Parquet** | Pipelines, large files, Python | Fast, small, preserves types | Not human-readable |
| **Excel** | Reports, business users | Multiple sheets, formatting | Slow, size limits |
| **Pickle** | Python only, exact state | Preserves everything | Python-specific, security risk |

In [None]:
# Demonstrate format comparison: write and re-read the same sample in each
# format, reporting file size and elapsed write / read time.
import time

# format name -> (target path, writer taking (df, path))
formats = {
    'CSV': (f'{output_dir}/format_test.csv', lambda df, path: df.to_csv(path, index=False)),
    'CSV.GZ': (f'{output_dir}/format_test.csv.gz', lambda df, path: df.to_csv(path, index=False, compression='gzip')),
    'Parquet': (f'{output_dir}/format_test.parquet', lambda df, path: df.to_parquet(path, index=False)),
    'Excel': (f'{output_dir}/format_test.xlsx', lambda df, path: df.to_excel(path, index=False)),
    'Pickle': (f'{output_dir}/format_test.pkl', lambda df, path: df.to_pickle(path))
}

# format name -> matching pandas reader (replaces an if/elif chain)
# NOTE: read_pickle is only used on the file written a moment ago by this cell;
# never unpickle files from an untrusted source.
readers = {
    'CSV': pd.read_csv,
    'CSV.GZ': pd.read_csv,
    'Parquet': pd.read_parquet,
    'Excel': pd.read_excel,
    'Pickle': pd.read_pickle,
}

test_df = merged_df.head(5000)  # Use subset for speed

# Report the real row count: head(5000) returns fewer rows on a smaller frame
print(f"Format comparison ({len(test_df):,} rows):\n")
print(f"{'Format':<12} {'Size (MB)':<12} {'Write Time':<15} {'Read Time'}")
print("-" * 60)

for format_name, (path, save_func) in formats.items():
    # Write (perf_counter is a monotonic, high-resolution clock meant for
    # measuring short durations; time.time() can jump with system clock changes)
    start = time.perf_counter()
    save_func(test_df, path)
    write_time = time.perf_counter() - start

    # Get size
    size_mb = os.path.getsize(path) / 1024 / 1024

    # Read
    start = time.perf_counter()
    _ = readers[format_name](path)
    read_time = time.perf_counter() - start

    print(f"{format_name:<12} {size_mb:<12.2f} {write_time:<15.3f} {read_time:.3f}s")

## 10. Best Practices

### File Naming

In [None]:
# Stamp the export with today's date (YYYYMMDD) so reruns on other days
# produce separate files
from datetime import datetime

date_str = f'{datetime.now():%Y%m%d}'
dated_path = output_dir + '/merged_data_' + date_str + '.csv'

merged_df.to_csv(dated_path, index=False)
print(f"Saved with date: {dated_path}")

In [None]:
# Encode version, filter and date in the filename
version = 'v2'
filter_type = 'all_ages'
metric_path = '_'.join([f'{output_dir}/analysis', version, filter_type, date_str]) + '.csv'

summary_table_reset.to_csv(metric_path, index=False)
print(f"Saved with metadata: {metric_path}")

### Documentation

In [None]:
# Data dictionary: one row per column of merged_df with dtype, null counts,
# distinct-value count and the first row's value as a sample
has_rows = len(merged_df) > 0
sample_values = [
    str(merged_df[col].iloc[0]) if has_rows else ''
    for col in merged_df.columns
]

data_dict = pd.DataFrame({
    'column_name': merged_df.columns,
    'dtype': merged_df.dtypes.astype(str),
    'non_null_count': merged_df.notna().sum(),
    'null_count': merged_df.isna().sum(),
    'unique_values': merged_df.nunique(),
    'sample_value': sample_values,
})

data_dict.to_csv(f'{output_dir}/data_dictionary.csv', index=False)

print("Saved data dictionary:")
print(data_dict.head(10))

In [None]:
# Create README file: a short Markdown-style description of this export,
# stamped with the export time and the merged frame's row / column counts
readme_content = f"""# Data Export Summary

**Export Date**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
**Source Data**: media_contacts.csv, socio_demos.csv
**Total Rows**: {len(merged_df):,}
**Total Columns**: {len(merged_df.columns)}

## Files Included

1. **merged_data.csv** - Full merged dataset
2. **merged_data.parquet** - Parquet format (faster)
3. **merged_data.csv.gz** - Compressed CSV
4. **males.csv** - Male respondents only
5. **females.csv** - Female respondents only
6. **purchase_summary.csv** - Aggregated purchase statistics
7. **data_dictionary.csv** - Column descriptions

## Column Descriptions

- person_id: Unique identifier
- gender: Male/Female
- age: Age in years
- age_band: Age category
- tv_total: Total TV exposure minutes
- online_total: Total online exposure minutes
- print_total: Total print exposure minutes
- purchase: Purchase indicator (0/1)
- weight: Survey weight

## Notes

- Data cleaned and validated
- Missing values handled
- Categorical variables created
- Ready for analysis
"""

# Explicit encoding: without it open() uses the platform's locale encoding,
# so the same notebook could write different bytes on different machines
with open(f'{output_dir}/README.txt', 'w', encoding='utf-8') as f:
    f.write(readme_content)

print("Saved README.txt with documentation")

## 11. Reproducible Pipeline

Create a complete save/load pipeline:

In [None]:
# Complete export function
def export_analysis_results(df, output_dir='../outputs', prefix='analysis'):
    """
    Export a DataFrame in several formats together with basic documentation.

    All files share the stem ``{output_dir}/{prefix}_{YYYYMMDD_HHMMSS}``:

    - ``<stem>.csv``             plain CSV, no index
    - ``<stem>.csv.gz``          gzip-compressed CSV, no index
    - ``<stem>.parquet``         Parquet, no index
    - ``<stem>_dictionary.csv``  per-column dtype, non-null count, distinct count
    - ``<stem>_summary.csv``     row count, column count, in-memory size in MB

    Parameters:
    -----------
    df : pd.DataFrame
        Data to export
    output_dir : str
        Output directory path (created if it does not exist)
    prefix : str
        Filename prefix

    Returns:
    --------
    str
        The shared path stem (without extension) of the files written.
    """
    # Imported here so the function does not depend on an earlier notebook
    # cell having already imported datetime.
    from datetime import datetime

    os.makedirs(output_dir, exist_ok=True)
    date_str = datetime.now().strftime('%Y%m%d_%H%M%S')

    # Export in multiple formats
    base_path = f'{output_dir}/{prefix}_{date_str}'

    # CSV
    df.to_csv(f'{base_path}.csv', index=False)
    print(f"✅ Saved CSV: {base_path}.csv")

    # Compressed CSV
    df.to_csv(f'{base_path}.csv.gz', index=False, compression='gzip')
    print(f"✅ Saved CSV.GZ: {base_path}.csv.gz")

    # Parquet
    df.to_parquet(f'{base_path}.parquet', index=False)
    print(f"✅ Saved Parquet: {base_path}.parquet")

    # Data dictionary
    # Every column is passed as a plain positional array so the rows line up
    # by position rather than through implicit Series index alignment.
    data_dict = pd.DataFrame({
        'column': list(df.columns),
        'dtype': df.dtypes.astype(str).to_numpy(),
        'non_null': df.notna().sum().to_numpy(),
        'unique': df.nunique().to_numpy(),
    })
    data_dict.to_csv(f'{base_path}_dictionary.csv', index=False)
    print(f"✅ Saved data dictionary: {base_path}_dictionary.csv")

    # Summary stats
    memory_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
    summary = pd.DataFrame({
        'metric': ['rows', 'columns', 'memory_mb'],
        'value': [len(df), len(df.columns), memory_mb]
    })
    summary.to_csv(f'{base_path}_summary.csv', index=False)
    print(f"✅ Saved summary: {base_path}_summary.csv")

    print(f"\n🎉 Export complete! Files saved to: {output_dir}")
    return base_path

# Run the export on the merged frame; keep the returned path stem
export_path = export_analysis_results(
    merged_df,
    output_dir=output_dir,
    prefix='merged_data',
)

## 12. Loading Data Back

Best practices for loading saved data:

In [None]:
# Plain load: pandas infers every dtype
merged_csv_file = f'{output_dir}/merged_data.csv'
loaded_csv = pd.read_csv(merged_csv_file)

# Typed load: pin selected dtypes up front and parse the date column
dtype_spec = dict(
    person_id='int64',
    gender='category',
    purchase='int8',
)

loaded_csv_typed = pd.read_csv(
    merged_csv_file,
    dtype=dtype_spec,
    parse_dates=['birthday_dt'],
)

print("Loaded CSV with specified dtypes:")
print(loaded_csv_typed.dtypes)

In [None]:
# Parquet stores the schema, so no dtype hints are needed when reading
loaded_parquet = pd.read_parquet(output_dir + '/merged_data.parquet')
matches_original = loaded_parquet.equals(merged_df)

print("\nLoaded Parquet (dtypes auto-preserved):")
print(loaded_parquet.dtypes)
print(f"\nData identical: {matches_original}")

## Summary

In this notebook, you learned:

✅ Export to CSV with compression  
✅ Save to Parquet for performance  
✅ Choose the right file format  
✅ Export to Excel with multiple sheets  
✅ Save subsets and filtered data  
✅ Create data dictionaries and documentation  
✅ Build reproducible export pipelines  
✅ Follow best practices for file naming  
✅ Load data back efficiently  

### Key Takeaways

1. **CSV for sharing**, Parquet for pipelines
2. **Compress large CSVs** with gzip
3. **Document your exports** with data dictionaries
4. **Use date stamps** in filenames
5. **Preserve dtypes** with Parquet or dtype specs
6. **Test your exports** by loading them back
7. **Create reusable functions** for consistent exports

### Congratulations! 🎉

You've completed all 7 notebooks and learned:

1. **Data Loading** - Read CSVs, inspect DataFrames
2. **Selection & Indexing** - Filter and subset data
3. **Cleaning & Transformations** - Handle messy data
4. **Merging & Joining** - Combine datasets
5. **GroupBy & Aggregation** - Summarize data
6. **Reshaping & Pivoting** - Change data structure
7. **Exporting & Saving** - Save results

You now have the skills to:
- Load and clean real-world data
- Transform and merge multiple datasets
- Perform complex aggregations
- Reshape data for analysis
- Export results professionally

**Next Steps**: Apply these skills to your own data projects! 🚀

## 🎯 Practice Exercises

Try these on your own:

1. Export only purchasers to a compressed CSV
2. Save gender summary stats to Excel with formatting
3. Create separate Parquet files for each age band
4. Export top 10 most active users (by total media exposure) to CSV
5. Create a multi-sheet Excel report with data + 3 summary tables
6. Save a data dictionary for the minimal_dataset
7. Create a function that exports a DataFrame with automatic compression based on size
8. Load a CSV and validate it matches the original DataFrame

### Bonus Challenges

9. Create a complete export pipeline that saves data, summaries, and visualizations
10. Write a function that compares file sizes across all formats and recommends the best one
11. Create dated backup files (keep last 7 days, delete older)
12. Export data with custom column order and subset of rows based on a filter

### Ultimate Challenge

Build a complete data processing script that:
1. Loads raw data
2. Cleans and transforms it
3. Creates multiple analyses
4. Exports everything to organized folders
5. Creates a comprehensive README
6. Includes error handling and logging

## Loading/Saving Data Between Notebooks

### Load Previously Processed Data

```python
# Uncomment to load from previous notebooks
# merged_df = pd.read_csv('../outputs/merged_data.csv')
# 
# # Or load from Parquet (faster, preserves types)
# merged_df = pd.read_parquet('../outputs/merged_data.parquet')
#
# # Load summary tables
# summary_df = pd.read_csv('../outputs/purchase_summary_flat.csv')
#
# print(f"Loaded data: {merged_df.shape}")
```

### Final Export

```python
# This notebook IS about exporting!
# All export code is in the cells above.
# 
# Recommended final exports:
# 1. Full merged dataset -> Parquet (fast access)
# 2. Full merged dataset -> CSV.GZ (universal backup)
# 3. Summary tables -> CSV (for reporting)
# 4. Analysis report -> Excel (for stakeholders)
# 5. Data dictionary -> CSV (for documentation)
```

### Archive Your Work

```python
# Uncomment to create a dated archive
# import shutil
# from datetime import datetime
# 
# date_str = datetime.now().strftime('%Y%m%d')
# archive_dir = f'../outputs/archive_{date_str}'
# 
# # Copy outputs to archive
# shutil.copytree(output_dir, archive_dir, dirs_exist_ok=True)
# 
# print(f"✅ Archived outputs to: {archive_dir}")
```