In [None]:
# Setup: imports and display options
import pandas as pd
import numpy as np
from datetime import datetime

# Display limits applied to every DataFrame rendered in this notebook
for option_name, option_value in (
    ("display.max_columns", 50),
    ("display.width", 120),
    ("display.max_rows", 20),
):
    pd.set_option(option_name, option_value)

# Confirm the environment so readers can compare versions
print("Setup complete!")
print(f"pandas version: {pd.__version__}")

## 1. Load Fresh Raw Data

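We'll load the original raw data so that every cleaning step in this notebook starts from a known state.

**Note**: You could instead load the data saved by the previous notebook (see the commented-out code in the "Loading/Saving Data Between Notebooks" section at the end).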

In [None]:
# Load raw data
media_df = pd.read_csv('../data/media_contacts.csv')
demo_df = pd.read_csv('../data/socio_demos.csv')

print(f"Media: {media_df.shape}")
print(f"Demo: {demo_df.shape}")

In [None]:
# Quick inspection
print("Media columns:")
print(media_df.dtypes)

In [None]:
print("\nDemo columns:")
print(demo_df.dtypes)

## 2. Parsing Dates from BIRTHDAY

The `BIRTHDAY` column is stored as a float in YYYYMMDD format (e.g., 19971001.0). We need to convert it to datetime.

### Understanding the Format

In [None]:
# Look at raw birthday values
print("Sample BIRTHDAY values:")
print(demo_df['BIRTHDAY'].head(10))
print(f"\nData type: {demo_df['BIRTHDAY'].dtype}")

### Method 1: Convert to String then Parse

In [None]:
# Convert the float-encoded YYYYMMDD values to datetime.
# A float column cannot be cast with .astype(int) while it contains NaN, so
# missing birthdays are masked out and come back as NaT instead of raising.
birthday_num = pd.to_numeric(demo_df['BIRTHDAY'], errors='coerce')
birthday_str = (
    birthday_num.fillna(0)          # placeholder so the integer cast cannot fail
    .astype('int64')                # 19971001.0 -> 19971001 (drops the decimal)
    .astype(str)                    # 19971001   -> '19971001'
    .where(birthday_num.notna())    # put the missing entries back
)

# errors='coerce' turns values that are not a valid calendar date into NaT
demo_df['birthday_dt'] = pd.to_datetime(birthday_str, format='%Y%m%d', errors='coerce')

print("Converted birthdays:")
print(demo_df[['BIRTHDAY', 'birthday_dt']].head(10))
print(f"\nNew data type: {demo_df['birthday_dt'].dtype}")
print(f"Birthdays that could not be parsed: {demo_df['birthday_dt'].isna().sum()}")

### Extracting Date Components

Once we have datetime, we can extract useful information:

In [None]:
# Pull the calendar parts out of the parsed birthday through the .dt accessor
birthday_parts = demo_df['birthday_dt'].dt
demo_df['birth_year'] = birthday_parts.year
demo_df['birth_month'] = birthday_parts.month

# Age is a plain year difference relative to 2025; month and day are ignored
current_year = 2025
birth_years = demo_df['birth_year']
demo_df['age'] = current_year - birth_years

# Summary statistics of the derived age column
print("Age distribution:")
print(demo_df['age'].describe())

In [None]:
# View results
demo_df[['BIRTHDAY', 'birthday_dt', 'birth_year', 'age']].head(10)

## 3. Cleaning "Number_of children"

This column has inconsistent values. Let's normalize it to a clean integer.

In [None]:
# Inspect current values
print("Unique values in Number_of children:")
print(demo_df['Number_of children'].value_counts())

### Standardizing the Values

We need to extract the number and handle inconsistent formatting:

In [None]:
# Create a mapping function
def parse_num_children(value):
    """Turn a free-text "number of children" entry into an integer count.

    Missing values and entries that contain no digits are counted as 0.
    """
    import re

    if pd.isna(value):
        return 0

    # Normalise case and surrounding whitespace before any comparison
    normalized = str(value).lower().strip()

    # Explicitly handled spellings, keyed by their normalised form
    known_spellings = {
        '0': 0, 'no children': 0,
        '1 child': 1, '1child': 1,
        '2 children': 2, '2children': 2,
        '3 children': 3, '3children': 3, '3+': 3,
    }
    if normalized in known_spellings:
        return known_spellings[normalized]

    # Otherwise use the first run of digits, or 0 when there is none
    found = re.search(r'\d+', normalized)
    return int(found.group()) if found else 0

# Apply the function
demo_df['num_children_clean'] = demo_df['Number_of children'].apply(parse_num_children)

print("\nCleaned distribution:")
print(demo_df['num_children_clean'].value_counts().sort_index())

In [None]:
# Compare original vs cleaned
demo_df[['Number_of children', 'num_children_clean']].head(15)

### Alternative: Using `.replace()` with a Dictionary

In [None]:
# Another approach - direct mapping
# Keys are lower-case because the raw values are normalised before the lookup
children_map = {
    '0': 0,
    'no children': 0,
    '1 child': 1,
    '2 children': 2,
    '3 children': 3,
    # Add more as needed
}

# Normalise case and whitespace so '2 Children' and '2 children' hit the same key
children_normalized = demo_df['Number_of children'].astype(str).str.lower().str.strip()

# .replace() leaves unmapped values untouched (it does NOT turn them into NaN),
# so coerce whatever is still non-numeric to NaN first, then fill those with 0
demo_df['num_children_v2'] = (
    pd.to_numeric(children_normalized.replace(children_map), errors='coerce')
    .fillna(0)
    .astype(int)
)

print(demo_df['num_children_v2'].value_counts().sort_index())

## 4. Normalizing "People_in_Household"

This column has variations like "1 -HH (female)", "1-HH (male)", "2-HH", etc.

Let's extract just the household size number:

In [None]:
# Inspect current values
print("Unique household values:")
print(demo_df['People_in_Household'].value_counts())

In [None]:
# Extract the first run of digits from each entry.
# to_numeric + the nullable 'Int64' dtype keeps rows that are missing or contain
# no digits as <NA>; a plain .astype(int) would raise on those rows.
household_digits = demo_df['People_in_Household'].str.extract(r'(\d+)', expand=False)
demo_df['household_size_clean'] = pd.to_numeric(household_digits, errors='coerce').astype('Int64')

print("\nCleaned household sizes:")
print(demo_df['household_size_clean'].value_counts().sort_index())

In [None]:
# Compare
demo_df[['People_in_Household', 'household_size_clean']].head(15)

### Creating Household Type Categories

We can also extract the household type (male/female/general):

In [None]:
# Extract household type
def get_household_type(value):
    """Classify a household description as single_female, single_male or multi_person."""
    text = str(value).lower()
    # 'female' has to be tested first: the substring 'male' also occurs inside 'female'
    for keyword, label in (('female', 'single_female'), ('male', 'single_male')):
        if keyword in text:
            return label
    # No gender marker found in the text
    return 'multi_person'

demo_df['household_type'] = demo_df['People_in_Household'].apply(get_household_type)

print("\nHousehold types:")
print(demo_df['household_type'].value_counts())

## 5. Handling Missing Values

Let's check for missing values in our media data:

In [None]:
# Check missing values
print("Missing values in media_df:")
print(media_df.isnull().sum())

In [None]:
# Percentage missing
missing_pct = (media_df.isnull().sum() / len(media_df)) * 100
print("\nPercentage missing:")
print(missing_pct[missing_pct > 0])

### Filling Missing Values

Different strategies for different situations:

In [None]:
# For media columns, 0 makes sense (no exposure = 0)
# NOTE(review): 'Online Total' contains a space unlike the other names - confirm
# against the CSV header.
media_cols = ['TV_Total', 'FLYERS', 'Print_Total', 'Online_Video', 
              'Online_Display', 'Online Total', 'TikTok', 'Pinterest']

# Work only with the columns that exist, and report the ones that do not, so the
# fill loop and the verification below always look at the same set of columns
present_media_cols = [col for col in media_cols if col in media_df.columns]
absent_media_cols = [col for col in media_cols if col not in media_df.columns]
if absent_media_cols:
    print(f"Columns not found (skipped): {absent_media_cols}")

# Fill missing with 0
for col in present_media_cols:
    media_df[col] = media_df[col].fillna(0)

print("After filling:")
print(media_df[present_media_cols].isnull().sum())

### Other Fill Strategies

```python
# Fill with mean
df['column'] = df['column'].fillna(df['column'].mean())

# Fill with median (better for skewed data)
df['column'] = df['column'].fillna(df['column'].median())

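# Forward fill (carry last value forward)
# Note: fillna(method='ffill') is deprecated since pandas 2.1 - use .ffill() instead
df['column'] = df['column'].ffill()

# Backward fill (use .bfill() instead of the deprecated fillna(method='bfill'))
df['column'] = df['column'].bfill()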

# Drop rows with missing values
df_clean = df.dropna()

# Drop rows with missing in specific columns
df_clean = df.dropna(subset=['important_column'])
```

## 6. String Operations

pandas provides powerful string methods through the `.str` accessor:

In [None]:
# Clean gender column
print("Original Gender values:")
print(demo_df['Gender'].value_counts())

# Standardize to lowercase, strip whitespace
demo_df['gender_clean'] = demo_df['Gender'].str.lower().str.strip()

print("\nCleaned:")
print(demo_df['gender_clean'].value_counts())

### Common String Operations

In [None]:
# Case conversions on the raw Gender column
raw_gender = demo_df['Gender']
demo_df['gender_upper'] = raw_gender.str.upper()
demo_df['gender_title'] = raw_gender.str.title()

# Boolean flag: exact equality with the cleaned value 'female' (not a pattern match)
demo_df['is_female'] = demo_df['gender_clean'].eq('female')

# Character count of each People_in_Household entry
household_text = demo_df['People_in_Household']
demo_df['name_length'] = household_text.str.len()

demo_df[['Gender', 'gender_clean', 'gender_upper', 'is_female']].head()

## 7. Using `.assign()` to Create Multiple Columns

`.assign()` is great for creating multiple new columns in one operation:

In [None]:
# Create multiple columns at once
# right=False makes each bin include its lower edge, so the intervals are
# [0, 18), [18, 35), [35, 50), [50, 65), [65, inf) and agree with the labels:
# an 18-year-old falls into '18-35', a 65-year-old into '65+', and ages above
# 100 still fall into '65+' instead of becoming NaN.
demo_enhanced = demo_df.assign(
    age_group=lambda x: pd.cut(x['age'], bins=[0, 18, 35, 50, 65, np.inf],
                               right=False,
                               labels=['<18', '18-35', '35-50', '50-65', '65+']),
    has_children=lambda x: x['num_children_clean'] > 0,
    large_household=lambda x: x['household_size_clean'] >= 4
)

print("New columns created:")
print(demo_enhanced[['age', 'age_group', 'num_children_clean', 'has_children', 
                      'household_size_clean', 'large_household']].head(10))

## 8. Data Type Conversions

### Converting to Categorical

For columns with few unique values, `category` dtype saves memory and speeds up operations:

In [None]:
# Check memory before
print("Memory usage before:")
print(f"Gender: {demo_df['Gender'].memory_usage(deep=True) / 1024:.2f} KB")

# Convert to category
demo_df['gender_cat'] = demo_df['gender_clean'].astype('category')

print(f"\nGender (category): {demo_df['gender_cat'].memory_usage(deep=True) / 1024:.2f} KB")
print(f"Memory saved: {(demo_df['Gender'].memory_usage(deep=True) - demo_df['gender_cat'].memory_usage(deep=True)) / 1024:.2f} KB")

In [None]:
# Check dtypes
print("\nOriginal dtype:", demo_df['Gender'].dtype)
print("Category dtype:", demo_df['gender_cat'].dtype)
print("Categories:", demo_df['gender_cat'].cat.categories)

### Other Type Conversions

In [None]:
# Float to integer (must have no NaNs)
demo_df['weight_int'] = demo_df['weight'].fillna(0).astype(int)

# Integer to float
demo_df['age_float'] = demo_df['age'].astype(float)

# String to numeric (coerce errors to NaN)
# demo_df['numeric_col'] = pd.to_numeric(demo_df['string_col'], errors='coerce')

print("Type conversions:")
print(demo_df[['weight', 'weight_int', 'age', 'age_float']].dtypes)

## 9. Renaming Columns

Clean up column names for easier coding:

In [None]:
# Standardize media column names
media_df.columns = media_df.columns.str.lower().str.replace(' ', '_')

print("Standardized media columns:")
print(media_df.columns.tolist())

In [None]:
# Rename specific columns
demo_renamed = demo_df.rename(columns={
    'Person ID': 'person_id',
    'Number_of children': 'num_children_orig',
    'People_in_Household': 'household_orig',
    'Gender': 'gender_orig'
})

print("\nRenamed demo columns:")
print(demo_renamed.columns.tolist())

## 10. Building a Cleaning Pipeline

Let's combine everything into a reusable function:

In [None]:
def clean_demographic_data(df, reference_year=2025):
    """
    Clean demographic data with all transformations.

    Parameters:
    -----------
    df : DataFrame
        Raw demographic data. The columns 'BIRTHDAY', 'Number_of children',
        'People_in_Household' and 'Gender' are read; 'Person ID' is renamed
        when present. The input frame is not modified.
    reference_year : int, default 2025
        Year used to derive ``age`` as ``reference_year - birth year``
        (month and day of the birthday are ignored).

    Returns:
    --------
    DataFrame
        Copy of ``df`` with 'Person ID', 'Number_of children' and
        'People_in_Household' renamed to 'person_id', 'num_children_orig' and
        'household_orig', plus the new columns 'birthday_dt', 'age',
        'num_children', 'household_size', 'gender', 'age_group' and
        'has_children'.

    Notes:
    ------
    Missing or unparseable birthdays become NaT (and a NaN age / age_group),
    missing household sizes become <NA> (nullable Int64), and missing or
    digit-free child counts become 0.
    """
    # Copy to avoid modifying original
    df_clean = df.copy()

    # Rename columns
    df_clean = df_clean.rename(columns={
        'Person ID': 'person_id',
        'Number_of children': 'num_children_orig',
        'People_in_Household': 'household_orig'
    })

    # Parse birthday (float YYYYMMDD). NaN cannot be cast to int, so missing
    # values are filled with a placeholder for the cast and masked out again.
    birthday_num = pd.to_numeric(df_clean['BIRTHDAY'], errors='coerce')
    birthday_str = (
        birthday_num.fillna(0)
        .astype('int64')
        .astype(str)
        .where(birthday_num.notna())
    )
    # errors='coerce': values that are not a valid calendar date become NaT
    df_clean['birthday_dt'] = pd.to_datetime(
        birthday_str,
        format='%Y%m%d',
        errors='coerce'
    )
    df_clean['age'] = reference_year - df_clean['birthday_dt'].dt.year

    # Clean number of children: first run of digits, 0 when missing or absent
    children_digits = (
        df_clean['num_children_orig'].astype(str).str.extract(r'(\d+)', expand=False)
    )
    df_clean['num_children'] = (
        pd.to_numeric(children_digits, errors='coerce').fillna(0).astype(int)
    )

    # Extract household size; nullable Int64 keeps unknown sizes as <NA>
    household_digits = (
        df_clean['household_orig'].astype(str).str.extract(r'(\d+)', expand=False)
    )
    df_clean['household_size'] = (
        pd.to_numeric(household_digits, errors='coerce').astype('Int64')
    )

    # Clean gender
    df_clean['gender'] = df_clean['Gender'].str.lower().str.strip().astype('category')

    # Create derived columns.
    # right=False -> bins are [0, 18), [18, 35), [35, 50), [50, 65), [65, inf),
    # which is what the labels say; the open upper bound keeps ages above 100.
    df_clean['age_group'] = pd.cut(
        df_clean['age'],
        bins=[0, 18, 35, 50, 65, np.inf],
        right=False,
        labels=['<18', '18-35', '35-50', '50-65', '65+']
    )
    df_clean['has_children'] = df_clean['num_children'] > 0

    return df_clean

# Test the pipeline
demo_clean = clean_demographic_data(demo_df)
print("Cleaned demographic data:")
print(demo_clean.info())

In [None]:
# View cleaned data
demo_clean[['person_id', 'age', 'gender', 'num_children', 
            'household_size', 'age_group', 'has_children']].head(10)

## Summary

In this notebook, you learned:

✅ Parse dates from various formats using `pd.to_datetime()`  
✅ Extract date components (year, month, age) with `.dt` accessor  
✅ Clean text columns with string operations (`.str.lower()`, `.str.strip()`, `.str.extract()`)  
✅ Normalize inconsistent categorical data  
✅ Handle missing values with different strategies (`.fillna()`, `.dropna()`)  
✅ Create new columns with `.assign()` and lambda functions  
✅ Convert data types with `.astype()` and `pd.to_numeric()`  
✅ Use categorical dtype for memory efficiency  
✅ Build reusable cleaning pipelines with functions  
✅ Rename columns for consistency

### Key Takeaways

1. **Always inspect data first**: Use `.value_counts()`, `.unique()`, `.dtypes` before cleaning
2. **Handle missing values appropriately**: Different strategies for different situations
3. **Use categories for repeated strings**: Saves memory and speeds up operations
4. **Build reusable functions**: Create cleaning pipelines you can apply consistently
5. **Document your transformations**: Comment your code so others understand the logic

### Next Steps

In the next notebook (**04_eda.ipynb** or similar), we'll:
- Explore cleaned data with statistics and visualizations
- Create summary tables and crosstabs
- Generate insights from the data
- Build compelling visualizations

## 🎯 Practice Exercises

Try these on your own:

1. Create an "age_decade" column (e.g., "20s", "30s", "40s")
2. Extract the household gender type (single_male, single_female, multi) from `People_in_Household`
3. Create a "high_weight" boolean column for weight > median weight
4. Bin the age column into quartiles (4 equal groups)
5. Find and fix any remaining missing values in the media dataset
6. Create a "total_media_exposure" column summing all media columns
7. Convert all media column names to follow `snake_case` convention
8. Create age bins that make sense for your analysis

### Bonus Challenges

9. Write a function to clean the media dataset (similar to the demo cleaning function)
10. Create a "generation" column (Gen Z, Millennial, Gen X, Boomer) based on birth year
11. Detect and handle outliers in the weight column (values > 3 standard deviations from mean)
12. Create a "media_diversity" score counting how many different media channels each person uses

## Loading/Saving Data Between Notebooks

### Load Data from Previous Notebook

If you saved data from notebook 02:

```python
# Uncomment to load previously selected data
# media_df = pd.read_csv('../outputs/media_selected.csv')
# demo_df = pd.read_csv('../outputs/demo_selected.csv')
```

### Save Cleaned Data for Next Notebook

Save your cleaned datasets:

```python
# Uncomment to save cleaned data
# demo_clean.to_csv('../outputs/demo_cleaned.csv', index=False)
# media_df.to_csv('../outputs/media_cleaned.csv', index=False)

# Or save in Parquet format (faster, smaller)
# demo_clean.to_parquet('../outputs/demo_cleaned.parquet')
# media_df.to_parquet('../outputs/media_cleaned.parquet')
```

**Note**: Create the `../outputs/` directory first if it doesn't exist!

```python
# Create outputs directory
import os
os.makedirs('../outputs', exist_ok=True)
```