In [3]:
import pandas as pd
import os

def clean_missing_values(input_path, output_path):
    """
    Clean the dataset by filling missing values according to specified rules.
    
    Parameters:
    input_path (str): Path to the raw CSV file
    output_path (str): Path to save the cleaned CSV file
    """
    # Read the raw data
    df = pd.read_csv(input_path)
    
    print("="*60)
    print("DATA CLEANING PROCESS")
    print("="*60)
    print(f"\nOriginal dataset shape: {df.shape[0]} rows √ó {df.shape[1]} columns")
    
    # Display missing values before cleaning
    print("\nMissing values BEFORE cleaning:")
    missing_before = df.isnull().sum()
    missing_cols = missing_before[missing_before > 0]
    if len(missing_cols) > 0:
        for col, count in missing_cols.items():
            print(f"  {col}: {count} missing values")
    
    print("\n" + "-"*60)
    print("APPLYING CLEANING RULES:")
    print("-"*60)
    
    # 1. Fill missing age values with mode
    if df['age'].isnull().sum() > 0:
        age_mode = df['age'].mode()[0]
        missing_age_count = df['age'].isnull().sum()
        df['age'].fillna(age_mode, inplace=True)
        print(f"‚úì Filled {missing_age_count} missing 'age' values with mode: {age_mode}")
    else:
        print("‚úì No missing values in 'age' column")
    
    # 2. Fill missing ethnicity values with "unknown"
    if df['ethnicity'].isnull().sum() > 0:
        missing_ethnicity_count = df['ethnicity'].isnull().sum()
        df['ethnicity'].fillna('unknown', inplace=True)
        print(f"‚úì Filled {missing_ethnicity_count} missing 'ethnicity' values with 'unknown'")
    else:
        print("‚úì No missing values in 'ethnicity' column")
    
    # 3. Standardize relation column: convert "self" to "Self"
    if 'relation' in df.columns:
        # Count occurrences before standardization
        self_lowercase_count = (df['relation'] == 'self').sum()
        if self_lowercase_count > 0:
            df['relation'] = df['relation'].replace('self', 'Self')
            print(f"‚úì Standardized {self_lowercase_count} 'self' values to 'Self' in 'relation' column")
        else:
            print("‚úì No 'self' values to standardize in 'relation' column")
    
    # 4. Fill missing relation values with "Parent"
    if df['relation'].isnull().sum() > 0:
        missing_relation_count = df['relation'].isnull().sum()
        df['relation'].fillna('Parent', inplace=True)
        print(f"‚úì Filled {missing_relation_count} missing 'relation' values with 'Parent'")
    else:
        print("‚úì No missing values in 'relation' column")
    
    # Display missing values after cleaning
    print("\n" + "-"*60)
    print("Missing values AFTER cleaning:")
    missing_after = df.isnull().sum()
    missing_cols_after = missing_after[missing_after > 0]
    if len(missing_cols_after) > 0:
        for col, count in missing_cols_after.items():
            print(f"  {col}: {count} missing values")
    else:
        print("  No missing values remaining!")
    
    # Save cleaned data
    df.to_csv(output_path, index=False)
    print(f"\n{'='*60}")
    print(f"‚úì Cleaned data saved to: {output_path}")
    print(f"{'='*60}\n")
    
    # Display summary statistics
    print("SUMMARY:")
    print(f"  Total rows: {df.shape[0]}")
    print(f"  Total columns: {df.shape[1]}")
    print(f"  Unique values in 'relation' column: {df['relation'].unique()}")
    print(f"  Value counts for 'relation':")
    print(df['relation'].value_counts().to_string())
    
    return df

# Define paths for notebook environment
# Navigate up from notebooks/hammad/ to project root
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
input_file = os.path.join(project_root, 'data', 'raw', 'autism_screening.csv')
output_file = os.path.join(project_root, 'data', 'clean', 'autism_screening_cleaned.csv')

# Ensure output directory exists
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Run the cleaning process
# Close any open file handles to the output file first
import gc
gc.collect()

cleaned_df = clean_missing_values(input_file, output_file)


DATA CLEANING PROCESS

Original dataset shape: 292 rows √ó 21 columns

Missing values BEFORE cleaning:
  age: 4 missing values
  ethnicity: 43 missing values
  relation: 43 missing values

------------------------------------------------------------
APPLYING CLEANING RULES:
------------------------------------------------------------
‚úì Filled 4 missing 'age' values with mode: 4.0
‚úì Filled 43 missing 'ethnicity' values with 'unknown'
‚úì Standardized 1 'self' values to 'Self' in 'relation' column
‚úì Filled 43 missing 'relation' values with 'Parent'

------------------------------------------------------------
Missing values AFTER cleaning:
  No missing values remaining!

‚úì Cleaned data saved to: d:\Labs\ML\ML Project\ChildhoodAutismRiskPrediction\data\clean\autism_screening_cleaned.csv

SUMMARY:
  Total rows: 292
  Total columns: 21
  Unique values in 'relation' column: ['Parent' 'Self' 'Relative' "'Health care professional'"]
  Value counts for 'relation':
relation
Parent       

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(age_mode, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['ethnicity'].fillna('unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alwa

# Additional Data Cleaning
Now we'll fix additional issues:
1. Replace "unknown" with "Other" in ethnicity (since "Other" already exists)
2. Remove single quotes around values in ethnicity, country_of_res, and age_desc
3. Strip leading/trailing spaces from all columns
4. Display unique values for all columns


In [4]:
# Load the cleaned data
df = pd.read_csv(output_file)

print("="*60)
print("ADDITIONAL DATA CLEANING")
print("="*60)
print(f"\nDataset shape: {df.shape[0]} rows √ó {df.shape[1]} columns\n")

# 1. Replace "unknown" with "Other" in ethnicity column
if 'unknown' in df['ethnicity'].values:
    unknown_count = (df['ethnicity'] == 'unknown').sum()
    df['ethnicity'] = df['ethnicity'].replace('unknown', 'Others')
    print(f"‚úì Replaced {unknown_count} 'unknown' values with 'Others' in ethnicity column")
else:
    print("‚úì No 'unknown' values found in ethnicity column")

# 2. Remove single quotes from specific columns
columns_to_clean = ['ethnicity', 'country_of_res', 'age_desc']

for col in columns_to_clean:
    if col in df.columns:
        # Remove leading and trailing single quotes
        df[col] = df[col].astype(str).str.strip("'")
        print(f"‚úì Removed single quotes from '{col}' column")

# 3. Strip leading and trailing spaces from all columns
print("\n" + "-"*60)
print("Removing leading/trailing spaces from all columns...")
print("-"*60)

for col in df.columns:
    if df[col].dtype == 'object':  # Only process string columns
        # Count rows with leading/trailing spaces
        spaces_before = df[col].astype(str).str.len()
        df[col] = df[col].astype(str).str.strip()
        spaces_after = df[col].astype(str).str.len()
        cleaned = (spaces_before != spaces_after).sum()
        
        if cleaned > 0:
            print(f"  ‚úì Cleaned {cleaned} values in '{col}' column")

print("\n" + "="*60)
print("SAVING UPDATED CLEANED DATA")
print("="*60)

# Save the updated cleaned data
df.to_csv(output_file, index=False)
print(f"‚úì Updated cleaned data saved to: {output_file}\n")


ADDITIONAL DATA CLEANING

Dataset shape: 292 rows √ó 21 columns

‚úì Replaced 43 'unknown' values with 'Others' in ethnicity column
‚úì Removed single quotes from 'ethnicity' column
‚úì Removed single quotes from 'country_of_res' column
‚úì Removed single quotes from 'age_desc' column

------------------------------------------------------------
Removing leading/trailing spaces from all columns...
------------------------------------------------------------
  ‚úì Cleaned 27 values in 'ethnicity' column

SAVING UPDATED CLEANED DATA
‚úì Updated cleaned data saved to: d:\Labs\ML\ML Project\ChildhoodAutismRiskPrediction\data\clean\autism_screening_cleaned.csv



# Display Unique Values for All Columns


In [11]:
# Set pandas display options to show all rows
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Also save the report to a file to ensure nothing is truncated
report_file = os.path.join(project_root, 'reports', 'unique_values_report.txt')
os.makedirs(os.path.dirname(report_file), exist_ok=True)

# Open file for writing
with open(report_file, 'w', encoding='utf-8') as f:
    # Display unique values for all columns
    header = "="*80 + "\nUNIQUE VALUES FOR ALL COLUMNS\n" + "="*80 + "\n"
    print(header)
    f.write(header)
    
    for col in df.columns:
        unique_vals = df[col].unique()
        n_unique = len(unique_vals)
        
        section_header = f"\n{col.upper()} ({n_unique} unique values):\n" + "-" * 80 + "\n"
        print(section_header)
        f.write(section_header)
        
        # Display all unique values
        sorted_vals = sorted([str(val) for val in unique_vals])
        
        # Print each unique value
        if n_unique > 30:
            for i, val in enumerate(sorted_vals, 1):
                line = f"  {i}. {val}\n"
                print(line.strip())
                f.write(line)
        else:
            for val in sorted_vals:
                line = f"  - {val}\n"
                print(line.strip())
                f.write(line)
        
        # Show value counts for categorical columns
        if n_unique <= 60 and df[col].dtype == 'object':
            vc_header = f"\nValue counts:\n"
            print(vc_header)
            f.write(vc_header)
            
            value_counts = df[col].value_counts()
            for val, count in value_counts.items():
                line = f"  {val}: {count}\n"
                print(line.strip())
                f.write(line)
    
    footer = "\n" + "="*80 + "\nDATA CLEANING COMPLETE!\n" + "="*80 + "\n"
    print(footer)
    f.write(footer)

print(f"\n‚úì Full report also saved to: {report_file}")
print(f"‚úì Total columns processed: {len(df.columns)}")
print(f"‚úì Columns: {', '.join(df.columns)}")


UNIQUE VALUES FOR ALL COLUMNS


A1_SCORE (2 unique values):
--------------------------------------------------------------------------------

- 0
- 1

A2_SCORE (2 unique values):
--------------------------------------------------------------------------------

- 0
- 1

A3_SCORE (2 unique values):
--------------------------------------------------------------------------------

- 0
- 1

A4_SCORE (2 unique values):
--------------------------------------------------------------------------------

- 0
- 1

A5_SCORE (2 unique values):
--------------------------------------------------------------------------------

- 0
- 1

A6_SCORE (2 unique values):
--------------------------------------------------------------------------------

- 0
- 1

A7_SCORE (2 unique values):
--------------------------------------------------------------------------------

- 0
- 1

A8_SCORE (2 unique values):
--------------------------------------------------------------------------------

- 0
- 1

A9_SCORE (2 uniq

# Feature Encoding

Now we'll encode categorical features for ML model compatibility:

## Encoding Strategy:

1. **Binary Encoding** (for 2-category columns):
   - gender, jaundice, autism, used_app_before, class
   - Simple 0/1 encoding (efficient for binary features)

2. **Grouped + One-Hot Encoding** (for ethnicity):
   - Keep categories with count >= 10: White-European (108), Others (57), Asian (46), Middle Eastern (27), South Asian (21), Black (14)
   - Merge rare categories (Latino, Hispanic, Pasifika, Turkish) into "Others"
   - Result: 6 categories ‚Üí 6 dummy variables
   - **Why?** Reduces noise from rare categories and prevents overfitting

3. **One-Hot Encoding** (for relation):
   - relation (4 categories)
   - **Best for:** Nominal categories with no ordinal relationship

4. **Top-N + Other Encoding** (for high-cardinality column):
   - country_of_res: Keep top 5, group rest as "Other"
   - **Why?** 52 categories with only 292 rows would create sparse matrix
   - **Rule of thumb:** When categories > 10% of samples, consider grouping

## Columns to DROP:

### ‚ö†Ô∏è Data Leakage - MUST DROP:
- **result**: Sum of A1-A10 scores. If kept, model will cheat using this instead of learning from actual features!

### No Variance - Should DROP:
- **age_desc**: Only has 1 unique value ('4-11 years') - provides no information


In [15]:
# Load the cleaned data
df_encoded = pd.read_csv(output_file)

print("="*80)
print("FEATURE ENCODING PROCESS")
print("="*80)
print(f"\nOriginal shape: {df_encoded.shape}")

# Create encoding report
encoding_report = []
encoding_report.append("="*80)
encoding_report.append("ENCODING REPORT")
encoding_report.append("="*80 + "\n")

# ============================================================================
# 1. BINARY ENCODING (2-category columns)
# ============================================================================
print("\n" + "-"*80)
print("1. BINARY ENCODING (2-category columns)")
print("-"*80)
encoding_report.append("\n1. BINARY ENCODING")
encoding_report.append("-"*80)

binary_columns = {
    'gender': {'m': 1, 'f': 0},
    'jaundice': {'yes': 1, 'no': 0},
    'autism': {'yes': 1, 'no': 0},
    'used_app_before': {'yes': 1, 'no': 0},
    'class': {'YES': 1, 'NO': 0}
}

for col, mapping in binary_columns.items():
    if col in df_encoded.columns:
        df_encoded[col] = df_encoded[col].map(mapping)
        msg = f"‚úì {col}: {mapping}"
        print(f"  {msg}")
        encoding_report.append(f"  {msg}")

# ============================================================================
# 2. TOP-N + OTHER ENCODING for country_of_res
# ============================================================================
print("\n" + "-"*80)
print("2. TOP-5 COUNTRY ENCODING")
print("-"*80)
encoding_report.append("\n\n2. TOP-5 COUNTRY ENCODING")
encoding_report.append("-"*80)

# Get top 5 countries
top_5_countries = df_encoded['country_of_res'].value_counts().head(5).index.tolist()
print(f"  Top 5 countries: {top_5_countries}")
encoding_report.append(f"  Top 5 countries: {top_5_countries}")

# Create a new column with top 5 + Other
df_encoded['country_grouped'] = df_encoded['country_of_res'].apply(
    lambda x: x if x in top_5_countries else 'Other'
)

# Show distribution
grouped_counts = df_encoded['country_grouped'].value_counts()
print(f"\n  Grouped distribution:")
encoding_report.append(f"\n  Grouped distribution:")
for country, count in grouped_counts.items():
    msg = f"    {country}: {count}"
    print(msg)
    encoding_report.append(msg)

# One-hot encode the grouped countries
country_dummies = pd.get_dummies(df_encoded['country_grouped'], prefix='country', dtype=int)
df_encoded = pd.concat([df_encoded, country_dummies], axis=1)

# Drop original columns
df_encoded.drop(['country_of_res', 'country_grouped'], axis=1, inplace=True)
print(f"\n  ‚úì Created {len(country_dummies.columns)} country dummy variables")
print(f"  ‚úì Columns: {list(country_dummies.columns)}")
encoding_report.append(f"\n  ‚úì Created {len(country_dummies.columns)} country dummy variables")
encoding_report.append(f"  ‚úì Columns: {list(country_dummies.columns)}")

# ============================================================================
# 3. GROUPED ENCODING for ethnicity (keep top categories with count >= 10)
# ============================================================================
print("\n" + "-"*80)
print("3. GROUPED ETHNICITY ENCODING (keep categories with count >= 10)")
print("-"*80)
encoding_report.append("\n\n3. GROUPED ETHNICITY ENCODING")
encoding_report.append("-"*80)

if 'ethnicity' in df_encoded.columns:
    # Get ethnicity value counts
    ethnicity_counts = df_encoded['ethnicity'].value_counts()
    print(f"\n  Original ethnicity distribution:")
    encoding_report.append(f"\n  Original ethnicity distribution:")
    for eth, count in ethnicity_counts.items():
        msg = f"    {eth}: {count}"
        print(msg)
        encoding_report.append(msg)
    
    # Keep categories with count >= 10, merge rest into "Others"
    top_ethnicities = ethnicity_counts[ethnicity_counts >= 10].index.tolist()
    print(f"\n  Keeping categories with count >= 10: {top_ethnicities}")
    encoding_report.append(f"\n  Keeping categories with count >= 10: {top_ethnicities}")
    
    # Group ethnicities
    df_encoded['ethnicity_grouped'] = df_encoded['ethnicity'].apply(
        lambda x: x if x in top_ethnicities else 'Others'
    )
    
    # Ensure "Others" is included in the list if not already
    if 'Others' not in top_ethnicities and 'Others' in df_encoded['ethnicity'].values:
        # Others is already in the original data, no need to add
        pass
    
    # Show new distribution
    grouped_eth_counts = df_encoded['ethnicity_grouped'].value_counts()
    print(f"\n  Grouped ethnicity distribution:")
    encoding_report.append(f"\n  Grouped ethnicity distribution:")
    for eth, count in grouped_eth_counts.items():
        msg = f"    {eth}: {count}"
        print(msg)
        encoding_report.append(msg)
    
    # One-hot encode the grouped ethnicity
    n_categories = df_encoded['ethnicity_grouped'].nunique()
    eth_dummies = pd.get_dummies(df_encoded['ethnicity_grouped'], prefix='ethnicity', dtype=int)
    df_encoded = pd.concat([df_encoded, eth_dummies], axis=1)
    df_encoded.drop(['ethnicity', 'ethnicity_grouped'], axis=1, inplace=True)
    
    msg = f"‚úì ethnicity: {n_categories} grouped categories ‚Üí {len(eth_dummies.columns)} dummy variables"
    print(f"\n  {msg}")
    print(f"  Columns: {list(eth_dummies.columns)}")
    encoding_report.append(f"\n  {msg}")
    encoding_report.append(f"  Columns: {list(eth_dummies.columns)}")

# ============================================================================
# 4. ONE-HOT ENCODING for relation
# ============================================================================
print("\n" + "-"*80)
print("4. ONE-HOT ENCODING (relation column)")
print("-"*80)
encoding_report.append("\n\n4. ONE-HOT ENCODING")
encoding_report.append("-"*80)

# First, fix the relation column - remove quotes from 'Health care professional'
if 'relation' in df_encoded.columns:
    df_encoded['relation'] = df_encoded['relation'].str.strip("'")
    print("  ‚úì Cleaned 'relation' column (removed quotes)")

# One-hot encode relation
if 'relation' in df_encoded.columns:
    n_categories = df_encoded['relation'].nunique()
    dummies = pd.get_dummies(df_encoded['relation'], prefix='relation', dtype=int)
    df_encoded = pd.concat([df_encoded, dummies], axis=1)
    df_encoded.drop('relation', axis=1, inplace=True)
    
    msg = f"‚úì relation: {n_categories} categories ‚Üí {len(dummies.columns)} dummy variables"
    print(f"  {msg}")
    print(f"    Columns: {list(dummies.columns)}")
    encoding_report.append(f"  {msg}")
    encoding_report.append(f"    Columns: {list(dummies.columns)}")

# ============================================================================
# 5. DROP LEAKAGE AND CONSTANT COLUMNS
# ============================================================================
print("\n" + "-"*80)
print("5. DROPPING LEAKAGE AND CONSTANT COLUMNS")
print("-"*80)
encoding_report.append("\n\n5. DROPPING LEAKAGE AND CONSTANT COLUMNS")
encoding_report.append("-"*80)

columns_to_drop = []

# Drop result column (data leakage - it's sum of A1-A10)
if 'result' in df_encoded.columns:
    columns_to_drop.append('result')
    msg = "‚úì Dropped 'result' (DATA LEAKAGE: sum of A1-A10 scores)"
    print(f"  {msg}")
    encoding_report.append(f"  {msg}")
    encoding_report.append(f"    Reason: Would give model perfect shortcut, preventing real learning")

# Drop age_desc (constant column)
if 'age_desc' in df_encoded.columns:
    columns_to_drop.append('age_desc')
    msg = "‚úì Dropped 'age_desc' (CONSTANT: only 1 unique value - no variance)"
    print(f"  {msg}")
    encoding_report.append(f"  {msg}")

if columns_to_drop:
    df_encoded.drop(columns_to_drop, axis=1, inplace=True)
    print(f"\n  Total columns dropped: {len(columns_to_drop)}")
    encoding_report.append(f"\n  Total columns dropped: {len(columns_to_drop)}")

# ============================================================================
# SUMMARY
# ============================================================================
print("\n" + "="*80)
print("ENCODING SUMMARY")
print("="*80)
encoding_report.append("\n\n" + "="*80)
encoding_report.append("ENCODING SUMMARY")
encoding_report.append("="*80)

print(f"  Final shape: {df_encoded.shape}")
print(f"  Original columns: 21")
print(f"  Final columns: {df_encoded.shape[1]}")
print(f"  Columns removed: {21 - df_encoded.shape[1] + (df_encoded.shape[1] - 21)}")

encoding_report.append(f"  Final shape: {df_encoded.shape}")
encoding_report.append(f"  Original columns: 21")
encoding_report.append(f"  Final columns: {df_encoded.shape[1]}")

print(f"\n  All columns:")
encoding_report.append(f"\n  All columns:")
for i, col in enumerate(df_encoded.columns, 1):
    print(f"    {i}. {col}")
    encoding_report.append(f"    {i}. {col}")

# Verify no data leakage
print(f"\n  ‚ö†Ô∏è DATA LEAKAGE CHECK:")
encoding_report.append(f"\n  ‚ö†Ô∏è DATA LEAKAGE CHECK:")
if 'result' in df_encoded.columns:
    warning = "  ‚ùå WARNING: 'result' column still present - REMOVE IT!"
    print(warning)
    encoding_report.append(warning)
else:
    success = "  ‚úÖ SUCCESS: 'result' column removed - no data leakage"
    print(success)
    encoding_report.append(success)

# Save encoded data
encoded_output = os.path.join(project_root, 'data', 'clean', 'autism_screening_encoded.csv')
df_encoded.to_csv(encoded_output, index=False)
print(f"\n‚úì Encoded data saved to: {encoded_output}")

# Save encoding report
report_path = os.path.join(project_root, 'reports', 'encoding_report.txt')
with open(report_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(encoding_report))
print(f"‚úì Encoding report saved to: {report_path}")

print("\n" + "="*80)
print("ENCODING COMPLETE!")
print("="*80)


FEATURE ENCODING PROCESS

Original shape: (292, 21)

--------------------------------------------------------------------------------
1. BINARY ENCODING (2-category columns)
--------------------------------------------------------------------------------
  ‚úì gender: {'m': 1, 'f': 0}
  ‚úì jaundice: {'yes': 1, 'no': 0}
  ‚úì autism: {'yes': 1, 'no': 0}
  ‚úì used_app_before: {'yes': 1, 'no': 0}
  ‚úì class: {'YES': 1, 'NO': 0}

--------------------------------------------------------------------------------
2. TOP-5 COUNTRY ENCODING
--------------------------------------------------------------------------------
  Top 5 countries: ['United Kingdom', 'United States', 'India', 'Australia', 'Jordan']

  Grouped distribution:
    Other: 116
    United Kingdom: 49
    India: 42
    United States: 42
    Australia: 23
    Jordan: 20

  ‚úì Created 6 country dummy variables
  ‚úì Columns: ['country_Australia', 'country_India', 'country_Jordan', 'country_Other', 'country_United Kingdom', 'cou

# Is This the Best Encoding Strategy?

## ‚úÖ Yes, this approach is good for most ML models because:

### Binary Encoding (0/1):
- **Efficient**: Single column per feature
- **Works with**: Linear models, tree-based models, neural networks
- **Best for**: Features with natural binary states (yes/no, male/female)

### One-Hot Encoding:
- **Prevents false ordinal relationships**: Each category is independent
- **Works with**: Linear models (Logistic Regression, Linear SVM), Neural Networks
- **Already optimized for**: Tree-based models (Random Forest, XGBoost, etc.) can handle categorical data directly, but one-hot encoding still works well

### Top-N + Other Strategy:
- **Reduces dimensionality**: 52 columns ‚Üí 6 columns (manageable)
- **Prevents overfitting**: Too many sparse categories can hurt model performance
- **Captures majority patterns**: Top 5 countries represent ~60% of data

## üìä Model Considerations:

**Tree-based models** (Random Forest, XGBoost, CatBoost):
- ‚úÖ Work excellently with this encoding
- ‚ÑπÔ∏è CatBoost can handle categorical features natively, but this encoding still works

**Linear models** (Logistic Regression, SVM):
- ‚úÖ Require this type of encoding
- ‚ö†Ô∏è May need feature scaling for numeric columns (age, result)

**Neural Networks**:
- ‚úÖ Work well with this encoding
- ‚ö†Ô∏è Definitely need feature scaling

## üí° Alternative Approach:
If using **CatBoost**, you could keep categorical columns as-is and let it handle them natively. But your current approach is **universal and production-ready** for all ML models!
