<a href="https://colab.research.google.com/github/MaherFPS/Food_Intolerance_Analysis/blob/main/notebooks/03_Data_Enrichment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load cleaned data
try:
    fertilizers_clean = pd.read_csv("data/processed/fertilizers_clean.csv")
    pesticides_clean = pd.read_csv("data/processed/pesticides_clean.csv")

    print("Cleaned data loaded successfully")
except Exception as e:
    print(f"Error loading cleaned data: {e}")

# Add regional classifications
def add_regions(df):
    # Define regions
    regions = {
        'North America': ['United States of America', 'Canada', 'Mexico', 'United States'],
        'Europe': ['France', 'Germany', 'United Kingdom', 'Italy', 'Spain', 'Poland', 'Netherlands',
                  'Belgium', 'Greece', 'Czech Republic', 'Portugal', 'Sweden', 'Denmark', 'Finland'],
        'Asia': ['China', 'India', 'Japan', 'Indonesia', 'Pakistan', 'Bangladesh', 'Philippines',
                'Viet Nam', 'Vietnam', 'Thailand', 'Republic of Korea', 'Malaysia'],
        'Latin America': ['Brazil', 'Colombia', 'Argentina', 'Peru', 'Chile', 'Ecuador', 'Bolivia'],
        'Africa': ['Nigeria', 'Ethiopia', 'Egypt', 'South Africa', 'Kenya', 'Tanzania', 'Morocco'],
        'Oceania': ['Australia', 'New Zealand', 'Fiji']
    }

    # Create region mapping function
    def get_region(country):
        for region, countries in regions.items():
            if country in countries:
                return region
        return 'Other'

    # Add region column
    df['Region'] = df['Area'].apply(get_region)

    return df

def add_development_status(df):
    # Define developed countries
    developed_countries = [
        'United States of America', 'United States', 'Canada', 'Australia', 'New Zealand', 'Japan',
        'Republic of Korea', 'South Korea', 'Israel', 'Singapore', 'France', 'Germany', 'United Kingdom',
        'Italy', 'Spain', 'Netherlands', 'Belgium', 'Switzerland', 'Austria', 'Sweden',
        'Norway', 'Denmark', 'Finland', 'Iceland'
    ]

    # Add development status column
    df['Development_Status'] = df['Area'].apply(
        lambda x: 'Developed' if x in developed_countries else 'Developing'
    )

    return df

def add_agricultural_land(df):
    # Define agricultural land data (rough estimates in thousands of hectares)
    land_data = {
        'United States of America': 405000, 'United States': 405000,
        'Canada': 62500,
        'China': 528000,
        'India': 179800,
        'Brazil': 263000,
        'Russian Federation': 215000, 'Russia': 215000,
        'Australia': 371000,
        'France': 28700,
        'Germany': 16700,
        'United Kingdom': 17600,
        'Italy': 12500,
        'Spain': 26200,
        'Japan': 4500,
        'Mexico': 106700,
        'South Africa': 96300
    }

    # Default value for countries not in the dictionary
    default_land = 10000

    # Add agricultural land column
    df['Agri_Land_Thousands_Ha'] = df['Area'].apply(
        lambda x: land_data.get(x, default_land)
    )

    # Calculate intensity
    df['Value_per_1000Ha'] = df['Value'] / df['Agri_Land_Thousands_Ha']

    return df

# Apply enrichment functions
print("Enriching datasets...")
fertilizers_enriched = add_regions(fertilizers_clean)
fertilizers_enriched = add_development_status(fertilizers_enriched)
fertilizers_enriched = add_agricultural_land(fertilizers_enriched)

pesticides_enriched = add_regions(pesticides_clean)
pesticides_enriched = add_development_status(pesticides_clean)
pesticides_enriched = add_agricultural_land(pesticides_clean)

# Generate Food Intolerance Dataset
def generate_food_intolerance_dataset(fertilizers_df, pesticides_df):
    # Get unique countries from both datasets
    fert_countries = set(fertilizers_df['Area'].unique())
    pest_countries = set(pesticides_df['Area'].unique())
    all_countries = fert_countries.union(pest_countries)

    # Create a base dataframe for food intolerance
    intolerance_data = []

    for country in all_countries:
        # Get region and development status
        # Default values if country isn't in one of the datasets
        region = 'Other'
        dev_status = 'Developing'

        # Try to get from fertilizers first
        fert_country_data = fertilizers_df[fertilizers_df['Area'] == country]
        if not fert_country_data.empty:
            region = fert_country_data['Region'].iloc[0]
            dev_status = fert_country_data['Development_Status'].iloc[0]
        else:
            # Try pesticides data
            pest_country_data = pesticides_df[pesticides_df['Area'] == country]
            if not pest_country_data.empty:
                region = pest_country_data['Region'].iloc[0]
                dev_status = pest_country_data['Development_Status'].iloc[0]

        # Base rates for food intolerance by region (from literature)
        base_rates = {
            'North America': 5.5,
            'Europe': 3.8,
            'Asia': 5.2,
            'Latin America': 4.8,
            'Africa': 3.5,
            'Oceania': 9.0,
            'Other': 4.0
        }

        # Development status modifier
        dev_factor = 1.2 if dev_status == 'Developed' else 0.9

        # Get recent fertilizer and pesticide data (last 5 years)
        recent_years = range(2015, 2023)

        # Calculate average fertilizer use
        fert_recent = fertilizers_df[(fertilizers_df['Area'] == country) &
                                    (fertilizers_df['Year'].isin(recent_years))]
        fert_intensity = 0.0
        if not fert_recent.empty:
            fert_intensity = fert_recent['Value_per_1000Ha'].mean()

        # Calculate average pesticide use
        pest_recent = pesticides_df[(pesticides_df['Area'] == country) &
                                   (pesticides_df['Year'].isin(recent_years))]
        pest_intensity = 0.0
        if not pest_recent.empty:
            pest_intensity = pest_recent['Value_per_1000Ha'].mean()

        # Normalize intensities for the formula
        # Simple min-max normalization using reasonable global values
        fert_intensity_norm = min(1.0, max(0.0, fert_intensity / 1000))
        pest_intensity_norm = min(1.0, max(0.0, pest_intensity / 100))

        # Generate food intolerance prevalence
        # Formula: base_rate * dev_factor * (1 + weighted chemical intensity)
        base_rate = base_rates.get(region, 4.0)
        food_intolerance = base_rate * dev_factor * (1 + 0.6 * fert_intensity_norm + 0.4 * pest_intensity_norm)

        # Ensure it's in a reasonable range (1-15%)
        food_intolerance = min(15.0, max(1.0, food_intolerance))

        # Add to dataset
        intolerance_data.append({
            'Country': country,
            'Region': region,
            'Development_Status': dev_status,
            'Fertilizer_Intensity': round(fert_intensity, 2),
            'Pesticide_Intensity': round(pest_intensity, 2),
            'Food_Intolerance_Prevalence': round(food_intolerance, 2)
        })

    # Create dataframe
    intolerance_df = pd.DataFrame(intolerance_data)

    return intolerance_df

# Generate food intolerance dataset
print("Generating food intolerance dataset...")
intolerance_df = generate_food_intolerance_dataset(fertilizers_enriched, pesticides_enriched)

# Save to CSV
intolerance_df.to_csv("data/processed/dataset.intolerance.csv", index=False)
print(f"Created food intolerance dataset with {len(intolerance_df)} countries")
print("Saved as 'data/processed/dataset.intolerance.csv'")

# Display the first few rows
print("\nFood Intolerance Dataset Preview:")
print(intolerance_df.head())

# Save this notebook to GitHub

Cleaned data loaded successfully
Enriching datasets...
Generating food intolerance dataset...
Created food intolerance dataset with 301 countries
Saved as 'data/processed/dataset.intolerance.csv'

Food Intolerance Dataset Preview:
                                  Country  Region Development_Status  \
0                                Americas   Other         Developing   
1  Eastern Europe (excluding intra-trade)   Other         Developing   
2                                Ethiopia  Africa         Developing   
3                                 Bahrain   Other         Developing   
4                                 Estonia   Other         Developing   

   Fertilizer_Intensity  Pesticide_Intensity  Food_Intolerance_Prevalence  
0                705.14                32.57                         5.59  
1                460.79                 0.00                         4.60  
2                  4.48                 0.13                         3.16  
3                  3.28         