In [1]:
#Mount Google Drive if running in Colab

from google.colab import drive
drive.mount('/content/drive/')

filepath = '/content/drive/MyDrive/DS_Capstone/'

Mounted at /content/drive/


In [3]:
#Import libraries and set graphics styles

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from datetime import timedelta
import warnings
warnings.filterwarnings('ignore')

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

In [4]:
# Load X/Twitter data
print("\n1.2 Loading X/Twitter data...")
twitter_file = filepath + "Bird_flu/_H5N1_ -is_retweet since_2022-06-01 until_2025-01-01 lang_en.csv"
twitter = pd.read_csv(twitter_file, low_memory=False)
twitter['createdAt'] = pd.to_datetime(twitter['createdAt'], utc=True, errors='coerce')
twitter.dropna(subset=['createdAt'], inplace=True) # Drop rows with NaT in 'createdAt'
twitter['createdAt'] = twitter['createdAt'].dt.tz_convert(None)
twitter['date'] = twitter['createdAt'].dt.date
twitter = twitter.sort_values('createdAt')


1.2 Loading X/Twitter data...
✓ Loaded 183,185 tweets
  Date range: 2022-06-01 01:19:03 to 2024-12-31 23:59:41


In [6]:
twitter

Unnamed: 0,_id,userId,userName,alias,profilePicture,userCreatedAt,bio,followers,following,lists,...,images,impressions,isImage,firstImage,location,url,noteTweetText,retweetedUserId,reportName,date
20838,1531807504845398017,14777416,Krissy Klabacha,presquevoo,https://pbs.twimg.com/profile_images/264804069...,2008-05-14T19:35:08.000Z,talker; typer; consumer; thinker; dancing pupp...,89,425,0,...,[],0.0,False,,Chicagoland mostly,https://x.com/user/status/1531807504845398017,,,,2022-06-01
20837,1531822538396946437,1204093660880678913,Md. Ekramul Karim,MdEkramulKarim1,https://pbs.twimg.com/profile_images/120409419...,2019-12-09T17:41:36.000Z,Microbiome; Host-Microbe Interactions; Microbi...,435,2094,4,...,[],0.0,False,,Houston TX,https://x.com/user/status/1531822538396946437,,,,2022-06-01
20836,1531877472253448192,18932416,Louise Hosie,louise_hosie,https://pbs.twimg.com/profile_images/167849633...,2009-01-13T07:45:06.000Z,Senior Journalist @BBCScotlandNews. Ex-@stvnew...,3947,5053,127,...,[],0.0,False,,Scotland United Kingdom,https://x.com/user/status/1531877472253448192,,,,2022-06-01
20835,1531921969582596096,239996081,Nigel Bennett,top1percentile,https://pbs.twimg.com/profile_images/122098430...,2011-01-18T22:49:05.000Z,Scientist technologist and maverick philosophe...,7642,1918,69,...,[],0.0,False,,England,https://x.com/user/status/1531921969582596096,,,,2022-06-01
20834,1531930248597622785,1464367421217718275,Margaret Taylor PhD,MgtTaylor1,https://pbs.twimg.com/profile_images/146437284...,2021-11-26T22:56:50.000Z,Cell Biol Uni St Andrews/Influenza/Science/Cli...,261,1774,1,...,[],0.0,False,,Scotland United Kingdom,https://x.com/user/status/1531930248597622785,,,,2022-06-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121502,1874243879769428458,1728929146833625088,Mary Kay,MMahon43573,https://pbs.twimg.com/profile_images/174551449...,2023-11-27T00:10:18.000Z,Curiosity killed the cat; but answers always b...,276,869,1,...,[],8132.0,False,,,https://x.com/user/status/1874243879769428458,,,,2024-12-31
121501,1874243965203210478,2148517162,Thomas,kotkowskitj,https://pbs.twimg.com/profile_images/378800000...,2013-10-22T13:51:03.000Z,Retired Marine. Truth teller not afraid of the...,2294,1656,5,...,[],605.0,False,,Massachusetts,https://x.com/user/status/1874243965203210478,,,,2024-12-31
121500,1874243968466379120,1448822071803277315,Kate Pritchard,antiglobalism10,https://pbs.twimg.com/profile_images/185411257...,2021-10-15T01:25:08.000Z,Atheist British/AustralianEx Aid Worker Wester...,2313,1219,29,...,[],17435.0,False,,🇦🇺🇬🇧🇮🇱🇺🇸,https://x.com/user/status/1874243968466379120,,,,2024-12-31
121499,1874243979505766755,15910110,Shana Bates RN,shanabates,https://pbs.twimg.com/profile_images/468412917...,2008-08-19T21:50:46.000Z,Registered Nurse,1421,4321,20,...,[],36.0,False,,Deep South,https://x.com/user/status/1874243979505766755,,,,2024-12-31


In [7]:
twitter.shape

(183185, 33)

In [9]:
pip install geopy pycountry

Collecting pycountry
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycountry
Successfully installed pycountry-24.6.1


In [37]:
"""
Twitter Location Extraction - CORRECTED VERSION
Fix: Proper dataframe application to avoid row misalignment
"""

import pandas as pd
import re
from typing import Optional, Dict


class LocationExtractor:
    """Extract and validate locations from Twitter location strings"""

    def __init__(self):
        # Invalid patterns to filter out
        self.invalid_patterns = [
            r'\b(everywhere|nowhere|internet|online|cyberspace|virtual|worldwide)\b',
            r'\b(earth|world|globe|universe|metaverse)\b',
            r'\b(on-chain|in the club)\b',
            r'international website',
        ]

        # Location aliases and normalizations
        self.aliases = {
            'chicagoland': 'Chicago',
            'chicagoland mostly': 'Chicago, Illinois',
            'chicagoland, mostly': 'Chicago, Illinois',
            'the free state of florida': 'Florida',
            'houston tx': 'Houston, Texas',
            'houston, tx': 'Houston, Texas',
            'washington dc': 'Washington DC',
            'washington, dc': 'Washington DC',
            'vancouver': 'Vancouver, Canada',
            'scotland united kingdom': 'Scotland, UK',
            'scotland, united kingdom': 'Scotland, UK',
            'england': 'England, UK',
            'uk': 'United Kingdom',
            'u.k.': 'United Kingdom',
            'gb': 'United Kingdom',
            'usa': 'United States',
            'us': 'United States',
            'u.s.': 'United States',
            'u.s.a.': 'United States',
            'uae': 'United Arab Emirates',
            'u.a.e.': 'United Arab Emirates',
            'ksa': 'Saudi Arabia',
            'nz': 'New Zealand',
            'oz': 'Australia',
            'aus': 'Australia',
        }

        # Known countries for extraction
        self.countries = {
            'United States', 'Canada', 'United Kingdom', 'Australia', 'England',
            'Scotland', 'Wales', 'Ireland', 'India', 'China', 'Japan', 'Germany',
            'France', 'Italy', 'Spain', 'Netherlands', 'Mexico', 'Brazil',
            'Philippines', 'Singapore', 'New Zealand', 'Argentina', 'Chile',
            'South Africa', 'Kenya', 'Nigeria', 'Egypt', 'Israel', 'UAE',
            'Saudi Arabia', 'Turkey', 'Russia', 'Ukraine', 'Greece', 'Portugal',
            'Belgium', 'Sweden', 'Norway', 'Denmark', 'Finland', 'Switzerland',
            'Austria', 'Poland', 'Czech Republic', 'Hungary', 'Romania',
            'South Korea', 'Thailand', 'Vietnam', 'Malaysia', 'Indonesia',
            'Pakistan', 'Bangladesh', 'Sri Lanka', 'Nepal', 'Myanmar',
            'Colombia', 'Peru', 'Venezuela', 'Ecuador', 'Bolivia',
            'Morocco', 'Algeria', 'Tunisia', 'Ghana', 'Ethiopia',
            'Taiwan', 'Hong Kong', 'Macau'
        }

        # US States (full names)
        self.us_states = {
            'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
            'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
            'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
            'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
            'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
            'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
            'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
            'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
            'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
            'West Virginia', 'Wisconsin', 'Wyoming'
        }

        # US State abbreviations mapping
        self.state_abbrev = {
            'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas',
            'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware',
            'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii', 'ID': 'Idaho',
            'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas',
            'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland',
            'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi',
            'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada',
            'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico', 'NY': 'New York',
            'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma',
            'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island', 'SC': 'South Carolina',
            'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah',
            'VT': 'Vermont', 'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia',
            'WI': 'Wisconsin', 'WY': 'Wyoming', 'DC': 'Washington DC'
        }

        # Major cities
        self.cities = {
            'New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'Philadelphia',
            'San Diego', 'Dallas', 'Austin', 'Seattle', 'Denver', 'Boston',
            'Portland', 'Toronto', 'Vancouver', 'Montreal', 'London', 'Sydney',
            'Melbourne', 'Cairo', 'Singapore', 'Washington', 'Tampa', 'Manila',
            'Jacksonville', 'Beaumont'
        }

    def is_invalid(self, location: str) -> bool:
        """Check if location matches invalid patterns"""
        if not location or pd.isna(location) or str(location).strip() == '':
            return True

        location_lower = str(location).lower().strip()

        # Check invalid patterns
        for pattern in self.invalid_patterns:
            if re.search(pattern, location_lower, re.IGNORECASE):
                return True

        return False

    def normalize(self, location: str) -> Optional[str]:
        """Normalize location using aliases and handle abbreviations"""
        if not location or pd.isna(location):
            return None

        location_clean = str(location).strip()
        location_lower = location_clean.lower()

        # Check exact aliases first
        if location_lower in self.aliases:
            return self.aliases[location_lower]

        # Clean up spacing and punctuation
        location_clean = re.sub(r'\s+', ' ', location_clean).strip()

        # Handle state abbreviations (e.g., "IL", "N.C.", "TX")
        # Split by comma to preserve structure
        parts = [p.strip() for p in location_clean.split(',')]
        normalized_parts = []

        for part in parts:
            # Remove periods and check if it's a state abbreviation
            part_no_periods = part.replace('.', '').upper()
            if part_no_periods in self.state_abbrev:
                normalized_parts.append(self.state_abbrev[part_no_periods])
            else:
                normalized_parts.append(part)

        location_clean = ', '.join(normalized_parts)

        # Handle partial country names (e.g., "South of England" → "England")
        for country in self.countries:
            if country.lower() in location_clean.lower():
                # Replace the full phrase with just the country name
                location_clean = re.sub(
                    rf'\b(north|south|east|west|kingdom|republic)\s+(of\s+)?{re.escape(country.lower())}\b',
                    country,
                    location_clean,
                    flags=re.IGNORECASE
                )
                location_clean = re.sub(
                    rf'\b{re.escape(country.lower())}\b',
                    country,
                    location_clean,
                    flags=re.IGNORECASE
                )

        # Handle "Kingdom of [Country]" pattern (e.g., "Kingdom of Saudi Arabia")
        location_clean = re.sub(
            r'\b(kingdom|republic|federation|state)\s+of\s+',
            '',
            location_clean,
            flags=re.IGNORECASE
        )

        return location_clean

    def extract_components(self, normalized: str) -> Dict[str, Optional[str]]:
        """Extract city, state, country from normalized location"""
        result = {
            'city': None,
            'state': None,
            'country': None
        }

        if not normalized:
            return result

        # Split by comma
        parts = [p.strip() for p in normalized.split(',')]

        # Check each part
        for part in parts:
            if part in self.countries:
                result['country'] = part
            if part in self.us_states:
                result['state'] = part
                if not result['country']:
                    result['country'] = 'United States'
            if part in self.cities:
                result['city'] = part

        return result

    def process_location(self, location: str) -> Dict:
        """
        Main processing function for a single location
        Returns a dictionary with all extracted information
        """
        # Initialize result
        result = {
            'loc_is_valid': False,
            'loc_normalized': None,
            'loc_city': None,
            'loc_state': None,
            'loc_country': None,
            'loc_confidence': 'invalid',
            'loc_method': 'pattern_filter'
        }

        # Step 1: Check if invalid
        if self.is_invalid(location):
            return result

        # Step 2: Normalize
        normalized = self.normalize(location)
        if not normalized:
            result['loc_method'] = 'normalization_failed'
            return result

        result['loc_normalized'] = normalized
        result['loc_is_valid'] = True
        result['loc_method'] = 'normalized_only'

        # Step 3: Extract components
        components = self.extract_components(normalized)
        result['loc_city'] = components['city']
        result['loc_state'] = components['state']
        result['loc_country'] = components['country']

        # Step 4: Set confidence
        if components['country'] or components['state'] or components['city']:
            result['loc_confidence'] = 'high'
            result['loc_method'] = 'extracted'
        else:
            result['loc_confidence'] = 'medium'

        return result


def process_dataframe(df: pd.DataFrame,
                     location_col: str = 'userLocation') -> pd.DataFrame:
    """
    Process entire dataframe - CORRECT METHOD

    Args:
        df: DataFrame with Twitter data
        location_col: Name of location column

    Returns:
        DataFrame with additional location columns
    """
    extractor = LocationExtractor()

    print(f"Processing {len(df)} rows...")
    print(f"Location column: '{location_col}'")

    # CRITICAL: Process each location and store ALL results at once
    # This prevents row misalignment
    results_list = []

    for idx, row in df.iterrows():
        location = row.get(location_col)
        result = extractor.process_location(location)
        results_list.append(result)

        if (idx + 1) % 1000 == 0:
            print(f"  Processed {idx + 1}/{len(df)} rows...")

    # Convert results to DataFrame
    results_df = pd.DataFrame(results_list)

    # Add location columns to original dataframe
    df_output = df.copy()
    for col in results_df.columns:
        df_output[col] = results_df[col].values  # Use .values to ensure alignment

    # Summary
    valid_count = df_output['loc_is_valid'].sum()
    high_conf = (df_output['loc_confidence'] == 'high').sum()

    print(f"\n✓ Complete!")
    print(f"  Valid locations: {valid_count}/{len(df)} ({100*valid_count/len(df):.1f}%)")
    print(f"  High confidence: {high_conf}")

    return df_output



In [38]:
result = process_dataframe(twitter, location_col='userLocation')

print(result[['userLocation', 'loc_normalized', 'loc_is_valid']])


Processing 183185 rows...
Location column: 'userLocation'
  Processed 20000/183185 rows...
  Processed 15000/183185 rows...
  Processed 17000/183185 rows...
  Processed 121000/183185 rows...
  Processed 120000/183185 rows...
  Processed 52000/183185 rows...
  Processed 51000/183185 rows...
  Processed 152000/183185 rows...
  Processed 14000/183185 rows...
  Processed 18000/183185 rows...
  Processed 183000/183185 rows...
  Processed 182000/183185 rows...
  Processed 181000/183185 rows...
  Processed 180000/183185 rows...
  Processed 179000/183185 rows...
  Processed 178000/183185 rows...
  Processed 177000/183185 rows...
  Processed 176000/183185 rows...
  Processed 175000/183185 rows...
  Processed 174000/183185 rows...
  Processed 173000/183185 rows...
  Processed 172000/183185 rows...
  Processed 171000/183185 rows...
  Processed 106000/183185 rows...
  Processed 105000/183185 rows...
  Processed 104000/183185 rows...
  Processed 103000/183185 rows...
  Processed 102000/183185 rows.

In [39]:
result

Unnamed: 0,_id,userId,userName,alias,profilePicture,userCreatedAt,bio,followers,following,lists,...,retweetedUserId,reportName,date,loc_is_valid,loc_normalized,loc_city,loc_state,loc_country,loc_confidence,loc_method
20838,1531807504845398017,14777416,Krissy Klabacha,presquevoo,https://pbs.twimg.com/profile_images/264804069...,2008-05-14T19:35:08.000Z,talker; typer; consumer; thinker; dancing pupp...,89,425,0,...,,,2022-06-01,True,"Chicago, Illinois",Chicago,Illinois,United States,high,extracted
20837,1531822538396946437,1204093660880678913,Md. Ekramul Karim,MdEkramulKarim1,https://pbs.twimg.com/profile_images/120409419...,2019-12-09T17:41:36.000Z,Microbiome; Host-Microbe Interactions; Microbi...,435,2094,4,...,,,2022-06-01,True,"Houston, Texas",Houston,Texas,United States,high,extracted
20836,1531877472253448192,18932416,Louise Hosie,louise_hosie,https://pbs.twimg.com/profile_images/167849633...,2009-01-13T07:45:06.000Z,Senior Journalist @BBCScotlandNews. Ex-@stvnew...,3947,5053,127,...,,,2022-06-01,True,"Scotland, UK",,,Scotland,high,extracted
20835,1531921969582596096,239996081,Nigel Bennett,top1percentile,https://pbs.twimg.com/profile_images/122098430...,2011-01-18T22:49:05.000Z,Scientist technologist and maverick philosophe...,7642,1918,69,...,,,2022-06-01,True,"England, UK",,,England,high,extracted
20834,1531930248597622785,1464367421217718275,Margaret Taylor PhD,MgtTaylor1,https://pbs.twimg.com/profile_images/146437284...,2021-11-26T22:56:50.000Z,Cell Biol Uni St Andrews/Influenza/Science/Cli...,261,1774,1,...,,,2022-06-01,True,"Scotland, UK",,,Scotland,high,extracted
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121502,1874243879769428458,1728929146833625088,Mary Kay,MMahon43573,https://pbs.twimg.com/profile_images/174551449...,2023-11-27T00:10:18.000Z,Curiosity killed the cat; but answers always b...,276,869,1,...,,,2024-12-31,False,,,,,invalid,pattern_filter
121501,1874243965203210478,2148517162,Thomas,kotkowskitj,https://pbs.twimg.com/profile_images/378800000...,2013-10-22T13:51:03.000Z,Retired Marine. Truth teller not afraid of the...,2294,1656,5,...,,,2024-12-31,True,Massachusetts,,Massachusetts,United States,high,extracted
121500,1874243968466379120,1448822071803277315,Kate Pritchard,antiglobalism10,https://pbs.twimg.com/profile_images/185411257...,2021-10-15T01:25:08.000Z,Atheist British/AustralianEx Aid Worker Wester...,2313,1219,29,...,,,2024-12-31,True,🇦🇺🇬🇧🇮🇱🇺🇸,,,,medium,normalized_only
121499,1874243979505766755,15910110,Shana Bates RN,shanabates,https://pbs.twimg.com/profile_images/468412917...,2008-08-19T21:50:46.000Z,Registered Nurse,1421,4321,20,...,,,2024-12-31,True,Deep South,,,,medium,normalized_only


In [40]:
result.to_csv('result.csv', index = False)

In [41]:
"""
Twitter Location Extraction - SMART VERSION
Uses geocoding APIs instead of manual lists for comprehensive coverage
"""

import pandas as pd
import re
from typing import Optional, Dict
import time

# Install these if needed: pip install geopy pycountry
try:
    from geopy.geocoders import Nominatim
    from geopy.exc import GeocoderTimedOut, GeocoderServiceError
    import pycountry
    GEOCODING_AVAILABLE = True
except ImportError:
    GEOCODING_AVAILABLE = False
    print("⚠️  For best results, install: pip install geopy pycountry")


class SmartLocationExtractor:
    """
    Smart location extractor that uses geocoding APIs instead of manual lists.
    Falls back to pattern matching when APIs are unavailable.
    """

    def __init__(self, use_geocoding: bool = True):
        self.use_geocoding = use_geocoding and GEOCODING_AVAILABLE

        if self.use_geocoding:
            # Nominatim is free and doesn't require API key
            self.geolocator = Nominatim(
                user_agent="twitter_location_extractor",
                timeout=3
            )
            print("✓ Geocoding enabled (OpenStreetMap/Nominatim)")
        else:
            print("⚠️  Geocoding disabled - using pattern matching only")

        # Only patterns we need to filter invalid locations
        self.invalid_patterns = [
            r'\b(everywhere|nowhere|internet|online|cyberspace|virtual|worldwide)\b',
            r'\b(earth|world|globe|universe|metaverse)\b',
            r'\b(on-chain|in the club|web3|blockchain)\b',
            r'international website',
            r'\b(heaven|hell|your (mom|heart|mind))\b',
        ]

        # Minimal aliases for common misspellings/variations
        self.quick_fixes = {
            'usa': 'United States',
            'us': 'United States',
            'uk': 'United Kingdom',
            'uae': 'United Arab Emirates',
            'nyc': 'New York City',
            'sf': 'San Francisco',
            'la': 'Los Angeles',
            'dc': 'Washington DC',
        }

    def is_invalid(self, location: str) -> bool:
        """Quick filter for obviously invalid locations"""
        if not location or pd.isna(location) or str(location).strip() == '':
            return True

        location_lower = str(location).lower().strip()

        # Filter out obvious non-locations
        if len(location_lower) < 2:
            return True

        for pattern in self.invalid_patterns:
            if re.search(pattern, location_lower, re.IGNORECASE):
                return True

        return False

    def clean_location(self, location: str) -> str:
        """Basic cleanup before geocoding"""
        if not location:
            return ""

        # Convert to string and strip
        loc = str(location).strip()

        # Check quick fixes
        loc_lower = loc.lower()
        if loc_lower in self.quick_fixes:
            return self.quick_fixes[loc_lower]

        # Remove emoji and excessive special characters
        loc = re.sub(r'[^\w\s,.-]', '', loc, flags=re.UNICODE)

        # Normalize whitespace
        loc = re.sub(r'\s+', ' ', loc).strip()

        # Remove "Kingdom/Republic of" prefix (helps geocoding)
        loc = re.sub(r'^(kingdom|republic|state|federation)\s+of\s+', '', loc, flags=re.IGNORECASE)

        return loc

    def geocode_location(self, location: str) -> Optional[Dict]:
        """
        Use OpenStreetMap/Nominatim to geocode location.
        Returns structured data about the location.
        """
        if not self.use_geocoding:
            return None

        try:
            result = self.geolocator.geocode(
                location,
                addressdetails=True,
                language='en',
                exactly_one=True
            )

            if result:
                address = result.raw.get('address', {})

                # Extract components from geocoder result
                return {
                    'formatted': result.address,
                    'latitude': result.latitude,
                    'longitude': result.longitude,
                    'city': (address.get('city') or
                            address.get('town') or
                            address.get('village') or
                            address.get('municipality')),
                    'state': (address.get('state') or
                             address.get('region') or
                             address.get('province')),
                    'country': address.get('country'),
                    'country_code': address.get('country_code', '').upper(),
                    'confidence': 'high',
                    'method': 'geocoded'
                }

        except (GeocoderTimedOut, GeocoderServiceError):
            pass
        except Exception as e:
            # Silently fail on any other errors
            pass

        return None

    def extract_country_fallback(self, location: str) -> Optional[str]:
        """
        Fallback: try to find country name using pycountry database.
        Only used when geocoding fails.
        """
        if not GEOCODING_AVAILABLE:
            return None

        location_clean = self.clean_location(location)
        location_upper = location_clean.upper()
        location_lower = location_clean.lower()

        # Try exact country code match
        try:
            for country in pycountry.countries:
                # Check ISO codes
                if (country.alpha_2 == location_upper or
                    country.alpha_3 == location_upper):
                    return country.name

                # Check country name (case insensitive)
                if country.name.lower() in location_lower:
                    return country.name
        except:
            pass

        return None

    def process_location(self, location: str) -> Dict:
        """
        Main processing function.
        Strategy:
        1. Filter invalid locations
        2. Clean the text
        3. Try geocoding (if enabled)
        4. Fall back to country extraction
        5. Keep as-is if nothing else works
        """
        # Initialize result
        result = {
            'loc_is_valid': False,
            'loc_normalized': None,
            'loc_city': None,
            'loc_state': None,
            'loc_country': None,
            'loc_country_code': None,
            'loc_latitude': None,
            'loc_longitude': None,
            'loc_confidence': 'invalid',
            'loc_method': 'filtered'
        }

        # Step 1: Filter invalid
        if self.is_invalid(location):
            return result

        # Step 2: Clean
        cleaned = self.clean_location(location)
        if not cleaned:
            return result

        # Mark as valid (we'll fill in details)
        result['loc_is_valid'] = True
        result['loc_normalized'] = cleaned

        # Step 3: Try geocoding
        if self.use_geocoding:
            geocoded = self.geocode_location(cleaned)
            if geocoded:
                result.update({
                    'loc_normalized': geocoded['formatted'],
                    'loc_city': geocoded['city'],
                    'loc_state': geocoded['state'],
                    'loc_country': geocoded['country'],
                    'loc_country_code': geocoded['country_code'],
                    'loc_latitude': geocoded['latitude'],
                    'loc_longitude': geocoded['longitude'],
                    'loc_confidence': 'high',
                    'loc_method': 'geocoded'
                })
                return result

        # Step 4: Fallback - extract country only
        country = self.extract_country_fallback(cleaned)
        if country:
            result.update({
                'loc_country': country,
                'loc_confidence': 'medium',
                'loc_method': 'country_extracted'
            })
            return result

        # Step 5: Keep as-is but mark as low confidence
        result.update({
            'loc_confidence': 'low',
            'loc_method': 'normalized_only'
        })

        return result


def process_dataframe(
    df: pd.DataFrame,
    location_col: str = 'userLocation',
    use_geocoding: bool = True,
    rate_limit_delay: float = 0.05,
    batch_size: Optional[int] = None
) -> pd.DataFrame:
    """
    Process entire dataframe with smart location extraction.

    Args:
        df: DataFrame with Twitter data
        location_col: Name of location column
        use_geocoding: Use geocoding API (slower but much more accurate)
        rate_limit_delay: Seconds to wait between geocoding requests
        batch_size: Process only N rows (for testing)

    Returns:
        DataFrame with additional location columns
    """
    extractor = SmartLocationExtractor(use_geocoding=use_geocoding)

    # Sample if requested
    if batch_size:
        df_to_process = df.head(batch_size).copy()
        print(f"Processing sample of {batch_size} rows...")
    else:
        df_to_process = df.copy()
        print(f"Processing {len(df_to_process)} rows...")

    if use_geocoding and GEOCODING_AVAILABLE:
        print(f"⚠️  Geocoding is slow! Expect ~{len(df_to_process) * rate_limit_delay / 60:.1f} minutes")
        print("   Consider: process_dataframe(df, use_geocoding=False) for faster results")

    # Process each location
    results_list = []

    for idx, row in df_to_process.iterrows():
        location = row.get(location_col)
        result = extractor.process_location(location)
        results_list.append(result)

        # Rate limiting for geocoding
        if use_geocoding and not extractor.is_invalid(location):
            time.sleep(rate_limit_delay)

        if (idx + 1) % 100 == 0:
            print(f"  Processed {idx + 1}/{len(df_to_process)} rows...")

    # Convert results to DataFrame
    results_df = pd.DataFrame(results_list)

    # Add location columns to original dataframe
    df_output = df_to_process.copy()
    for col in results_df.columns:
        df_output[col] = results_df[col].values

    # Summary
    valid_count = df_output['loc_is_valid'].sum()
    high_conf = (df_output['loc_confidence'] == 'high').sum()
    with_coords = df_output['loc_latitude'].notna().sum()

    print(f"\n✓ Complete!")
    print(f"  Valid locations: {valid_count}/{len(df_output)} ({100*valid_count/len(df_output):.1f}%)")
    print(f"  High confidence: {high_conf} ({100*high_conf/len(df_output):.1f}%)")
    if use_geocoding:
        print(f"  With coordinates: {with_coords} ({100*with_coords/len(df_output):.1f}%)")

    return df_output


def process_with_caching(
    df: pd.DataFrame,
    location_col: str = 'userLocation',
    cache_file: str = 'location_cache.pkl'
) -> pd.DataFrame:
    """
    Process with caching to avoid re-geocoding same locations.
    Recommended for large datasets!
    """
    import pickle
    import os

    # Load cache if exists
    cache = {}
    if os.path.exists(cache_file):
        with open(cache_file, 'rb') as f:
            cache = pickle.load(f)
        print(f"✓ Loaded cache with {len(cache)} locations")

    extractor = SmartLocationExtractor(use_geocoding=True)

    # Get unique locations
    unique_locs = df[location_col].dropna().unique()
    print(f"Processing {len(unique_locs)} unique locations...")

    # Process only uncached locations
    new_count = 0
    for loc in unique_locs:
        if loc not in cache:
            result = extractor.process_location(loc)
            cache[loc] = result
            new_count += 1
            time.sleep(0.05)  # Rate limiting

            if new_count % 50 == 0:
                print(f"  Geocoded {new_count} new locations...")

    print(f"✓ Geocoded {new_count} new locations (had {len(unique_locs)-new_count} cached)")

    # Save cache
    with open(cache_file, 'wb') as f:
        pickle.dump(cache, f)
    print(f"✓ Saved cache to {cache_file}")

    # Apply cached results to dataframe
    results_list = [cache.get(loc, {
        'loc_is_valid': False,
        'loc_confidence': 'invalid'
    }) for loc in df[location_col]]

    results_df = pd.DataFrame(results_list)
    df_output = df.copy()
    for col in results_df.columns:
        df_output[col] = results_df[col].values

    return df_output


# ============================================================================
# USAGE EXAMPLES
# ============================================================================

if __name__ == "__main__":
    print("="*70)
    print("SMART LOCATION EXTRACTION - THREE APPROACHES")
    print("="*70)

    # Test data
    test_data = pd.DataFrame({
        'userId': [1, 2, 3, 4, 5, 6],
        'userLocation': [
            'Chicagoland mostly',      # Informal
            'Durham, N.C.',            # State abbreviation
            'Kingdom of Saudi Arabia', # Formal country name
            'South of England',        # Directional
            'International Website',   # Invalid
            'Chiang Mai'               # Foreign city
        ]
    })

    print("\n" + "="*70)
    print("APPROACH 1: Fast (no geocoding)")
    print("="*70)
    result1 = process_dataframe(test_data, use_geocoding=False)
    print("\nResults:")
    print(result1[['userLocation', 'loc_is_valid', 'loc_normalized',
                   'loc_confidence']].to_string(index=False))

    if GEOCODING_AVAILABLE:
        print("\n" + "="*70)
        print("APPROACH 2: Accurate (with geocoding)")
        print("="*70)
        result2 = process_dataframe(test_data, use_geocoding=True)
        print("\nResults:")
        print(result2[['userLocation', 'loc_city', 'loc_country',
                       'loc_confidence']].to_string(index=False))

        print("\n" + "="*70)
        print("APPROACH 3: Large datasets (with caching)")
        print("="*70)
        print("""
# Recommended for datasets with >10k rows:
df_processed = process_with_caching(
    df,
    location_col='userLocation',
    cache_file='my_location_cache.pkl'
)

# Re-running will use cached results - much faster!
        """)

    print("\n" + "="*70)
    print("RECOMMENDATION FOR YOUR H5N1 DATA:")
    print("="*70)
    print("""
Given you likely have 100k+ rows with many duplicate locations:

1. START WITH: Fast mode to understand your data
   df_test = process_dataframe(df.head(1000), use_geocoding=False)

2. THEN: Use caching for full dataset
   df_full = process_with_caching(df, cache_file='h5n1_locations.pkl')

3. BENEFIT: Geocodes ~50k unique locations once, then instant for all rows!
    """)

SMART LOCATION EXTRACTION - THREE APPROACHES

APPROACH 1: Fast (no geocoding)
⚠️  Geocoding disabled - using pattern matching only
Processing 6 rows...

✓ Complete!
  Valid locations: 5/6 (83.3%)
  High confidence: 0 (0.0%)

Results:
           userLocation  loc_is_valid     loc_normalized loc_confidence
     Chicagoland mostly          True Chicagoland mostly            low
           Durham, N.C.          True       Durham, N.C.            low
Kingdom of Saudi Arabia          True       Saudi Arabia         medium
       South of England          True   South of England            low
  International Website         False               None        invalid
             Chiang Mai          True         Chiang Mai            low

APPROACH 2: Accurate (with geocoding)
✓ Geocoding enabled (OpenStreetMap/Nominatim)
Processing 6 rows...
⚠️  Geocoding is slow! Expect ~0.0 minutes
   Consider: process_dataframe(df, use_geocoding=False) for faster results

✓ Complete!
  Valid locations: 5/6 (8

In [47]:
# Install once
# pip install geopy pycountry

# Fast test (no geocoding)
df_test = process_dataframe(twitter.head(100), use_geocoding=False)

# Full dataset with caching (recommended)
df_full = process_with_caching(
    twitter,
    location_col='userLocation',
    cache_file='h5n1_locations.pkl'
)

# Re-run anytime - uses cached results!

⚠️  Geocoding disabled - using pattern matching only
Processing 100 rows...
  Processed 20800/100 rows...

✓ Complete!
  Valid locations: 42/100 (42.0%)
  High confidence: 0 (0.0%)
✓ Loaded cache with 39 locations
✓ Geocoding enabled (OpenStreetMap/Nominatim)
Processing 14761 unique locations...
  Geocoded 50 new locations...
  Geocoded 100 new locations...
  Geocoded 150 new locations...
  Geocoded 200 new locations...
  Geocoded 250 new locations...
  Geocoded 300 new locations...
  Geocoded 350 new locations...




  Geocoded 400 new locations...
  Geocoded 450 new locations...




  Geocoded 500 new locations...
  Geocoded 550 new locations...
  Geocoded 600 new locations...
  Geocoded 650 new locations...
  Geocoded 700 new locations...
  Geocoded 750 new locations...
  Geocoded 800 new locations...
  Geocoded 850 new locations...
  Geocoded 900 new locations...
  Geocoded 950 new locations...
  Geocoded 1000 new locations...




  Geocoded 1050 new locations...
  Geocoded 1100 new locations...




  Geocoded 1150 new locations...
  Geocoded 1200 new locations...
  Geocoded 1250 new locations...
  Geocoded 1300 new locations...
  Geocoded 1350 new locations...
  Geocoded 1400 new locations...
  Geocoded 1450 new locations...
  Geocoded 1500 new locations...




  Geocoded 1550 new locations...




  Geocoded 1600 new locations...




  Geocoded 1650 new locations...
  Geocoded 1700 new locations...




  Geocoded 1750 new locations...
  Geocoded 1800 new locations...
  Geocoded 1850 new locations...
  Geocoded 1900 new locations...
  Geocoded 1950 new locations...
  Geocoded 2000 new locations...
  Geocoded 2050 new locations...




  Geocoded 2100 new locations...
  Geocoded 2150 new locations...
  Geocoded 2200 new locations...




  Geocoded 2250 new locations...
  Geocoded 2300 new locations...
  Geocoded 2350 new locations...




  Geocoded 2400 new locations...




  Geocoded 2450 new locations...
  Geocoded 2500 new locations...
  Geocoded 2550 new locations...
  Geocoded 2600 new locations...
  Geocoded 2650 new locations...
  Geocoded 2700 new locations...
  Geocoded 2750 new locations...




  Geocoded 2800 new locations...
  Geocoded 2850 new locations...
  Geocoded 2900 new locations...
  Geocoded 2950 new locations...
  Geocoded 3000 new locations...
  Geocoded 3050 new locations...




  Geocoded 3100 new locations...




  Geocoded 3150 new locations...
  Geocoded 3200 new locations...
  Geocoded 3250 new locations...




  Geocoded 3300 new locations...
  Geocoded 3350 new locations...
  Geocoded 3400 new locations...
  Geocoded 3450 new locations...
  Geocoded 3500 new locations...
  Geocoded 3550 new locations...
  Geocoded 3600 new locations...




  Geocoded 3650 new locations...




  Geocoded 3700 new locations...
  Geocoded 3750 new locations...




  Geocoded 3800 new locations...




  Geocoded 3850 new locations...
  Geocoded 3900 new locations...
  Geocoded 3950 new locations...




  Geocoded 4000 new locations...




  Geocoded 4050 new locations...
  Geocoded 4100 new locations...
  Geocoded 4150 new locations...




  Geocoded 4200 new locations...




  Geocoded 4250 new locations...
  Geocoded 4300 new locations...




  Geocoded 4350 new locations...
  Geocoded 4400 new locations...
  Geocoded 4450 new locations...




  Geocoded 4500 new locations...
  Geocoded 4550 new locations...




  Geocoded 4600 new locations...
  Geocoded 4650 new locations...




  Geocoded 4700 new locations...
  Geocoded 4750 new locations...




  Geocoded 4800 new locations...
  Geocoded 4850 new locations...
  Geocoded 4900 new locations...
  Geocoded 4950 new locations...
  Geocoded 5000 new locations...
  Geocoded 5050 new locations...




  Geocoded 5100 new locations...




  Geocoded 5150 new locations...
  Geocoded 5200 new locations...
  Geocoded 5250 new locations...
  Geocoded 5300 new locations...




  Geocoded 5350 new locations...




  Geocoded 5400 new locations...
  Geocoded 5450 new locations...




  Geocoded 5500 new locations...
  Geocoded 5550 new locations...
  Geocoded 5600 new locations...




  Geocoded 5650 new locations...




  Geocoded 5700 new locations...




  Geocoded 5750 new locations...
  Geocoded 5800 new locations...
  Geocoded 5850 new locations...




  Geocoded 5900 new locations...




  Geocoded 5950 new locations...




  Geocoded 6000 new locations...




  Geocoded 6050 new locations...




  Geocoded 6100 new locations...
  Geocoded 6150 new locations...
  Geocoded 6200 new locations...
  Geocoded 6250 new locations...
  Geocoded 6300 new locations...
  Geocoded 6350 new locations...




  Geocoded 6400 new locations...
  Geocoded 6450 new locations...
  Geocoded 6500 new locations...
  Geocoded 6550 new locations...




  Geocoded 6600 new locations...




  Geocoded 6650 new locations...




  Geocoded 6700 new locations...




  Geocoded 6750 new locations...
  Geocoded 6800 new locations...
  Geocoded 6850 new locations...




  Geocoded 6900 new locations...




  Geocoded 6950 new locations...




  Geocoded 7000 new locations...




  Geocoded 7050 new locations...
  Geocoded 7100 new locations...
  Geocoded 7150 new locations...
  Geocoded 7200 new locations...
  Geocoded 7250 new locations...
  Geocoded 7300 new locations...
  Geocoded 7350 new locations...
  Geocoded 7400 new locations...
  Geocoded 7450 new locations...
  Geocoded 7500 new locations...




  Geocoded 7550 new locations...
  Geocoded 7600 new locations...
  Geocoded 7650 new locations...




  Geocoded 7700 new locations...




  Geocoded 7750 new locations...
  Geocoded 7800 new locations...




  Geocoded 7850 new locations...
  Geocoded 7900 new locations...
  Geocoded 7950 new locations...
  Geocoded 8000 new locations...




  Geocoded 8050 new locations...
  Geocoded 8100 new locations...




  Geocoded 8150 new locations...
  Geocoded 8200 new locations...
  Geocoded 8250 new locations...
  Geocoded 8300 new locations...




  Geocoded 8350 new locations...
  Geocoded 8400 new locations...
  Geocoded 8450 new locations...
  Geocoded 8500 new locations...
  Geocoded 8550 new locations...




  Geocoded 8600 new locations...




  Geocoded 8650 new locations...
  Geocoded 8700 new locations...
  Geocoded 8750 new locations...




  Geocoded 8800 new locations...
  Geocoded 8850 new locations...




  Geocoded 8900 new locations...
  Geocoded 8950 new locations...
  Geocoded 9000 new locations...
  Geocoded 9050 new locations...




  Geocoded 9100 new locations...
  Geocoded 9150 new locations...
  Geocoded 9200 new locations...




  Geocoded 9250 new locations...
  Geocoded 9300 new locations...




  Geocoded 9350 new locations...
  Geocoded 9400 new locations...




  Geocoded 9450 new locations...
  Geocoded 9500 new locations...
  Geocoded 9550 new locations...
  Geocoded 9600 new locations...




  Geocoded 9650 new locations...
  Geocoded 9700 new locations...




  Geocoded 9750 new locations...
  Geocoded 9800 new locations...
  Geocoded 9850 new locations...
  Geocoded 9900 new locations...
  Geocoded 9950 new locations...
  Geocoded 10000 new locations...
  Geocoded 10050 new locations...




  Geocoded 10100 new locations...
  Geocoded 10150 new locations...




  Geocoded 10200 new locations...
  Geocoded 10250 new locations...
  Geocoded 10300 new locations...
  Geocoded 10350 new locations...
  Geocoded 10400 new locations...
  Geocoded 10450 new locations...




  Geocoded 10500 new locations...




  Geocoded 10550 new locations...
  Geocoded 10600 new locations...
  Geocoded 10650 new locations...




  Geocoded 10700 new locations...
  Geocoded 10750 new locations...
  Geocoded 10800 new locations...




  Geocoded 10850 new locations...
  Geocoded 10900 new locations...
  Geocoded 10950 new locations...




  Geocoded 11000 new locations...




  Geocoded 11050 new locations...
  Geocoded 11100 new locations...
  Geocoded 11150 new locations...




  Geocoded 11200 new locations...
  Geocoded 11250 new locations...
  Geocoded 11300 new locations...




  Geocoded 11350 new locations...
  Geocoded 11400 new locations...
  Geocoded 11450 new locations...




  Geocoded 11500 new locations...
  Geocoded 11550 new locations...
  Geocoded 11600 new locations...




  Geocoded 11650 new locations...
  Geocoded 11700 new locations...
  Geocoded 11750 new locations...




  Geocoded 11800 new locations...
  Geocoded 11850 new locations...




  Geocoded 11900 new locations...
  Geocoded 11950 new locations...
  Geocoded 12000 new locations...
  Geocoded 12050 new locations...
  Geocoded 12100 new locations...




  Geocoded 12150 new locations...




  Geocoded 12200 new locations...
  Geocoded 12250 new locations...




  Geocoded 12300 new locations...
  Geocoded 12350 new locations...




  Geocoded 12400 new locations...
  Geocoded 12450 new locations...




  Geocoded 12500 new locations...
  Geocoded 12550 new locations...
  Geocoded 12600 new locations...




  Geocoded 12650 new locations...
  Geocoded 12700 new locations...




  Geocoded 12750 new locations...
  Geocoded 12800 new locations...




  Geocoded 12850 new locations...
  Geocoded 12900 new locations...
  Geocoded 12950 new locations...
  Geocoded 13000 new locations...




  Geocoded 13050 new locations...




  Geocoded 13100 new locations...
  Geocoded 13150 new locations...




  Geocoded 13200 new locations...
  Geocoded 13250 new locations...
  Geocoded 13300 new locations...




  Geocoded 13350 new locations...
  Geocoded 13400 new locations...




  Geocoded 13450 new locations...




  Geocoded 13500 new locations...




  Geocoded 13550 new locations...
  Geocoded 13600 new locations...




  Geocoded 13650 new locations...




  Geocoded 13700 new locations...
  Geocoded 13750 new locations...
  Geocoded 13800 new locations...
  Geocoded 13850 new locations...
  Geocoded 13900 new locations...




  Geocoded 13950 new locations...
  Geocoded 14000 new locations...




  Geocoded 14050 new locations...
  Geocoded 14100 new locations...
  Geocoded 14150 new locations...




  Geocoded 14200 new locations...
  Geocoded 14250 new locations...




  Geocoded 14300 new locations...




  Geocoded 14350 new locations...




  Geocoded 14400 new locations...




  Geocoded 14450 new locations...




  Geocoded 14500 new locations...




  Geocoded 14550 new locations...
  Geocoded 14600 new locations...
  Geocoded 14650 new locations...
  Geocoded 14700 new locations...
✓ Geocoded 14722 new locations (had 39 cached)
✓ Saved cache to h5n1_locations.pkl


In [49]:
df_full.to_csv('twitter_geocoded.csv', index=False)

In [50]:
"""
Incremental Geocoding - Add new locations to existing cache
Simple script to geocode only new locations you haven't seen before
"""

import pickle
import pandas as pd
import time
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError
import re


def load_cache(cache_file='location_cache.pkl'):
    """Load existing cache or create empty one"""
    try:
        with open(cache_file, 'rb') as f:
            cache = pickle.load(f)
        print(f"✓ Loaded existing cache with {len(cache)} locations")
        return cache
    except FileNotFoundError:
        print("✓ Creating new cache")
        return {}


def is_invalid_location(location):
    """Quick filter for obviously invalid locations"""
    if not location or pd.isna(location) or str(location).strip() == '':
        return True

    location_lower = str(location).lower().strip()

    # Filter obvious junk
    invalid_patterns = [
        r'\b(everywhere|nowhere|internet|online|cyberspace|worldwide)\b',
        r'\b(on-chain|in the club|web3)\b',
        r'international website'
    ]

    for pattern in invalid_patterns:
        if re.search(pattern, location_lower, re.IGNORECASE):
            return True

    return False


def geocode_location(location, geolocator):
    """Geocode a single location"""
    try:
        result = geolocator.geocode(location, addressdetails=True, language='en')

        if result:
            address = result.raw.get('address', {})
            return {
                'loc_is_valid': True,
                'loc_normalized': result.address,
                'loc_city': address.get('city') or address.get('town') or address.get('village'),
                'loc_state': address.get('state') or address.get('region'),
                'loc_country': address.get('country'),
                'loc_country_code': address.get('country_code', '').upper(),
                'loc_latitude': result.latitude,
                'loc_longitude': result.longitude,
                'loc_confidence': 'high',
                'loc_method': 'geocoded'
            }
    except (GeocoderTimedOut, GeocoderServiceError):
        pass
    except Exception:
        pass

    # Failed to geocode
    return {
        'loc_is_valid': False,
        'loc_normalized': None,
        'loc_city': None,
        'loc_state': None,
        'loc_country': None,
        'loc_country_code': None,
        'loc_latitude': None,
        'loc_longitude': None,
        'loc_confidence': 'invalid',
        'loc_method': 'geocoding_failed'
    }


def process_new_data(df, cache_file='location_cache.pkl', location_col='userLocation'):
    """
    Process new Twitter data using existing cache + geocoding for new locations.

    Args:
        df: New DataFrame with Twitter data
        cache_file: Path to existing cache file
        location_col: Column containing locations

    Returns:
        DataFrame with location columns added
    """
    print("="*70)
    print("INCREMENTAL GEOCODING")
    print("="*70)

    # Load existing cache
    cache = load_cache(cache_file)
    initial_cache_size = len(cache)

    # Find unique locations in new data
    unique_locations = df[location_col].dropna().unique()
    print(f"\nFound {len(unique_locations)} unique locations in new data")

    # Identify which are already cached
    cached_locations = [loc for loc in unique_locations if loc in cache]
    new_locations = [loc for loc in unique_locations if loc not in cache]

    print(f"  Already cached: {len(cached_locations)}")
    print(f"  Need to geocode: {len(new_locations)}")

    # Geocode only new locations
    if new_locations:
        print(f"\nGeocoding {len(new_locations)} new locations...")
        print(f"Estimated time: ~{len(new_locations) * 0.05 / 60:.1f} minutes\n")

        geolocator = Nominatim(user_agent="twitter_location_extractor", timeout=3)

        for i, location in enumerate(new_locations, 1):
            # Skip invalid locations
            if is_invalid_location(location):
                cache[location] = {
                    'loc_is_valid': False,
                    'loc_confidence': 'invalid',
                    'loc_method': 'filtered'
                }
                continue

            # Geocode
            result = geocode_location(location, geolocator)
            cache[location] = result

            # Progress
            if i % 10 == 0:
                print(f"  Geocoded {i}/{len(new_locations)} new locations...")

            # Rate limiting (Nominatim requires ~1 request/sec)
            time.sleep(0.05)

            # Save periodically (every 100 locations)
            if i % 100 == 0:
                with open(cache_file, 'wb') as f:
                    pickle.dump(cache, f)
                print(f"  ✓ Saved checkpoint at {i} locations")

        # Final save
        with open(cache_file, 'wb') as f:
            pickle.dump(cache, f)

        print(f"\n✓ Geocoded and cached {len(new_locations)} new locations")
        print(f"✓ Updated cache: {initial_cache_size} → {len(cache)} locations")
    else:
        print("\n✓ All locations already cached - no geocoding needed!")

    # Apply cache to dataframe
    print(f"\nApplying cached results to {len(df)} rows...")

    df_output = df.copy()
    df_output['loc_city'] = df[location_col].map(lambda x: cache.get(x, {}).get('loc_city'))
    df_output['loc_state'] = df[location_col].map(lambda x: cache.get(x, {}).get('loc_state'))
    df_output['loc_country'] = df[location_col].map(lambda x: cache.get(x, {}).get('loc_country'))
    df_output['loc_country_code'] = df[location_col].map(lambda x: cache.get(x, {}).get('loc_country_code'))
    df_output['loc_latitude'] = df[location_col].map(lambda x: cache.get(x, {}).get('loc_latitude'))
    df_output['loc_longitude'] = df[location_col].map(lambda x: cache.get(x, {}).get('loc_longitude'))
    df_output['loc_confidence'] = df[location_col].map(lambda x: cache.get(x, {}).get('loc_confidence', 'not_in_cache'))
    df_output['loc_normalized'] = df[location_col].map(lambda x: cache.get(x, {}).get('loc_normalized'))

    # Summary
    valid_count = (df_output['loc_confidence'].isin(['high', 'medium'])).sum()
    print(f"\n✓ Complete!")
    print(f"  Valid locations: {valid_count}/{len(df)} ({100*valid_count/len(df):.1f}%)")
    print(f"  Cache now contains: {len(cache)} total locations")
    print(f"  Added {len(cache) - initial_cache_size} new locations to cache")

    return df_output


# ============================================================================
# SIMPLE USAGE
# ============================================================================

if __name__ == "__main__":

    print("""
USAGE EXAMPLE:

# First time - process your initial data
df1 = pd.read_csv('h5n1_batch1.csv')
df1_processed = process_new_data(df1, cache_file='h5n1_locations.pkl')
df1_processed.to_csv('h5n1_batch1_with_locations.csv', index=False)

# Later - process new data (reuses cache, only geocodes new locations)
df2 = pd.read_csv('h5n1_batch2.csv')
df2_processed = process_new_data(df2, cache_file='h5n1_locations.pkl')
df2_processed.to_csv('h5n1_batch2_with_locations.csv', index=False)

# Even later - more new data (keeps building the cache)
df3 = pd.read_csv('h5n1_batch3.csv')
df3_processed = process_new_data(df3, cache_file='h5n1_locations.pkl')
df3_processed.to_csv('h5n1_batch3_with_locations.csv', index=False)

The cache grows automatically and you never geocode the same location twice!
    """)

    # Example with sample data
    print("\n" + "="*70)
    print("EXAMPLE RUN:")
    print("="*70)

    # Simulate first batch
    df_batch1 = pd.DataFrame({
        'userId': [1, 2, 3],
        'userLocation': ['Chicago, IL', 'London', 'Tokyo']
    })

    print("\n--- Processing Batch 1 ---")
    df1_result = process_new_data(df_batch1, cache_file='example_cache.pkl')
    print("\nResult preview:")
    print(df1_result[['userLocation', 'loc_city', 'loc_country']].to_string(index=False))

    # Simulate second batch with some overlapping locations
    df_batch2 = pd.DataFrame({
        'userId': [4, 5, 6],
        'userLocation': ['Chicago, IL', 'Paris', 'Berlin']  # Chicago already cached!
    })

    print("\n\n--- Processing Batch 2 ---")
    df2_result = process_new_data(df_batch2, cache_file='example_cache.pkl')
    print("\nResult preview:")
    print(df2_result[['userLocation', 'loc_city', 'loc_country']].to_string(index=False))

    print("\n" + "="*70)
    print("Notice: 'Chicago, IL' wasn't geocoded again - it used the cache!")
    print("="*70)


USAGE EXAMPLE:
    
# First time - process your initial data
df1 = pd.read_csv('h5n1_batch1.csv')
df1_processed = process_new_data(df1, cache_file='h5n1_locations.pkl')
df1_processed.to_csv('h5n1_batch1_with_locations.csv', index=False)

# Later - process new data (reuses cache, only geocodes new locations)
df2 = pd.read_csv('h5n1_batch2.csv')
df2_processed = process_new_data(df2, cache_file='h5n1_locations.pkl')
df2_processed.to_csv('h5n1_batch2_with_locations.csv', index=False)

# Even later - more new data (keeps building the cache)
df3 = pd.read_csv('h5n1_batch3.csv')
df3_processed = process_new_data(df3, cache_file='h5n1_locations.pkl')
df3_processed.to_csv('h5n1_batch3_with_locations.csv', index=False)

The cache grows automatically and you never geocode the same location twice!
    

EXAMPLE RUN:

--- Processing Batch 1 ---
INCREMENTAL GEOCODING
✓ Creating new cache

Found 3 unique locations in new data
  Already cached: 0
  Need to geocode: 3

Geocoding 3 new locations...
Esti