<a href="https://colab.research.google.com/github/NearANDfar13/NLP_Winter_2024/blob/main/NAA_San_Fran_NER_ImpressoAPI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

#From NLP class on Impresso API
import requests

def get_linked_entities(text, coarse_only=False):
    """
    Calls the external API to get named entity recognition (NER) results.
    """
    url = "https://impresso-annotation.epfl.ch/api/ner/"
    payload = {"data": text}
    try:
        response = requests.post(url, json=payload)
        if response.status_code == 200:
            data = response.json()
            data["text"] = text
            # remove fine-grained and components
            if coarse_only:
                for ne in data["nes"]:
                    data["nes"] = [ne for ne in data["nes"] if not "." in ne["type"]]
            return data
        else:
            print(f"Request failed with status code {response.status_code}")
            return None
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None


In [2]:
!git clone https://github.com/NearANDfar13/NLP_Winter_2024.git

Cloning into 'NLP_Winter_2024'...
remote: Enumerating objects: 162, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 162 (delta 4), reused 2 (delta 2), pack-reused 150 (from 1)[K
Receiving objects: 100% (162/162), 10.37 MiB | 10.74 MiB/s, done.
Resolving deltas: 100% (73/73), done.


In [7]:
#Data organizing and filtering method from NLP class
#Import the dataset "San Fran EExtended Sample"
import pandas as pd

articles_df = pd.read_excel('/content/NLP_Winter_2024/NA_San_Fran_Extended_Sample.xlsx')

#Display DataFrame

articles_df = articles_df[:10]
articles_df.head()

Unnamed: 0,ID,Day,Short_Name,Full_Name,Place,Frequency,Publication,Text,Page,Link,Datengeber
0,3043262-5,19/04/1906,Annener Zeitung,"Annener Zeitung, verbunden mit der Annener Vol...","Annen (Witten), Witten-Annen",,1885-1943,"New=York, 14 Francisco von eine gesucht. Tause...",2.0,https://www.deutsche-digitale-bibliothek.de/ne...,Westfälische Wilhelms-Universität Münster Univ...
1,3118927-1,19/04/1906,Anzeiger vom Oberland,"Anzeiger vom Oberland, Tageszeitung für das Ob...",Biberach an der Riß,täglich,1870-1932,"au- Oakland als Gerücht gemeldet, daß da- amer...",2.0,https://www.deutsche-digitale-bibliothek.de/ne...,Kreisarchiv des Landkreises Biberach
2,3118927-1,19/04/1906,Anzeiger vom Oberland,"Anzeiger vom Oberland, Tageszeitung für das Ob...",Biberach an der Riß,täglich,1870-1932,"ertappt wird, zu erschießen. ir Gf Store (CaUf...",6.0,https://www.deutsche-digitale-bibliothek.de/ne...,Kreisarchiv des Landkreises Biberach
3,2797055-3,19/04/1906,Badische Presse,Badische Presse : Generalanzeiger der Residenz...,Karlsruhe,täglich,1890-1944,Die Marinestationen am Süllen Ozean versuchten...,4.0,https://www.deutsche-digitale-bibliothek.de/ne...,Badische Landesbibliothek
4,2958285-4,19/04/1906,Der Beobachter,"Der Beobachter, ein Volksblatt aus Schwaben","Stuttgart, Württemberg",täglich,1833-1920,"Erdbeben in Ealifornien. New-Sjork, 18. April....",3.0,https://www.deutsche-digitale-bibliothek.de/ne...,Württembergische Landesbibliothek


##Importing a Dataset

## Named Entity Extraction

In [8]:
import requests
import pandas as pd
import json


def process_dataframe(df):
    results = []
    for index, row in df.iterrows():
        text = row['Text']
        api_result = get_linked_entities(text, coarse_only=True)
        results.append(api_result)
    df['NER_results'] = results
    return df

articles_df = process_dataframe(articles_df)
articles_df.head()

Unnamed: 0,ID,Day,Short_Name,Full_Name,Place,Frequency,Publication,Text,Page,Link,Datengeber,NER_results
0,3043262-5,19/04/1906,Annener Zeitung,"Annener Zeitung, verbunden mit der Annener Vol...","Annen (Witten), Witten-Annen",,1885-1943,"New=York, 14 Francisco von eine gesucht. Tause...",2.0,https://www.deutsche-digitale-bibliothek.de/ne...,Westfälische Wilhelms-Universität Münster Univ...,"{'ts': '2025-03-12T03:46:03Z', 'sys_id': 'ner-..."
1,3118927-1,19/04/1906,Anzeiger vom Oberland,"Anzeiger vom Oberland, Tageszeitung für das Ob...",Biberach an der Riß,täglich,1870-1932,"au- Oakland als Gerücht gemeldet, daß da- amer...",2.0,https://www.deutsche-digitale-bibliothek.de/ne...,Kreisarchiv des Landkreises Biberach,"{'ts': '2025-03-12T03:46:15Z', 'sys_id': 'ner-..."
2,3118927-1,19/04/1906,Anzeiger vom Oberland,"Anzeiger vom Oberland, Tageszeitung für das Ob...",Biberach an der Riß,täglich,1870-1932,"ertappt wird, zu erschießen. ir Gf Store (CaUf...",6.0,https://www.deutsche-digitale-bibliothek.de/ne...,Kreisarchiv des Landkreises Biberach,"{'ts': '2025-03-12T03:46:25Z', 'sys_id': 'ner-..."
3,2797055-3,19/04/1906,Badische Presse,Badische Presse : Generalanzeiger der Residenz...,Karlsruhe,täglich,1890-1944,Die Marinestationen am Süllen Ozean versuchten...,4.0,https://www.deutsche-digitale-bibliothek.de/ne...,Badische Landesbibliothek,"{'ts': '2025-03-12T03:46:53Z', 'sys_id': 'ner-..."
4,2958285-4,19/04/1906,Der Beobachter,"Der Beobachter, ein Volksblatt aus Schwaben","Stuttgart, Württemberg",täglich,1833-1920,"Erdbeben in Ealifornien. New-Sjork, 18. April....",3.0,https://www.deutsche-digitale-bibliothek.de/ne...,Württembergische Landesbibliothek,"{'ts': '2025-03-12T03:47:10Z', 'sys_id': 'ner-..."


In [None]:
# export as excel file
articles_df.to_excel('earthquake_articles_with_ner.xlsx', index=False)

##Extracting Locations from the NER Output - An Example

We can extract specific entities in order to use them for further analysis or visualization. Please not that when using extracted named entities for further analysis, they need to be controlled and verified by a human reader since the model most likely has made some mistakes.

In [10]:
#extract locations

import pandas as pd

def extract_locations(df):
    places = []
    for index, row in df.iterrows():
        ner_results = row['NER_results']
        if ner_results and 'nes' in ner_results:
            # Use 'surface' instead of 'text' to get the entity text
            location_names = [ne['surface'] for ne in ner_results['nes'] if ne.get('type') == 'loc']
            places.append(', '.join(location_names))  # Join multiple locations
        else:
            places.append('')  # Handle cases where NER_results is None or nes key is missing
    df['places'] = places
    return df

articles_df = extract_locations(articles_df)
print(articles_df[['Text', 'places']].head())

                                                Text  \
0  New=York, 14 Francisco von eine gesucht. Tause...   
1  au- Oakland als Gerücht gemeldet, daß da- amer...   
2  ertappt wird, zu erschießen. ir Gf Store (CaUf...   
3  Die Marinestationen am Süllen Ozean versuchten...   
4  Erdbeben in Ealifornien. New-Sjork, 18. April....   

                                              places  
0  New=York, New=York, Francisco, San Frane, Form...  
1  Oakland, Oakland, Bucht von San Franc'Sco, New...  
2  CaUfornten, Nrw, Nevada, N-w Uork, renz, renz,...  
3  Süllen Ozean, San Francisco, San Francisco, Sa...  
4  Ealifornien, New-Sjork, Kansas Cith, Los Angel...  


## Visualization Example - Creating a Map with Place Names

We first use the geopy library to process geographic locations and add their corresponding coordinates (latitude and longitude) to a pandas DataFrame. It includes a GeocodingService class that interfaces with the Nominatim geocoding API, implementing rate-limiting, retries with exponential backoff, and error handling to ensure robust geocoding.

We further use the folium library to create an interactive map with markers for locations provided in a pandas DataFrame. Finally, the map is created and displayed, providing a visual representation of the geographic data.

In [11]:
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError
import pandas as pd
import time
from typing import List, Tuple, Optional
import random

class GeocodingService:
    def __init__(self, user_agent: str = None, timeout: int = 10, rate_limit: float = 1.1):
        """
        Initialize the geocoding service with proper configuration.

        Args:
            user_agent: Custom user agent string (default: generated)
            timeout: Timeout for requests in seconds
            rate_limit: Time to wait between requests in seconds
        """
        if user_agent is None:
            user_agent = f"python_geocoding_script_{random.randint(1000, 9999)}"

        self.geolocator = Nominatim(
            user_agent=user_agent,
            timeout=timeout
        )
        self.rate_limit = rate_limit
        self.last_request = 0

    def _rate_limit_wait(self):
        """Implement rate limiting between requests"""
        current_time = time.time()
        time_since_last = current_time - self.last_request
        if time_since_last < self.rate_limit:
            time.sleep(self.rate_limit - time_since_last)
        self.last_request = time.time()

    def geocode_location(self, location: str, max_retries: int = 3) -> Optional[Tuple[float, float]]:
        """
        Geocode a single location with retries.

        Args:
            location: Location string to geocode
            max_retries: Maximum number of retry attempts

        Returns:
            Tuple of (latitude, longitude) or None if geocoding fails
        """
        for attempt in range(max_retries):
            try:
                self._rate_limit_wait()
                location_data = self.geolocator.geocode(location)
                if location_data:
                    return (location_data.latitude, location_data.longitude)
                return None
            except (GeocoderTimedOut, GeocoderServiceError) as e:
                if attempt == max_retries - 1:
                    print(f"Failed to geocode '{location}' after {max_retries} attempts: {e}")
                    return None
                time.sleep(2 ** attempt)  # Exponential backoff
            except Exception as e:
                print(f"Error geocoding '{location}': {e}")
                return None
        return None

    def process_locations(self, locations: str) -> List[Optional[Tuple[float, float]]]:
        """
        Process a comma-separated string of locations.

        Args:
            locations: Comma-separated string of location names

        Returns:
            List of coordinate tuples or None for failed geocoding
        """
        if pd.isna(locations) or not locations:
            return []

        location_list = [loc.strip() for loc in locations.split(',')]
        return [self.geocode_location(loc) for loc in location_list]

def geolocate_places(df: pd.DataFrame,
                    places_column: str = 'places',
                    user_agent: str = None) -> pd.DataFrame:
    """
    Add coordinates to a DataFrame based on location names.

    Args:
        df: Input DataFrame
        places_column: Name of the column containing comma-separated location strings
        user_agent: Custom user agent string

    Returns:
        DataFrame with added 'coordinates' column
    """
    geocoder = GeocodingService(user_agent=user_agent)

    # Create a copy to avoid modifying the original DataFrame
    result_df = df.copy()

    # Process locations
    result_df['coordinates'] = result_df[places_column].apply(geocoder.process_locations)

    return result_df

# Main execution
if __name__ == "__main__":
    # Assuming articles_df is your DataFrame with a 'places' column
    # Apply geocoding to the articles DataFrame
    articles_df_with_coords = geolocate_places(
        articles_df,
        places_column='places',
        user_agent='article_geocoding_service_v1.0'
    )

    # Update the original DataFrame with the new coordinates
    articles_df['coordinates'] = articles_df_with_coords['coordinates']

    # Display the results
    print("\nSample of geocoded locations:")
    print(articles_df[['places', 'coordinates']].head())

    # Optional: Display some statistics
    total_locations = len(articles_df)
    successful_geocodes = articles_df['coordinates'].apply(lambda x: len([c for c in x if c is not None])).sum()
    failed_geocodes = articles_df['coordinates'].apply(lambda x: len([c for c in x if c is None])).sum()

    print(f"\nGeocoding Statistics:")
    print(f"Total locations processed: {total_locations}")
    print(f"Successfully geocoded: {successful_geocodes}")
    print(f"Failed to geocode: {failed_geocodes}")


Sample of geocoded locations:
                                              places  \
0  New=York, New=York, Francisco, San Frane, Form...   
1  Oakland, Oakland, Bucht von San Franc'Sco, New...   
2  CaUfornten, Nrw, Nevada, N-w Uork, renz, renz,...   
3  Süllen Ozean, San Francisco, San Francisco, Sa...   
4  Ealifornien, New-Sjork, Kansas Cith, Los Angel...   

                                         coordinates  
0  [(40.7127281, -74.0060152), (40.7127281, -74.0...  
1  [(37.8044557, -122.271356), (37.8044557, -122....  
2  [None, (51.4789205, 7.5543751), (39.5158825, -...  
3  [None, (37.7792588, -122.4193286), (37.7792588...  
4  [None, None, None, (34.0536909, -118.242766), ...  

Geocoding Statistics:
Total locations processed: 10
Successfully geocoded: 365
Failed to geocode: 118


In [12]:
import folium
from folium import plugins
import pandas as pd
from typing import List, Tuple, Optional
from IPython.display import display

def create_location_map(df: pd.DataFrame,
                       coordinates_col: str = 'coordinates',
                       places_col: str = 'places',
                       title_col: Optional[str] = None) -> folium.Map:
    """
    Create an interactive map with markers for all locations in the DataFrame.

    Args:
        df: DataFrame containing coordinates and place names
        coordinates_col: Name of column containing coordinates
        places_col: Name of column containing place names
        title_col: Optional column name for additional marker information

    Returns:
        folium.Map object with all locations marked
    """
    # Initialize the map
    m = folium.Map(location=[0, 0], zoom_start=2)

    # Create a MarkerCluster
    marker_cluster = plugins.MarkerCluster().add_to(m)

    # Keep track of all valid coordinates for setting bounds
    all_coords = []

    # Process each row in the DataFrame
    for idx, row in df.iterrows():
        coordinates = row[coordinates_col]
        places = row[places_col].split(',') if pd.notna(row[places_col]) else []
        title = row[title_col] if title_col and pd.notna(row[title_col]) else None

        # Skip if no coordinates
        if not coordinates:
            continue

        # Add markers for each location
        for i, (coord, place) in enumerate(zip(coordinates, places)):
            if coord is not None:  # Skip None coordinates
                lat, lon = coord
                place_name = place.strip()

                # Create popup content
                popup_content = f"<b>{place_name}</b>"
                if title:
                    popup_content += f"<br>{title}"

                # Add marker
                folium.Marker(
                    location=[lat, lon],
                    popup=popup_content,
                    tooltip=place_name
                ).add_to(marker_cluster)

                all_coords.append([lat, lon])

    # If we have coordinates, fit the map bounds to include all points
    if all_coords:
        m.fit_bounds(all_coords)

    return m

# Create and display the map
map_obj = create_location_map(articles_df)
display(map_obj)