# Code for Adding Latitude and Longitude and Mapping Reprints

The following notebook enriches lynch_clusters with longitude and latitude data for the cities of their reports. I also put together interactive maps for the clusters. These maps show the location of reprints, the newspaper and its city, the victim name, and the clippings.

Steps in this notebook:
1) Review Viral Texts' place metadata.
2) Get sn codes for lynch_clusters (extracted from 'URL' column)
3) Match sn codes with instances in 'series.csv' then extract 'coverage' data
4) Match coverage with instances in 'places.csv' then extract latitudinal and longitudinal data
5) Map the data with folium.
6) Display the data based on user selection using ipywidgets.

In [None]:
import html
import os
import re
import webbrowser
from pathlib import Path

import folium
import ipywidgets as widgets
import pandas as pd
from folium.plugins import MarkerCluster
from IPython.display import display

1) Review Viral Texts' place metadata:

In [None]:
# Places table from the Viral Texts newspaper-metadata repository; its
# 'coverage', 'lon' and 'lat' columns are used later for geocoding.
places_csv_url = 'https://raw.githubusercontent.com/ViralTexts/newspaper-metadata/main/places.csv'
lat_long_df = pd.read_csv(places_csv_url)

In [None]:
# Series table from the Viral Texts newspaper-metadata repository; its
# 'series' and 'coverage' columns are used later to link sn codes to places.
series_csv_url = 'https://raw.githubusercontent.com/ViralTexts/newspaper-metadata/main/series.csv'
series_df = pd.read_csv(series_csv_url)

In [None]:
# Preview the first five rows of the places table.
lat_long_df.head(n=5)

In [None]:
# Preview the first five rows of the series table.
series_df.head(n=5)

2) Get sn codes for lynch_clusters (extracted from 'URL' column):

In [None]:
# extract sn codes from URL column

directory = '/Users/matthewkollmer/PycharmProjects/nlp_research/vrt_work/lynch_clusters_02_refined'

sn_code_regex = re.compile(r'sn\d{8}')


def extract_sn_code(url):
    """Return the '/lccn/'-prefixed sn code found in a URL.

    Args:
        url: A value from the 'URL' column; either a string or a missing value.

    Returns:
        '/lccn/' followed by the first 'sn' + eight-digit match in ``url``,
        or None when ``url`` is missing or contains no such code.
    """
    if pd.isna(url):
        return None
    sn_match = sn_code_regex.search(url)
    return '/lccn/' + sn_match.group(0) if sn_match else None


for filename in os.listdir(directory):
    # Each file is rewritten in place, so only touch the cluster CSVs and
    # leave any other directory entry (e.g. .DS_Store) alone.
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(directory, filename)
    df = pd.read_csv(file_path)

    # The '/lccn/' prefix is added so the values can be compared directly
    # with series_df['series'] in the next step.
    df['sn_code'] = [extract_sn_code(url) for url in df['URL']]

    df.to_csv(file_path, index=False)
    print(f'Updated {filename} with sn codes.')

3) Match sn codes with instances in 'series.csv' then extract 'coverage' data:

In [None]:
# match sn codes between 'series' dataframe and victim csv files

# Build the sn code -> coverage lookup once, instead of scanning every victim
# file once per row of series_df. Rows without a series id can never match,
# and when a series id is repeated the last row wins (as it did when the
# rows were applied one after another).
coverage_by_series = (
    series_df
    .dropna(subset=['series'])
    .drop_duplicates(subset='series', keep='last')
    .set_index('series')['coverage']
)

for filename in os.listdir(directory):
    # Each file is rewritten in place, so only touch the cluster CSVs and
    # leave any other directory entry (e.g. .DS_Store) alone.
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(directory, filename)
    df = pd.read_csv(file_path)

    # sn codes with no entry in the lookup become missing values, which are
    # written to the CSV as empty cells.
    df['coverage'] = df['sn_code'].map(coverage_by_series)

    df.to_csv(file_path, index=False)
    print(f'Updated {filename} with coverage links.')

In [None]:
# add reprint date columns

date_regex = re.compile(r'\d{4}-\d{2}-\d{2}')


def extract_reprint_date(url):
    """Return the first YYYY-MM-DD-shaped substring of a URL.

    Args:
        url: A value from the 'URL' column; either a string or a missing value.

    Returns:
        The matched date text, or None when ``url`` is missing or contains
        no such substring.
    """
    if pd.isna(url):
        return None
    date_match = date_regex.search(url)
    return date_match.group(0) if date_match else None


for filename in os.listdir(directory):
    # Each file is rewritten in place, so only touch the cluster CSVs and
    # leave any other directory entry (e.g. .DS_Store) alone.
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(directory, filename)
    df = pd.read_csv(file_path)

    df['reprint_date'] = [extract_reprint_date(url) for url in df['URL']]

    # errors='coerce' turns text that has the right shape but is not a real
    # calendar date (and missing values) into NaT instead of raising.
    df['reprint_date'] = pd.to_datetime(df['reprint_date'], format='%Y-%m-%d', errors='coerce')

    df.to_csv(file_path, index=False)

    print(f'Updated {filename} with reprint dates.')

In [None]:
# add Newspaper Title column

# Matches a leading run of word characters, asterisks, whitespace and periods
# that ends in a word character.
# NOTE(review): the '*' inside the character class is a literal asterisk --
# confirm that this is intended.
paper_title_regex = re.compile(r'^[\w*\s.]*\w')


def extract_newspaper_title(link_title):
    """Return the leading newspaper-title portion of a 'Link Title' value.

    Args:
        link_title: A value from the 'Link Title' column; either a string or
            a missing value.

    Returns:
        The text matched by ``paper_title_regex`` at the start of
        ``link_title``, or None when the value is missing or does not match.
    """
    if pd.isna(link_title):
        return None
    paper_match = paper_title_regex.search(link_title)
    return paper_match.group(0) if paper_match else None


for filename in os.listdir(directory):
    # Each file is rewritten in place, so only touch the cluster CSVs and
    # leave any other directory entry (e.g. .DS_Store) alone.
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(directory, filename)
    df = pd.read_csv(file_path)

    df['newspaper'] = [extract_newspaper_title(title) for title in df['Link Title']]

    df.to_csv(file_path, index=False)

    print(f'Updated {filename} with newspaper titles.')

In [None]:
# Load a single cluster file from the working directory for inspection.
test_df = pd.read_csv(os.path.join(directory, 'aaron_thomas.csv'))

test_df

4) Match coverage with instances in 'places.csv' then extract latitudinal and longitudinal data:

In [None]:
# match 'coverage' column in victim csv files to 'coverage' column in 'places.csv'/lat_long_df and transfer lat/long data to victim csv files

# Build the coverage -> coordinates lookup once, instead of scanning every
# victim file once per row of lat_long_df. Rows without a coverage link can
# never match, and when a coverage link is repeated the last row wins (as it
# did when the rows were applied one after another).
coordinates_by_coverage = (
    lat_long_df
    .dropna(subset=['coverage'])
    .drop_duplicates(subset='coverage', keep='last')
    .set_index('coverage')[['lon', 'lat']]
)

for filename in os.listdir(directory):
    # Each file is rewritten in place, so only touch the cluster CSVs and
    # leave any other directory entry (e.g. .DS_Store) alone.
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(directory, filename)
    df = pd.read_csv(file_path)

    # Coverage links with no entry in the lookup become missing values,
    # which are written to the CSV as empty cells.
    df['longitude'] = df['coverage'].map(coordinates_by_coverage['lon'])
    df['latitude'] = df['coverage'].map(coordinates_by_coverage['lat'])

    df.to_csv(file_path, index=False)
    print(f'Updated {filename} with longitude and latitude data.')

5) Map the data with folium.

I spent way too much time on this. But it works! And it's customized so the cluster icons are dark red.

In [None]:
output_directory = '/Users/matthewkollmer/PycharmProjects/nlp_research/vrt_work/lynch_cluster_maps/'

def build_maps(file_path):
    """Render one cluster CSV as an interactive folium map saved as HTML.

    Every row that has both a latitude and a longitude becomes a dark red
    circle marker inside a marker cluster. Each marker's popup shows the
    victim, newspaper, reprint date, a link to the page and the clipping.

    Args:
        file_path: Path to a CSV with 'latitude', 'longitude', 'victim',
            'newspaper', 'reprint_date', 'URL' and 'clippings' columns.

    Returns:
        None. The map is written to ``output_directory`` under the CSV's
        base name with an '.html' extension, and a confirmation is printed.
    """
    file_name = os.path.basename(file_path)

    df = pd.read_csv(file_path)

    # Rows without coordinates cannot be placed on the map.
    df = df.dropna(subset=['latitude', 'longitude'])

    map_center = [39.8283, -98.5795]  # this is roughly the central location of the USA. Googled it.

    folium_map = folium.Map(location=map_center, zoom_start=3, tiles="CartoDB positron")

    # Custom cluster icon: a dark red background whose opacity depends on the
    # number of markers in the cluster (.4 below 4, .6 below 10, otherwise 1).
    marker_cluster = MarkerCluster(
        icon_create_function="""
        function (cluster) {
            var count = cluster.getChildCount();
            var size = 'small';
            var iconSize = L.point(1, 1);

            var color = 'rgba(139, 0, 0,';

            if (count < 4) {
                color += '.4)';
            } else if (count < 10) {
                color += '.6)';
            } else {
                color += '1)';
            }

            return L.divIcon({
                html: '<div style="background-color:' + color + '; color:white;"><span style="color:white;">' + count + '</span></div>',
                className: 'marker-cluster marker-cluster-' + size,
                iconSize: iconSize
            });
        }
        """
    ).add_to(folium_map)

    for _, row in df.iterrows():
        latitude = row['latitude']
        longitude = row['longitude']

        # These values come straight from the CSV and are interpolated into
        # HTML, so escape them: a stray '<', '&' or '"' in a clipping or URL
        # would otherwise break the popup markup or the href attribute.
        victim = html.escape(str(row['victim']))
        newspaper = html.escape(str(row['newspaper']))
        reprint_date = html.escape(str(row['reprint_date']))
        url = html.escape(str(row['URL']))
        clipping = html.escape(str(row['clippings']))

        url_hyperlink = f'<a href="{url}" target="_blank">Read the full page</a>'

        popup_text = f'Victim: {victim}<br><br>Newspaper: {newspaper}<br><br>Reprint Date: {reprint_date}<br><br>{url_hyperlink}<br><br>Newspaper Clipping: <br><br>{clipping}'

        folium.CircleMarker(
            location=[latitude, longitude],
            radius=4,
            color='darkred',
            popup=folium.Popup(popup_text, max_width=300)
        ).add_to(marker_cluster)

    # Saving fails if the target folder is missing, so create it on demand.
    os.makedirs(output_directory, exist_ok=True)

    # Swap only the extension; str.replace would also rewrite a '.csv' that
    # appears elsewhere in the file name.
    output_html = os.path.join(output_directory, os.path.splitext(file_name)[0] + '.html')
    folium_map.save(output_html)
    print(f'{file_name} map saved to {output_html}')

# Collect the cluster CSVs, then render one map per file.
files = []
for entry in os.listdir(directory):
    if entry.endswith('.csv'):
        files.append(entry)

for file_name in files:
    build_maps(os.path.join(directory, file_name))

6) Display the data based on user selection using ipywidgets.

In [None]:
map_directory = '/Users/matthewkollmer/PycharmProjects/nlp_research/vrt_work/lynch_cluster_maps/'

def open_map_in_browser(file_name):
    """Open a saved map from ``map_directory`` with ``webbrowser.open``.

    Args:
        file_name: Name of an HTML file inside ``map_directory``.
    """
    file_path = os.path.join(map_directory, file_name)
    # as_uri() builds a percent-encoded file:// URL, so names containing
    # spaces or other special characters still open correctly.
    webbrowser.open(Path(file_path).resolve().as_uri())

# Sorted so the dropdown order does not depend on the directory listing order.
list_of_maps = sorted(f for f in os.listdir(map_directory) if f.endswith('.html'))

# The observer only runs when the selected value changes, so the dropdown
# starts on an empty placeholder entry; otherwise the pre-selected first map
# could not be opened without first picking a different one.
dropdown = widgets.Dropdown(
    options=[('-- choose a map --', '')] + [(name, name) for name in list_of_maps],
    value='',
    description='Select Map:',
    disabled=False,
)

def open_sesame(change):
    """Open the newly selected map; ignore the empty placeholder entry.

    Args:
        change: The change dictionary passed by ``dropdown.observe``; its
            'new' entry holds the selected file name.
    """
    if change['new']:
        open_map_in_browser(change['new'])

dropdown.observe(open_sesame, names='value')

display(dropdown)

In [None]:
# Emit one HTML <option> tag per saved map; entries that are not .html files
# are ignored.
for map_file_name in os.listdir(map_directory):
    if map_file_name.endswith('.html'):
        print(f'<option value="https://matthewkollmer.com/{map_file_name}">{map_file_name}</option>')