# Sample Map
Geolocation of the samples in project: `[{{ project().name }}]`

## Description
> Map of the strains by sampling location, visualized using folium.

In [None]:
# Load Library
import html
import json
import re
from pathlib import Path

import folium
import pandas as pd
from folium.features import DivIcon
from folium.plugins import MarkerCluster
from jinja2 import Template

import pandas as pd
import re

def dms_to_dd(degrees, minutes, seconds, direction):
    """Convert a degrees/minutes/seconds reading to signed decimal degrees.

    Args:
        degrees: Degrees component. A negative value makes the result
            negative, the same way an ``S``/``W`` direction does.
        minutes: Arc minutes (1/60 of a degree).
        seconds: Arc seconds (1/3600 of a degree).
        direction: Hemisphere marker. ``S`` and ``W`` (any case, surrounding
            whitespace ignored) make the result negative; anything else
            leaves the sign to ``degrees``.

    Returns:
        The coordinate in decimal degrees.
    """
    # Minutes and seconds extend the magnitude away from zero, so they are
    # added to the absolute degrees; adding them to a negative degree value
    # would move the coordinate towards zero instead.
    magnitude = abs(degrees) + (minutes / 60) + (seconds / 3600)
    is_negative = degrees < 0 or str(direction).strip().upper() in ('S', 'W')
    return -magnitude if is_negative else magnitude

def parse_dms(dms_str):
    """Parse a single DMS coordinate such as ``55°28'15.6" N``.

    The string is split on runs of ``°``, ``'``, ``"`` and spaces into
    degrees, minutes, seconds and a direction, which are then handed to
    ``dms_to_dd``.

    Args:
        dms_str: One coordinate (latitude or longitude) in DMS notation.

    Returns:
        Whatever ``dms_to_dd`` returns for the parsed components.

    Raises:
        ValueError: If fewer than four components are present or a numeric
            component cannot be converted.
        TypeError: If ``dms_str`` is not a string (raised by ``re.split``).
    """
    # Extract degrees, minutes, seconds, and direction
    parts = re.split('[°\'" ]+', dms_str)
    # Leading whitespace/symbols produce an empty first token; drop it so the
    # degrees are not read from an empty string.
    if parts and parts[0] == '':
        parts = parts[1:]
    if len(parts) < 4:
        # Raise ValueError (not IndexError) so callers that already handle
        # malformed coordinates via ValueError keep working.
        raise ValueError(f"Invalid DMS coordinate: {dms_str!r}")
    degrees = int(parts[0])
    # float() accepts both whole and decimal minutes.
    minutes = float(parts[1])
    seconds = float(parts[2])
    # Normalise case so 's'/'w' are recognised like 'S'/'W'.
    direction = parts[3].upper()
    return dms_to_dd(degrees, minutes, seconds, direction)

# One decimal-degree value: optional sign, optional fraction, optional degree
# sign and optional hemisphere letter.
_DECIMAL_COORD = r"([+-]?\d+(?:\.\d+)?)\s*°?\s*([NSEWnsew])?"
_DECIMAL_PAIR_RE = re.compile(rf"\s*{_DECIMAL_COORD}\s*[,;/\s]\s*{_DECIMAL_COORD}\s*")

# One DMS value: degrees, minutes, seconds (optionally decimal) and a
# hemisphere letter. '.' and ',' are excluded from the separators so that a
# decimal part is never mistaken for the next component.
_DMS_COORD = r"(\d+)[^\w.,]+(\d+)[^\w.,]+(\d+(?:\.\d+)?)[^\w.,]*([NSEWnsew])"
_DMS_PAIR_RE = re.compile(rf"\s*{_DMS_COORD}[\s,;/]*{_DMS_COORD}\s*")


def _signed_degrees(number, hemisphere):
    """Return ``number`` as a float, negated when ``hemisphere`` is S or W."""
    value = float(number)
    if hemisphere and hemisphere.upper() in ('S', 'W'):
        return -abs(value)
    return value


def convert_coordinates(coordinate_str):
    """Convert a latitude/longitude string to decimal degrees.

    Supported layouts (latitude first, then longitude):

    * decimal degrees, e.g. ``55.471, 10.654`` or ``-33.8688 151.2093``
    * decimal degrees with hemisphere letters, e.g. ``33.8688 S, 151.2093 E``
    * degrees/minutes/seconds, e.g. ``55°47'08.9"N 12°31'23.9"E``

    The fractional part is read as written, so ``55.471`` and ``55.4710``
    both give 55.471 regardless of how many digits follow the point.

    Args:
        coordinate_str: Raw coordinate text.

    Returns:
        A ``(lat, lon)`` tuple in decimal degrees.

    Raises:
        ValueError: If the text matches none of the supported layouts.
        TypeError: If ``coordinate_str`` is not a string.
    """
    decimal = _DECIMAL_PAIR_RE.fullmatch(coordinate_str)
    if decimal:
        lat_text, lat_dir, lon_text, lon_dir = decimal.groups()
        return _signed_degrees(lat_text, lat_dir), _signed_degrees(lon_text, lon_dir)

    dms = _DMS_PAIR_RE.fullmatch(coordinate_str)
    if dms:
        lat_d, lat_m, lat_s, lat_dir, lon_d, lon_m, lon_s, lon_dir = dms.groups()
        # Rebuild each half in the canonical form that parse_dms splits on.
        lat = parse_dms(f"{lat_d}°{lat_m}'{lat_s}\" {lat_dir.upper()}")
        lon = parse_dms(f"{lon_d}°{lon_m}'{lon_s}\" {lon_dir.upper()}")
        return lat, lon

    raise ValueError("Invalid GPS coordinate format")


def create_popup(row, columns, base_url):
    """Render selected fields of a row as an HTML table for a map popup.

    Each entry of ``columns`` becomes one table row with the column name as
    header and ``row[col]`` as value. The ``genome_id`` value is rendered as
    a link to ``{base_url}/{genome_id}`` that opens in a new tab. Column
    names, values and the link target are HTML-escaped so that free-text
    fields cannot break or inject markup into the popup.

    Args:
        row: Mapping-like object indexable by column name (e.g. a DataFrame
            row or a dict).
        columns: Iterable of column names to show, in display order.
        base_url: URL prefix for the ``genome_id`` link.

    Returns:
        The popup content as an HTML string.
    """
    cell_style = "border: 1px solid black; padding: 5px;"
    table_rows = []
    for col in columns:
        label = html.escape(str(col))
        value = html.escape(str(row[col]))
        if col == 'genome_id':
            # Create hyperlink for genome_id; the href sits inside single
            # quotes, so quotes in the target must be escaped as well.
            link = html.escape(f"{base_url}/{row[col]}")
            value = f"<a href='{link}' target='_blank'>{value}</a>"
        table_rows.append(
            f"<tr><th style='{cell_style}'>{label}</th>"
            f"<td style='{cell_style}'>{value}</td></tr>"
        )
    return (
        "<table style='width:100%; border: 1px solid black; border-collapse: collapse;'>"
        + "".join(table_rows)
        + "</table>"
    )

def strainmap(df, outfile,
              antismash_server_base="localhost:8002/antismash/7.1.0",
              popup_columns=('genome_id', 'latitude', 'longitude'),
              tiles="cartodb positron"
             ):
    """Draw a clustered marker map of the samples and save it to ``outfile``.

    The map is centred on the mean latitude/longitude of ``df``. Every row
    becomes one marker whose popup lists ``popup_columns``; the ``genome_id``
    entry links to ``antismash_server_base``.

    Sample DataFrame:
    df = pd.DataFrame({
         'genome_id': ['NBC_00076', 'NBC_00077', 'NBC_00078'],
         'latitude': [55.471, 55.572, 55.673],
         'longitude': [10.654, 10.754, 10.854],
         'location': ['Edinburgh', 'Glasgow', 'Aberdeen'],
         'country': ['Scotland', 'Scotland', 'Scotland']
    })

    Args:
        df: DataFrame with ``latitude`` and ``longitude`` columns plus every
            column named in ``popup_columns``.
        outfile: Path the map is saved to; missing parent directories are
            created.
        antismash_server_base: URL prefix used for the ``genome_id`` link in
            each popup.
        popup_columns: Column names shown in each marker popup.
        tiles: Tile set name passed to ``folium.Map``.

    Returns:
        The ``folium.Map`` object, so a notebook can display it.

    Raises:
        ValueError: If ``df`` has no rows (there is no centre to compute).
    """
    if df.empty:
        raise ValueError("Cannot draw a strain map from an empty DataFrame")

    # Calculate the mean latitude and longitude
    mean_lat = df['latitude'].mean()
    mean_lon = df['longitude'].mean()

    # Initiate map using the calculated mean coordinates as a starting point
    m = folium.Map(location=[mean_lat, mean_lon], zoom_start=4, tiles=tiles)

    # Create a marker cluster
    marker_cluster = MarkerCluster().add_to(m)

    # Fill in data points for the map using marker clusters
    for _, row in df.iterrows():
        # Use the caller-supplied base URL, not a notebook-level global.
        popup_content = create_popup(row, popup_columns, antismash_server_base)
        folium.Marker(
            location=[row['latitude'], row['longitude']],
            popup=folium.Popup(popup_content, max_width=300)
        ).add_to(marker_cluster)

    outfile = Path(outfile)
    outfile.parent.mkdir(exist_ok=True, parents=True)
    m.save(outfile)

    # Display the map
    return m

In [None]:
# Root folder of the processed report for this project
report_dir = Path("../data/processed/G1032_20240208/")

# The antiSMASH version recorded in the report metadata selects the summary table
dependency_version = json.loads((report_dir / "metadata/dependency_versions.json").read_text())
antismash_version = dependency_version["antismash"]

# Load tables with GPS coordinates and join the taxonomy metadata per genome
tables_dir = report_dir / "tables"
df_nbc = pd.read_csv(tables_dir / f"df_antismash_{antismash_version}_summary.csv", low_memory=False)
df_taxa = pd.read_csv(tables_dir / "df_gtdb_meta.csv")
df_nbc = pd.merge(df_nbc, df_taxa, left_on="genome_id", right_on="genome_id")

In [None]:
# Cleaning GPS data into decimal degrees format
converted_data = []
# Walk the two columns in lockstep instead of looking each cell up by index
# label, which is slower and returns a Series when index labels repeat.
for genome_id, coord in zip(df_nbc["genome_id"], df_nbc["gps_coordinates"]):
    # Only text can be parsed; other entries (e.g. NaN for a missing value)
    # are skipped.
    if not isinstance(coord, str):
        continue
    try:
        lat, lon = convert_coordinates(coord)
    except (ValueError, TypeError) as ve:
        # Report and skip unparseable coordinates rather than aborting the run.
        print(f"Error parsing coordinates {coord}: {ve}")
    else:
        converted_data.append([genome_id, lat, lon])

# Convert to DataFrame for better display
df_converted = pd.DataFrame(converted_data, columns=['genome_id', 'latitude', 'longitude'])
# Re-attach the remaining sample metadata to the converted coordinates
df_converted = df_converted.merge(df_nbc, left_on="genome_id", right_on="genome_id")

In [None]:
# Collect the samples whose GPS field is empty and show their key metadata
gps_missing = df_nbc['gps_coordinates'].isna()
df_no_gps = df_nbc[gps_missing]
no_gps_columns = [
    'genome_id',
    'gps_coordinates',
    'location',
    'country',
    'soil_sample_name',
    'description_of_soil_sample',
    'comments',
    'bgcs_count',
    'bgcs_on_contig_edge',
]
df_no_gps.loc[:, no_gps_columns]

In [None]:
# One-sentence summary of how many genomes lack GPS coordinates
n_without_gps = len(df_no_gps)
text = f"There are {n_without_gps} genomes that does not have GPS coordinates."
text

In [None]:
# Columns shown in each marker popup, in display order
# (adjust this list based on your DataFrame)
popup_columns = [
    'genome_id',
    'Genus',
    'Organism',
    'location',
    'country',
    'soil_sample_name',
    'description_of_soil_sample',
    'comments',
    'bgcs_count',
    'bgcs_on_contig_edge',
]

# Base URL for genome ID links
base_url = 'https://nbc.secondarymetabolites.org/nbc/2023-05-22'
# Where the rendered map is written
outfile = Path('../figures') / 'strainmap_G1032_20240208.html'

In [None]:
# Keep only the rows that have a latitude, then draw and save the map
df = df_converted.dropna(subset=['latitude'])
strainmap(df, outfile, popup_columns=popup_columns, antismash_server_base=base_url)

In [None]:
# Write the GPS-annotated summary table, creating the folder if needed
outfile = Path("../tables") / "df_antismash_7.1.0_summary_with_gps.csv"
outfile.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(outfile, index=False)