In [None]:
#plot vOTU absence/presence heatmap across continents
import os
import gzip
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from geopy.geocoders import Nominatim
from scipy.cluster.hierarchy import linkage, leaves_list

# Function to classify continent based on latitude and longitude
def get_continent(latitude, longitude):
    """Classify a coordinate into a continent using coarse bounding boxes.

    The boxes are tested in a fixed order (North America, South America,
    Europe, Asia, Africa) and the first match wins, so where two boxes
    overlap the earlier one takes precedence.

    Parameters
    ----------
    latitude : float
        Latitude in decimal degrees.
    longitude : float
        Longitude in decimal degrees.

    Returns
    -------
    str
        One of "North America", "South America", "Europe", "Asia", "Africa",
        or "Continent not recognized" when no box matches. NaN coordinates
        fail every comparison and therefore also yield
        "Continent not recognized".

    Notes
    -----
    NOTE(review): the boxes are rough approximations. The Africa box only
    spans longitudes 15 <= lon < 45 and latitudes >= -30, and any point with
    latitude >= 30 in that longitude range is already claimed by the Europe
    box. No box covers Oceania or Antarctica. Confirm that these limits are
    acceptable for the analysis.
    """
    # North America
    if 23.0 < latitude <= 60.0 and -135.0 <= longitude <= -60.0:
        return "North America"

    # South America
    if -56.0 <= latitude <= 15.0 and -90.0 <= longitude <= -30.0:
        return "South America"

    # Europe (checked before Africa, so it wins where the two boxes overlap)
    if 30.0 <= latitude <= 85.0 and -15.0 < longitude < 45.0:
        return "Europe"

    # Asia
    if 0.0 <= latitude <= 77.0 and 45.0 <= longitude <= 165.0:
        return "Asia"

    # Africa
    if -30.0 <= latitude < 60.0 and 15.0 <= longitude < 45.0:
        return "Africa"

    # Not within any of the above bounds
    return "Continent not recognized"


# Function to extract vOTUs from filenames in a given folder
def extract_votus(folder_path):
    """Collect vOTU identifiers and their counts from ``.fna`` filenames.

    Every directory entry whose name ends with ``.fna`` is split on ``'_'``.
    The first field is taken as the vOTU identifier. The second field, cut at
    its first ``'.'`` and with its leading character dropped, is parsed as an
    integer (presumably filenames look like ``<vOTU>_n<count>.fna`` — confirm
    against the actual folder contents).

    Parameters
    ----------
    folder_path : str
        Directory to scan (not recursive).

    Returns
    -------
    tuple[list[str], list[int]]
        Two parallel lists ``(votus, ns)`` in the order returned by
        ``os.listdir``.

    Raises
    ------
    IndexError
        If a ``.fna`` filename contains no underscore.
    ValueError
        If the count portion of a filename is not an integer.
    """
    votus, ns = [], []

    for entry in os.listdir(folder_path):
        # Only FASTA files carry a vOTU name
        if not entry.endswith(".fna"):
            continue

        fields = entry.split('_')
        votu_id = fields[0]
        # Second field up to the first dot, minus its one-character prefix
        count_text = fields[1].split('.')[0][1:]

        votus.append(votu_id)
        ns.append(int(count_text))

    return votus, ns

# Path to the folder containing the vOTU .fna files
# NOTE(review): absolute, machine-specific path — consider moving it to a config cell.
folder_path = "/Users/jferrare/Documents/Good Lab Work/Gut Phage/January_2025/large_vOTUs"
votu_list, ns_list = extract_votus(folder_path)

# Continents shown as heatmap rows, in display order. Used for both the
# per-vOTU initialisation and the row index so the two cannot drift apart.
HEATMAP_CONTINENTS = ['North America', 'South America', 'Europe', 'Asia', 'Africa']

# Load the sequence-to-vOTU mapping from a gzipped, tab-separated file:
# column 0 is the sequence id, column 1 the vOTU; the header line starting
# with 'uhgv_genome' is skipped.
# NOTE: `meta_data` (and `sequences`, `lats`, `longs` below) are not defined in
# this cell and must come from an earlier cell.
sequence_to_votu = {}
with gzip.open(meta_data, 'rt') as f:
    for line in f:
        if line.startswith('uhgv_genome'):
            continue
        parts = line.strip().split('\t')
        sequence_to_votu[parts[0]] = parts[1]


# Initialize a binary presence flag per continent for every vOTU of interest
votu_continents = {
    votu: {continent: 0 for continent in HEATMAP_CONTINENTS}
    for votu in votu_list
}

# Mark a vOTU as present on a continent as soon as one of its sequences maps there.
# Membership is tested against the dict keys (the same vOTUs as votu_list) to
# avoid a linear list scan for every sequence.
for seq, lat, lon in zip(sequences, lats, longs):
    votu = sequence_to_votu[seq]  # raises KeyError for a sequence missing from the mapping
    continent = get_continent(lat, lon)
    if continent in HEATMAP_CONTINENTS and votu in votu_continents:
        votu_continents[votu][continent] = 1

# Build the heatmap matrix: one row per continent, one column per vOTU
heatmap_data = [
    [votu_continents[votu][continent] for votu in votu_list]
    for continent in HEATMAP_CONTINENTS
]

# Convert the matrix into a DataFrame for easier visualization
heatmap_df = pd.DataFrame(heatmap_data, columns=votu_list, index=HEATMAP_CONTINENTS)

# Hierarchically cluster vOTUs (columns) by their continent presence profile
votu_similarity = linkage(heatmap_df.T, method='ward', metric='euclidean')
ordered_votus = heatmap_df.columns[leaves_list(votu_similarity)].to_list()

# Reorder the DataFrame columns based on the clustering result
heatmap_df = heatmap_df[ordered_votus]

# Build x-axis labels that include each vOTU's n value.
# A dict lookup replaces repeated list.index() scans; setdefault keeps the
# first occurrence, matching what list.index() would have returned.
n_by_votu = {}
for votu, n in zip(votu_list, ns_list):
    n_by_votu.setdefault(votu, n)
ordered_ns = [n_by_votu[votu] for votu in ordered_votus]
ordered_votu_labels = [f"{votu} (n={n})" for votu, n in zip(ordered_votus, ordered_ns)]
print(len(ordered_votu_labels))

# Create the heatmap
plt.figure(figsize=(20, 2))
sns.heatmap(heatmap_df, cmap='binary', cbar=False, linecolor='red', annot=False)

# Set axis labels (tight_layout() below recomputes the margins afterwards)
plt.subplots_adjust(bottom=.5)
plt.xlabel("vOTUs")
plt.ylabel("Continents")

# Label every column with its vOTU and n value. Heatmap cell i spans [i, i + 1],
# so ticks go at i + 0.5 to sit under the cell centres rather than on the cell
# edges. The small font size is set in the same call so it applies to the
# final labels.
plt.xticks(
    ticks=np.arange(len(ordered_votu_labels)) + 0.5,
    labels=ordered_votu_labels,
    rotation=90,
    fontsize=3,
)

# Show the heatmap
plt.tight_layout()
plt.show()



In [None]:
# Create an HTML proportional-symbol map of vOTU presence across the globe
import folium
import pandas as pd
from collections import Counter

# Pair up the coordinates of every sequence.
# NOTE: `lats` and `longs` are not defined in this cell and must come from an earlier cell.
z = list(zip(lats, longs))

# Count how many sequences share each exact (latitude, longitude) pair
counts = Counter(z)

# One [latitude, longitude, count] record per unique location
lat_long = [[lat, lon, count] for (lat, lon), count in counts.items()]

# Convert the records into a DataFrame
df = pd.DataFrame(lat_long, columns=['Latitude', 'Longitude', 'Count'])
# Drop the exact (0, 0) location (presumably a placeholder for missing coordinates — confirm)
df = df[(df['Latitude'] != 0) | (df['Longitude'] != 0)]
print(df)

# Create a base map centered globally (around [0, 0]), zoomed out to show the whole globe
m = folium.Map(location=[0, 0], zoom_start=1)

# Marker radius per counted sequence (adjust to make the circles larger or smaller)
size_factor = 0.002

# Add one circle marker per location.
# itertuples() keeps each column's own dtype; iterrows() would upcast the integer
# Count to float alongside the float coordinates and the popup would read "Count: 5.0".
for lat, lon, count in df.itertuples(index=False, name=None):
    count = int(count)
    folium.CircleMarker(
        location=[lat, lon],
        radius=count * size_factor,  # Radius proportional to count
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.6,
        popup=f"Count: {count}"
    ).add_to(m)


# Save the map to an HTML file
# NOTE(review): absolute, machine-specific output path.
m.save('/Users/jferrare/Downloads/proportional_map.html')

