# Visualizing the data
In this notebook we will create several visualizations based on the curated metadata. <br>
The goal is to start providing context to the database and to investigate the distribution of fermented foods and their microbiomes. <br>
The data will not reflect the true abundance of microbes in different foods/areas, since it is based on presence/absence counts from the metadata table. <br>

In [None]:
# importing standard libraries
import pandas as pd
import numpy as np

# importing plotly libraries for dynamic visualization
# make sure plotly is installed in your environment
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly.figure_factory as ff
import plotly.subplots as sp
import plotly.offline as pyo
import plotly.tools as tls
import plotly


In [3]:
# Load the curated MAG metadata table (the path is relative to the working directory)
metadata_csv = 'data/Food_MAGs_curated_metadata_250421_corrected_merged_final_v2.csv'
dataset = pd.read_csv(metadata_csv)

# Preview the first rows together with the (n_rows, n_columns) shape
display(dataset.head(), dataset.shape)

Unnamed: 0,mag_id,sample_description,fermented_food,specific_substrate,substrate_category,general_category,completeness,contamination,contigs,total_length,...,main_ferment,acid_type,acid_level,alcohol_lovel,protein_degradation,fat_degradation,added_ingredients,fermentation_temp,aging_time,agin_temp
0,C-03.Ssa-BR,raw canastra cheese,cheese,milk,dairy,fermented_dairy,97.97,1.1,182,1896140,...,acid; amino acids,lactic,medium,none,high,medium,salt; rennet,mesophilic,>1 month,cold
1,C-R02.bin.1,raw canastra cheese,cheese,milk,dairy,fermented_dairy,92.75,1.03,84,3174852,...,acid; amino acids,lactic,medium,none,high,medium,salt; rennet,mesophilic,>1 month,cold
2,C-R03.bin.7,raw canastra cheese,cheese,milk,dairy,fermented_dairy,98.91,0.0,161,2047554,...,acid; amino acids,lactic,medium,none,high,medium,salt; rennet,mesophilic,>1 month,cold
3,C-R06.bin.10,raw araxa cheese,cheese,milk,dairy,fermented_dairy,91.62,1.35,147,3636195,...,acid; amino acids,lactic,medium,none,medium,low,salt; rennet,mesophilic,>1 month,cold
4,C-R06.bin.12,raw araxa cheese,cheese,milk,dairy,fermented_dairy,100.0,0.64,54,3940686,...,acid; amino acids,lactic,medium,none,medium,low,salt; rennet,mesophilic,>1 month,cold


(13858, 45)

In [34]:
# Visualizing the number of unique samples (sample_accession) per country.

# Keep one row per distinct (sample_accession, country) pair so that a sample
# contributing several MAGs is only counted once.
unique_samples = dataset[['sample_accession', 'country']].drop_duplicates()

# Count samples per country. value_counts already returns the counts in
# descending order, so no additional sort is required.
country_counts = unique_samples.value_counts('country').reset_index()
country_counts.columns = ['country', 'count']

# Remove entries whose country value is a stringified list (contains '[').
# regex=False matches the bracket literally, which avoids the invalid '\['
# escape sequence; .copy() yields an independent frame so that later cells can
# add columns without triggering SettingWithCopyWarning.
is_list_value = country_counts['country'].str.contains('[', regex=False, na=False)
country_counts = country_counts.loc[~is_list_value].copy()

# Horizontal bar plot; category_orders pins the y-axis to the count ordering
# computed above.
fig = px.bar(
    country_counts,
    y='country',
    x='count',
    orientation='h',
    title="Number of unique samples per country",
    category_orders={'country': country_counts['country'].tolist()},
    height=800,
    width=800,
)

fig.update_layout(
    xaxis_title="Number of unique samples",
    yaxis_title="Country",
    yaxis=dict(tickfont=dict(size=8)),  # small font so that all country labels fit
)

fig.update_traces(
    marker_color='blue',
    marker_line_color='black',
    marker_line_width=1.5,
    opacity=0.6,
)
fig.show()

In [43]:
# Map each country label found in the metadata to its continent.
# NOTE(review): several keys are non-standard spellings ('Genrmany', 'hilippines',
# 'Boliva', 'Korean', 'Turkey ', 'Belgium:Antwerp'); presumably they mirror raw
# values in the metadata table - verify against the data. They are kept as-is,
# and the correctly spelled 'Philippines' and 'Bolivia' are added alongside them.
continent_map = {
    'Italy': 'Europe', 'Ireland': 'Europe', 'Spain': 'Europe', 'Austria': 'Europe',
    'USA': 'North America', 'United Kingdom': 'Europe', 'Canada': 'North America',
    'Genrmany': 'Europe', 'France': 'Europe', 'China': 'Asia', 'Nigeria': 'Africa',
    'Australia': 'Oceania', 'Mexico': 'North America', 'Belgium': 'Europe',
    'Taiwan': 'Asia', 'India': 'Asia', 'Brazil': 'South America', 'Ghana': 'Africa',
    'Denmark': 'Europe', 'Singapore': 'Asia', 'Greece': 'Europe', 'Malaysia': 'Asia',
    'Norway': 'Europe', 'Turkey': 'Asia', 'Saudi Arabia': 'Asia',
    'South Africa': 'Africa', 'Bulgaria': 'Europe', 'Russia': 'Europe',
    'Indonesia': 'Asia', 'Ecuador': 'South America', 'New Zealand': 'Oceania',
    'Finland': 'Europe', 'Korea': 'Asia', 'Colombia': 'South America',
    'Hong Kong': 'Asia', 'Estonia': 'Europe', 'hilippines': 'Asia',
    'Burkina Faso': 'Africa', 'Sweden': 'Europe', 'Tunisia': 'Africa',
    'Aland Islands': 'Europe', 'Croatia': 'Europe', 'Israel': 'Asia',
    'Lebanon': 'Asia', 'Portugal': 'Europe', 'Kenya': 'Africa',
    'Boliva': 'South America', 'Switzerland': 'Europe', 'Ethiopia': 'Africa',
    'Benin': 'Africa', 'Japan': 'Asia', 'Germany': 'Europe', 'Thailand': 'Asia',
    'Myanmar': 'Asia', 'Korean': 'Asia', 'Turkey ': 'Asia', 'Belgium:Antwerp': 'Europe',
    'Philippines': 'Asia', 'Bolivia': 'South America',
}

# Add the continent column. assign() builds a new frame instead of writing into
# the existing one, so the cell is idempotent and raises no SettingWithCopyWarning.
# Countries absent from the mapping are labelled 'Unknown' rather than being left
# with a missing continent.
country_counts = country_counts.assign(
    continent=country_counts['country'].map(continent_map).fillna('Unknown')
)

# Bubble map: one marker per country, sized by sample count, coloured by continent.
# The projection is set once in the layout below (equirectangular).
fig = px.scatter_geo(
    country_counts,
    locations='country',
    locationmode='country names',
    size='count',
    color='continent',
    hover_name='country',
    hover_data={'count': True, 'continent': True},
    title='Number of Samples per Country',
)

fig.update_traces(marker=dict(opacity=0.6, line=dict(width=1, color='black')))

fig.update_layout(
    title_x=0.5,
    geo=dict(
        showframe=True,
        showcoastlines=True,
        projection_type='equirectangular',
        scope='world',  # initial view; the dropdown below switches the scope
        showland=True,
        showcountries=True,
        landcolor='rgb(243, 243, 243)',
        countrycolor='rgb(204, 204, 204)',
        visible=True,  # keep the base map layers displayed
    ),
    width=1000,
    height=600,
)

# Dropdown that re-scopes the map to a region. Each button label is the
# title-cased form of the geo scope it applies.
region_scopes = ['world', 'europe', 'asia', 'africa', 'north america', 'south america']
fig.update_layout(
    updatemenus=[{
        'buttons': [
            {'args': [{'geo.scope': scope}], 'label': scope.title(), 'method': 'relayout'}
            for scope in region_scopes
        ],
        'direction': 'down',
        'showactive': True,
        'x': 1.2,
        'y': 0.6,  # placed below the typical legend position
    }]
)

fig.show()