In [None]:
# Import required libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from itertools import combinations
import networkx as nx
import numpy as np

# Seed NumPy's global random generator so NumPy-driven randomness is repeatable
np.random.seed(42)

# Step 1: Load the dataset
dataset_path = '/kaggle/input/ednaaaaaaa/edna new merged data.xlsx'

# Fail fast (with diagnostics) when the workbook is missing, so none of the
# plotting steps below run without `df`.
if not os.path.exists(dataset_path):
    print(f"File not found at {dataset_path}. Please check the dataset path.")
    input_root = '/kaggle/input'
    # The directory listing is only a hint. If the input root itself is absent,
    # os.listdir would raise its own error and mask the intended one below.
    if os.path.isdir(input_root):
        print("Available files in /kaggle/input:")
        print(os.listdir(input_root))
    else:
        print(f"Input directory {input_root} does not exist.")
    raise FileNotFoundError("Stopping execution due to missing dataset.")

df = pd.read_excel(dataset_path)
print("Dataset loaded successfully. First few rows:")
print(df.head())

# Step 2: Verify required columns for the plots
required_columns = [
    'env_broad_scale',
    'target_gene',
    'pcr_primer_name_forward',
    'scientificName',
    'eventDate',
    'sampleSizeValue',
    'locality',
    'seq_meth',
    'decimalLatitude',
]
# Keep the absent names in the same order as they are listed above.
missing_columns = list(filter(lambda name: name not in df.columns, required_columns))

# This is only a warning: each plot re-checks the columns it needs.
if not missing_columns:
    print("All required columns are present.")
else:
    print(f"Warning: Missing columns {missing_columns}. Some plots may not be generated.")

# Step 3: Create a 40% subset of the data for all plots
subset_fraction = 0.4
df_subset = df.sample(
    random_state=42,  # fixed seed: the same rows are drawn on every run
    frac=subset_fraction,
)
print(f"Using a {int(subset_fraction*100)}% subset of the data for all plots.")

# Step 4: Plot 1 - Species Diversity Across Environmental Scales
if not {'env_broad_scale', 'scientificName', 'locality'}.issubset(df.columns):
    print("Skipping Plot 1: Required columns (env_broad_scale, scientificName, locality) missing.")
else:
    # Number of distinct species names recorded at each locality.
    species_diversity = (
        df_subset.groupby('locality')['scientificName']
        .nunique()
        .reset_index(name='species_count')
    )
    # Attach each locality's count to every distinct (locality, env_broad_scale) pair.
    locality_env_pairs = df_subset[['locality', 'env_broad_scale']].drop_duplicates()
    env_diversity = locality_env_pairs.merge(species_diversity, on='locality')

    plt.figure(figsize=(10, 6))
    sns.boxplot(data=env_diversity, x='env_broad_scale', y='species_count')
    plt.xticks(rotation=45)
    plt.title(f"Species Diversity Across Environmental Scales ({int(subset_fraction*100)}% Subset)")
    plt.xlabel("Environmental Scale")
    plt.ylabel("Species Count")
    plt.show()

# Step 5: Plot 2 - Primer Effectiveness by Taxonomic Group
if 'pcr_primer_name_forward' not in df.columns or 'scientificName' not in df.columns:
    print("Skipping Plot 2: Required columns (pcr_primer_name_forward, scientificName) missing.")
else:
    # Row count for every (forward primer, scientific name) combination.
    primer_taxa = pd.crosstab(
        index=df_subset['pcr_primer_name_forward'],
        columns=df_subset['scientificName'],
    )
    plt.figure(figsize=(12, 6))
    # fmt='d' prints the annotated cell counts as integers.
    sns.heatmap(primer_taxa, annot=True, fmt='d', cmap="YlGnBu")
    plt.title(f"Primer Effectiveness by Taxonomic Group ({int(subset_fraction*100)}% Subset)")
    plt.xlabel("Taxonomic Group")
    plt.ylabel("Primer Name")
    plt.show()

# Step 6: Plot 3 - Species Presence Over Time
if not {'eventDate', 'scientificName'}.issubset(df.columns):
    print("Skipping Plot 3: Required columns (eventDate, scientificName) missing.")
else:
    # Distinct species names per eventDate value, ordered by the groupby key.
    species_by_date = df_subset.groupby('eventDate')['scientificName']
    temporal_counts = species_by_date.nunique()

    plt.figure(figsize=(10, 6))
    temporal_counts.plot.line()
    plt.title(f"Species Presence Over Time ({int(subset_fraction*100)}% Subset)")
    plt.xlabel("Date")
    plt.ylabel("Unique Species Count")
    plt.xticks(rotation=45)
    plt.show()

# Step 7: Plot 4 - Sample Size vs. Species Detection Sensitivity
if not ('sampleSizeValue' in df.columns and 'scientificName' in df.columns):
    print("Skipping Plot 4: Required columns (sampleSizeValue, scientificName) missing.")
else:
    # One point per distinct sample size: how many distinct species names it yielded.
    sample_species_counts = (
        df_subset
        .groupby('sampleSizeValue')['scientificName']
        .nunique()
    )
    plt.figure(figsize=(10, 6))
    # Scatter plot with a fitted regression line.
    sns.regplot(
        x=sample_species_counts.index,
        y=sample_species_counts.values,
    )
    plt.title(f"Sample Size vs. Species Detection Sensitivity ({int(subset_fraction*100)}% Subset)")
    plt.xlabel("Sample Size")
    plt.ylabel("Species Detection Count")
    plt.show()

# Step 8: Plot 5 - Species Co-occurrence Network
if 'locality' in df.columns and 'scientificName' in df.columns:
    # Drop rows without a species name first. Series.unique() keeps NaN, which
    # would otherwise become a spurious "nan" node linked to every species
    # recorded at the same locality.
    named_records = df_subset.dropna(subset=['scientificName'])
    # One edge for every unordered pair of distinct species sharing a locality.
    # Repeated pairs collapse into a single edge once added to the graph.
    pairs = [(a, b) for _, group in named_records.groupby('locality')
             for a, b in combinations(group['scientificName'].unique(), 2)]
    G = nx.Graph()
    G.add_edges_from(pairs)
    plt.figure(figsize=(12, 8))
    nx.draw_networkx(G, with_labels=True, node_size=50, font_size=8)
    plt.title(f"Species Co-occurrence Network ({int(subset_fraction*100)}% Subset)")
    plt.show()
else:
    print("Skipping Plot 5: Required columns (locality, scientificName) missing.")

# Step 9: Plot 6 - Detection Bias by Sequencing Method
if 'seq_meth' not in df.columns or 'scientificName' not in df.columns:
    print("Skipping Plot 6: Required columns (seq_meth, scientificName) missing.")
else:
    # Row count for every (sequencing method, scientific name) combination.
    detection_bias = pd.crosstab(
        index=df_subset['seq_meth'],
        columns=df_subset['scientificName'],
    )
    plt.figure(figsize=(12, 6))
    # fmt='d' prints the annotated cell counts as integers.
    sns.heatmap(detection_bias, annot=True, fmt='d', cmap="YlOrBr")
    plt.title(f"Detection Bias by Sequencing Method ({int(subset_fraction*100)}% Subset)")
    plt.xlabel("Taxonomic Group")
    plt.ylabel("Sequencing Method")
    plt.show()

# Step 10: Plot 7 - Species Richness by Taxonomic Group
if 'scientificName' not in df.columns:
    print("Skipping Plot 7: Required column (scientificName) missing.")
else:
    plt.figure(figsize=(12, 6))
    # Horizontal bars, ordered by value_counts (most frequent name first).
    sns.countplot(
        y='scientificName',
        data=df_subset,
        order=df_subset['scientificName'].value_counts().index,
    )
    plt.title(f"Species Richness by Taxonomic Group ({int(subset_fraction*100)}% Subset)")
    plt.xlabel("Count")
    plt.ylabel("Taxonomic Group")
    plt.show()

# Step 11: Plot 8 - Species Diversity Across Latitudes
if 'decimalLatitude' in df.columns and 'scientificName' in df.columns:
    # Define bin size for latitudes
    bin_size = 5  # degrees
    # Create bins based on the subset's latitude range
    min_lat = df_subset['decimalLatitude'].min()
    max_lat = df_subset['decimalLatitude'].max()
    if pd.isna(min_lat) or pd.isna(max_lat):
        # With no usable latitude values, the bin edges below cannot be computed.
        print("Skipping Plot 8: Species Diversity Across Latitudes - no valid decimalLatitude values in the subset.")
    else:
        # Snap the observed range outward to multiples of bin_size.
        lower_edge = np.floor(min_lat / bin_size) * bin_size
        upper_edge = np.ceil(max_lat / bin_size) * bin_size
        if upper_edge <= lower_edge:
            # The range collapsed onto one multiple of bin_size (e.g. every
            # latitude is exactly 0). Widen it so there are two edges.
            upper_edge = lower_edge + bin_size
        # The extra bin_size on `stop` makes np.arange include upper_edge itself.
        bins = np.arange(lower_edge, upper_edge + bin_size, bin_size)
        # Bin the latitudes. Bins are right-closed, so include_lowest=True keeps
        # a latitude that sits exactly on the lowest edge from being dropped.
        df_subset['lat_bin'] = pd.cut(df_subset['decimalLatitude'], bins=bins, include_lowest=True)
        # Group by lat_bin and calculate unique species. observed=False keeps
        # empty bins on the axis; newer pandas releases warn when it is left
        # implicit for a categorical key.
        lat_diversity = (
            df_subset.groupby('lat_bin', observed=False)['scientificName']
            .nunique()
            .reset_index(name='species_count')
        )
        # Plot
        plt.figure(figsize=(12, 6))
        sns.barplot(x='lat_bin', y='species_count', data=lat_diversity)
        plt.xticks(rotation=45)
        plt.title(f"Species Diversity Across Latitudes ({int(subset_fraction*100)}% Subset)")
        plt.xlabel("Latitude Range")
        plt.ylabel("Number of Unique Species")
        plt.show()
else:
    print("Skipping Plot 8: Species Diversity Across Latitudes - Required columns (decimalLatitude, scientificName) missing.")

# Step 12: Plot 9 - Density of Marine Species Occurrences (KDE)
def plot_species_kde_with_legend(species_df):
    """
    Plot a filled 2-D Kernel Density Estimate (KDE) of occurrence coordinates
    with a colorbar, then show the figure.

    Args:
    - species_df: DataFrame containing species occurrences; its
      'decimalLongitude' and 'decimalLatitude' columns supply x and y.

    Raises:
    - KeyError: if either coordinate column is missing from species_df.
    """
    plt.figure(figsize=(10, 8))

    # Let seaborn draw the colorbar so it is tied to the contour set that is
    # actually plotted. A hand-built ScalarMappable with a fixed 0-10 range
    # shows a scale unrelated to the density, and newer Matplotlib refuses to
    # place a colorbar for a mappable that has no Axes.
    ax = sns.kdeplot(
        x=species_df['decimalLongitude'],
        y=species_df['decimalLatitude'],
        cmap='Blues',
        fill=True,
        levels=10,
        thresh=0,  # no lower cut-off, so the lowest-density level is drawn too
        cbar=True,
        cbar_kws={'label': 'Density'},
    )

    # Title and labels go on the KDE axes explicitly, not on whichever axes
    # happens to be current after the colorbar was added.
    ax.set_title("Density of Marine Species Occurrences (KDE)", fontsize=15)
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')

    # Show the plot
    plt.show()

# Step 12 usage: `marine_species_cleaned` is not defined anywhere in this cell,
# so use it only when an earlier cell has already created it. Otherwise fall
# back to the same subset as the other plots, keeping only rows that have both
# coordinates.
kde_source = globals().get('marine_species_cleaned')
if kde_source is None and 'decimalLongitude' in df.columns and 'decimalLatitude' in df.columns:
    kde_source = df_subset.dropna(subset=['decimalLongitude', 'decimalLatitude'])

if kde_source is not None:
    plot_species_kde_with_legend(kde_source)
else:
    print("Skipping Plot 9: Required columns (decimalLongitude, decimalLatitude) missing.")

# Final confirmation
print(f"All available plots generated successfully using {int(subset_fraction*100)}% of the data!")