In [None]:
import polars as pl
import plotly.express as px
from scipy.stats import chi2
import numpy as np  
import matplotlib.pyplot as plt

In [None]:
from matplotlib import font_manager
# Build a FontProperties handle from a user-local font file and display it.
# NOTE(review): the path is absolute and machine-specific, and `prop` is not
# referenced again in the cells visible here — confirm it is still needed.
prop = font_manager.FontProperties(fname="/home/joseph/.fonts/texgyreheroscn-regular.otf")
prop

In [None]:
from needletail import parse_fastx_file, NeedletailError, reverse_complement, normalize_seq
# Genome assembly FASTA used to look up the length of every contig.
ASSEMBLY_FASTA_FILE = "../a9_genome_masked.fa"
# Backward-compatible alias for the original (misspelled) constant name, kept so
# any other cell that still refers to it keeps working.
ASESMBLY_FASTA_FILE = ASSEMBLY_FASTA_FILE
#MIN_LENGTH = 1_000_000
MIN_LENGTH = 0

# contig id -> sequence length in bp
chr_lengths = {}

try:
    for record in parse_fastx_file(ASSEMBLY_FASTA_FILE):
        chr_lengths[record.id] = len(record.seq)
except NeedletailError as err:
    # Best-effort: keep whatever was parsed before the failure, but say which
    # file failed and why instead of a generic (and previously mislabelled) message.
    print(f"Invalid FASTA file {ASSEMBLY_FASTA_FILE!r}: {err}")

# How many are >= MIN_LENGTH
long_chrs = {k: v for k, v in chr_lengths.items() if v >= MIN_LENGTH}
print(f"Number of chromosomes >= {MIN_LENGTH:,} bp: {len(long_chrs)}")

In [None]:
# Display the contig -> length mapping that passed the MIN_LENGTH filter.
long_chrs

# PL-LMM-LRT ml-reml

In [None]:
# Load the GWAS result tables used below: RDS status for the whole and northern
# populations, and GV status for the whole population.
# NOTE(review): all paths are absolute and machine-specific; consider a shared,
# configurable results directory.
gwas_df = pl.read_csv("/mnt/data/development/consgwas/hoiho/results_wholepop_final_glmm_plrt/gwas_results/glmm_plrt_rds_status_rds_status_results.csv")
northern_gwas_df = pl.read_csv("/mnt/data/development/consgwas/hoiho/results_northernpop_final_glmm_plrt/gwas_results/glmm_plrt_rds_status_rds_status_results.csv")
gv_gwas_df = pl.read_csv("/mnt/data/development/consgwas/hoiho/results_wholepop_final_glmm_plrt/gwas_results/glmm_plrt_gv_status_gv_status_results.csv")

# Indels
indel_gwas_df = pl.read_csv("/mnt/data/development/consgwas/hoiho/results_wholepop_final_glmm_plrt_indels/gwas_results/glmm_plrt_rds_status_rds_status_results.csv")
 
# Imputed
imputed_gwas_df = pl.read_csv("/mnt/data/development/consgwas/hoiho/results_wholepop_imputed_final_glmm_plrt/gwas_results/glmm_plrt_rds_status_rds_status_results.csv")

# Alternative inputs (disabled): GV-status results bound to the main variable names.
# gwas_df = pl.read_csv("/mnt/data/development/consgwas/hoiho/results_wholepop_final_glmm_plrt/gwas_results/glmm_plrt_gv_status_gv_status_results.csv")
# northern_gwas_df = pl.read_csv("/mnt/data/development/consgwas/hoiho/results_northernpop_final_glmm_plrt/gwas_results/glmm_plrt_gv_status_gv_status_results.csv")

# Alternative inputs (disabled): glmm_plrt_ds_status_ds_status_results.csv
# gwas_df = pl.read_csv("/mnt/data/development/consgwas/hoiho/results_wholepop_final_glmm_plrt/gwas_results/glmm_plrt_ds_status_ds_status_results.csv")
# northern_gwas_df = pl.read_csv("/mnt/data/development/consgwas/hoiho/results_northernpop_final_glmm_plrt/gwas_results/glmm_plrt_ds_status_ds_status_results.csv")

# Preview the first rows of the whole-population RDS table.
gwas_df.head()

In [None]:
import matplotlib.pyplot as plt

def process_gwas_for_qq(df):
    """
    Turn a GWAS result table into QQ-plot coordinates plus the genomic inflation factor.

    Args:
        df: A dataframe whose 'P' column holds the p-values (NaNs are discarded).

    Returns:
        A 3-tuple ``(expected_log, observed_log, lambda_gc)``:
        - expected_log: -log10 of the null-expected p-values, i.e. ``n`` evenly
          spaced quantiles from 1/(n+1) to n/(n+1)
        - observed_log: -log10 of the observed p-values, sorted ascending by p
        - lambda_gc: median observed 1-df chi-squared statistic divided by the
          median of the 1-df chi-squared distribution
    """
    raw_p = df["P"].to_numpy()
    usable_p = raw_p[~np.isnan(raw_p)]

    # Lambda GC: p-values -> 1-df chi-squared statistics, then compare medians.
    # chi2.ppf(0.5, 1) (about 0.4549) is the median expected under the null.
    null_median = chi2.ppf(0.5, 1)
    lambda_gc = np.median(chi2.isf(usable_p, df=1)) / null_median

    # QQ coordinates: ascending observed p-values against uniform quantiles.
    ordered_p = np.sort(usable_p)
    count = len(ordered_p)
    uniform_quantiles = np.linspace(1 / (count + 1), count / (count + 1), count)

    return -np.log10(uniform_quantiles), -np.log10(ordered_p), lambda_gc

# Compute QQ coordinates and lambda GC for every loaded GWAS result table.
expected1, observed1, lambda1 = process_gwas_for_qq(gwas_df)
expected2, observed2, lambda2 = process_gwas_for_qq(northern_gwas_df)

# Imputed
expected3, observed3, lambda3 = process_gwas_for_qq(imputed_gwas_df)

# Indel
expected4, observed4, lambda4 = process_gwas_for_qq(indel_gwas_df)

# GV Status
expected5, observed5, lambda5 = process_gwas_for_qq(gv_gwas_df)

# --- Create the Plot ---
plt.figure(figsize=(8, 8)) # A square figure is best for QQ plots

# Scatter plot for the first GWAS (Whole population)
plt.scatter(expected1, observed1, color='#CABEE9', alpha=0.7, s=12, label='Whole Population')

# Scatter plot for the second GWAS (Northern population)
# plt.scatter(expected2, observed2, color='#7C7189', alpha=0.7, s=12, label='Northern Population')

# Scatter plot for the imputed GWAS
# plt.scatter(expected3, observed3, color='#FF6F91', alpha=0.7, s=12, label='Imputed Variants')

# Scatter plot for the indel GWAS
# plt.scatter(expected4, observed4, color='#1982C4', alpha=0.7, s=12, label='Indels')

# Scatter plot for the GV status GWAS
plt.scatter(expected5, observed5, color='#F6D55C', alpha=0.7, s=12, label='GV Status')

# Determine the maximum value to make the plot square and the line corner-to-corner.
# The limit is taken from the series that are actually drawn above (whole
# population and GV status). Previously it was computed from series 1-4, which
# ignored the GV series and could clip its points. If one of the disabled
# scatters is re-enabled, add its arrays to this list.
plotted_series = [expected1, observed1, expected5, observed5]
max_val = max(np.max(values) for values in plotted_series)
plot_limit = np.ceil(max_val) # Round up to the nearest integer for a clean axis

# Plot the y=x line
plt.plot([0, plot_limit], [0, plot_limit], color='#FAE093', linestyle='--', lw=2, label='y=x')

# Set axis limits to be equal, ensuring a square plot
plt.xlim(0, plot_limit)
plt.ylim(0, plot_limit)


# Add labels and the title with Lambda GC values
plt.xlabel('-log10(Expected p-value)', fontsize=12)
plt.ylabel('-log10(Observed p-value)', fontsize=12)
plt.title(
    f'QQ Plot of RDS Status GWAS p-values\n'
    # f'λGC (Whole Pop) = {lambda1:.3f}  |  λGC (Northern Pop) = {lambda2:.3f} |  λGC (Imputed) = {lambda3:.3f} | λGC (Indels) = {lambda4:.3f}',
    f'λGC (Whole Pop) = {lambda1:.3f}  |  λGC (GV Status) = {lambda5:.3f}',
    fontsize=14
)

plt.legend()
plt.tight_layout()
plt.show()

In [None]:
def _with_sorted_neg_log10p(frame):
    """Return `frame` plus a "-log10p" column, with null rows dropped, strongest signal first."""
    scored = frame.with_columns((-pl.col("P").log10()).alias("-log10p"))
    return scored.filter(pl.col("-log10p").is_not_null()).sort("-log10p", descending=True)

# Whole-population RDS results (rebinds gwas_df to the scored, sorted table).
gwas_df = _with_sorted_neg_log10p(gwas_df)

# Northern population, indels and imputed variants each get their own scored table.
northern_gwas_gwas = _with_sorted_neg_log10p(northern_gwas_df)
indel_gwas_gwas = _with_sorted_neg_log10p(indel_gwas_df)
imputed_gwas_gwas = _with_sorted_neg_log10p(imputed_gwas_df)

# GV status results (rebinds gv_gwas_df), then preview the strongest hits.
gv_gwas_df = _with_sorted_neg_log10p(gv_gwas_df)
gv_gwas_df.head()

In [None]:
# Display the scored whole-population RDS table.
gwas_df

In [None]:
# Display the scored GV status table.
gv_gwas_df

In [None]:
# List the column names of the GV status table.
gv_gwas_df.columns

In [None]:
# Pair up the two scans marker-by-marker: inner join on "Marker", carrying only
# the GV "-log10p" across (it arrives as "-log10p_gv"), then trim the result to
# the marker ID and the two score columns.
merged_df = (
    gwas_df
    .join(
        gv_gwas_df.select(["Marker", "-log10p"]),
        on="Marker",
        how="inner",
        suffix="_gv",
    )
    .select(["Marker", "-log10p", "-log10p_gv"])
)
merged_df

In [None]:
# Correlation between the RDS and GV association signals.

# Keep a marker when its -log10(p) reaches `threshold` in at least one of the
# two scans; markers below the threshold in both are dropped.
# NOTE(review): earlier notes in this cell mentioned "top 5%" and a baseline of
# 2.0, but the value actually applied is 0.5 — confirm which cutoff is intended.
threshold = 0.5
merged_df_filtered = merged_df.filter(
    (pl.col("-log10p") >= threshold) | (pl.col("-log10p_gv") >= threshold)
)

# y is passed as a raw array rather than a column name, which is why its axis
# label is keyed by "y" in `labels`.
# NOTE(review): trendline="ols" presumably needs statsmodels installed — confirm.
px.scatter(
    merged_df_filtered.to_pandas(),
    x="-log10p",
    y=merged_df_filtered["-log10p_gv"].to_numpy(),
    trendline="ols",
    labels={
        "-log10p": "-log10 p-value (RDS Status GWAS)",
        "y": "-log10 p-value (GV Status GWAS)"
    },
    title="Correlation of -log10 p-values between RDS Status GWAS and GV Status GWAS"
)

In [None]:
# np.corrcoef returns the 2x2 Pearson correlation matrix; the off-diagonal entry
# is r, so square it to obtain r^2.
np.corrcoef(merged_df_filtered["-log10p"].to_numpy(), merged_df_filtered["-log10p_gv"].to_numpy())

In [None]:
# Display the contig -> length mapping again before building the Manhattan plots.
long_chrs

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# =============================================================================
# Matplotlib Drop-in Replacement Cell
#
# This cell assumes the following variables already exist in your environment:
#   - gwas_df:      A polars DataFrame with GWAS results for the first plot.
#   - gv_gwas_df:   A polars DataFrame with GWAS results for the second plot.
#   - long_chrs:    A Python dictionary mapping chromosome/contig names to their lengths.
# =============================================================================

# --- Font and Plotting Style Configuration ---
# Apply all style overrides in one validated update: sans-serif font stack,
# no top/right spines, longer major ticks and larger tick labels.
try:
    mpl.rcParams.update({
        'font.family': 'sans-serif',
        'font.sans-serif': ['Tex Gyre Heros', 'Helvetica', 'Arial'],
        'axes.spines.top': False,
        'axes.spines.right': False,
        'xtick.major.size': 7,
        'ytick.major.size': 7,
        'xtick.labelsize': 12,
        'ytick.labelsize': 12,
    })
except Exception as e:
    print(f"Could not set Tex Gyre Heros font. Using default sans-serif. Error: {e}")

# --- Configuration (Matching Original Plotly Setup) ---
P_VALUE_CUTOFF = 0.0      # minimum -log10(p) a marker needs to be kept for plotting
MARKER_SIZE = 6           # scatter marker size passed as `s`
GAP_FACTOR = 0.05         # gap between contigs, as a fraction of the mean contig length
LABEL_EVERY_NTH_CHR = 3   # label the first contig and then every Nth one
NUM_CHROMS_TO_LABEL = 30  # contigs ranked beyond this share a single "Other" tick

# --- Data Preparation Function (pandas equivalent of your polars function) ---
def prepare_manhattan_data_pandas(df_pandas, chr_lengths_dict, p_cutoff=None, gap_factor=None):
    """
    Prepare a pandas DataFrame for Manhattan plotting, mirroring the polars logic.

    Args:
        df_pandas: GWAS results with at least 'Chr', 'Pos' and 'P' columns.
            The input frame is not modified.
        chr_lengths_dict: Mapping of contig name -> length in bp. Contigs are laid
            out left to right from longest to shortest. Markers on contigs that are
            missing from this mapping are dropped, because they have no position
            on the concatenated axis (previously they made the integer cast fail).
        p_cutoff: Minimum -log10(p) a marker must reach to be kept. Defaults to
            the module-level P_VALUE_CUTOFF when None.
        gap_factor: Gap inserted after each contig, as a fraction of the mean
            length over all entries of `chr_lengths_dict`. Defaults to the
            module-level GAP_FACTOR when None.

    Returns:
        (plot_df, ticks_df):
        - plot_df: the kept markers sorted by contig order then 'Pos', with added
          '-log10p', 'offset' and 'x' (= Pos + offset) columns; 'Chr' becomes an
          ordered categorical.
        - ticks_df: one row per plotted contig with 'Chr', 'Chr_order',
          'tick_pos' (contig midpoint on the x axis), 'offset' and 'length'.
          These columns are present even when no contig is plotted.
    """
    if p_cutoff is None:
        p_cutoff = P_VALUE_CUTOFF
    if gap_factor is None:
        gap_factor = GAP_FACTOR

    df = df_pandas.copy()
    df['-log10p'] = -np.log10(df['P'])

    # Keep markers that pass the cutoff AND sit on a contig with a known length.
    # The explicit .copy() makes the filtered frame independent, so the column
    # assignments below cannot trigger SettingWithCopyWarning.
    keep = (df['-log10p'] >= p_cutoff) & df['Chr'].isin(list(chr_lengths_dict))
    df = df.loc[keep].copy()

    # Plotting order: longest contig first, restricted to contigs present in the data.
    present_chroms = set(df['Chr'].unique())  # set: O(1) membership tests
    sorted_chroms_by_length = sorted(chr_lengths_dict, key=chr_lengths_dict.get, reverse=True)
    sorted_chroms_present = [c for c in sorted_chroms_by_length if c in present_chroms]

    # Gap size is based on the mean over ALL known contigs (unchanged behaviour).
    mean_length = np.mean(list(chr_lengths_dict.values())) if chr_lengths_dict else 0
    gap_size = int(mean_length * gap_factor)

    # Walk the contigs, recording each one's cumulative start offset and tick position.
    offsets, current_offset, ticks_data = {}, 0, []
    for i, chrom in enumerate(sorted_chroms_present):
        offsets[chrom] = current_offset
        chrom_length = chr_lengths_dict[chrom]
        ticks_data.append({
            'Chr': chrom, 'Chr_order': i,
            'tick_pos': current_offset + chrom_length / 2,
            'offset': current_offset, 'length': chrom_length
        })
        current_offset += chrom_length + gap_size

    # Final x coordinate = position within the contig + the contig's offset.
    # Mapped on the plain 'Chr' values, before the categorical conversion.
    df['offset'] = df['Chr'].map(offsets).astype(np.int64)
    df['x'] = df['Pos'].astype(np.int64) + df['offset']

    # Ordered categorical so sorting follows the length-based contig order.
    df['Chr'] = pd.Categorical(df['Chr'], categories=sorted_chroms_present, ordered=True)
    df = df.sort_values(['Chr', 'Pos'])

    ticks_df = pd.DataFrame(
        ticks_data, columns=['Chr', 'Chr_order', 'tick_pos', 'offset', 'length']
    )
    return df, ticks_df

# --- Data Conversion and Preparation ---
# 1. Convert your existing polars DataFrames to pandas DataFrames
gwas_df_pd = gwas_df.to_pandas()
gv_gwas_df_pd = gv_gwas_df.to_pandas()

# 2. Prepare each pandas DataFrame for plotting
# Only the ticks from the RDS table are kept; they drive the contig loop and the
# x-axis labels for both panels.
plot_df_whole, ticks_df = prepare_manhattan_data_pandas(gwas_df_pd, long_chrs)
plot_df_gv, _ = prepare_manhattan_data_pandas(gv_gwas_df_pd, long_chrs)


# --- Plotting with Matplotlib ---
# Two stacked panels sharing the x axis: RDS status on top, GV status below.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(16, 9), sharex=True)
fig.subplots_adjust(hspace=0.1)

# Define colors to match the original plot
colors = ["#7C7189", "#BC8E7D"] # Purple-grey, Brown

# Loop through the chromosomes in their length-sorted order and plot.
# Colours alternate between consecutive contigs; rasterized=True keeps the
# scatter layers as bitmaps inside vector output.
for i, row in ticks_df.iterrows():
    chrom, color = row['Chr'], colors[i % len(colors)]
    
    # Plot for the top panel (RDS Status)
    df_subset_whole = plot_df_whole[plot_df_whole['Chr'] == chrom]
    if not df_subset_whole.empty:
        ax1.scatter(df_subset_whole['x'], df_subset_whole['-log10p'], color=color, s=MARKER_SIZE, alpha=0.8, rasterized=True)

    # Plot for the bottom panel (GV Status)
    # NOTE(review): x offsets for this panel come from the GV table's own set of
    # contigs; they line up with the top panel only if both tables cover the
    # same contigs — confirm.
    df_subset_gv = plot_df_gv[plot_df_gv['Chr'] == chrom]
    if not df_subset_gv.empty:
        ax2.scatter(df_subset_gv['x'], df_subset_gv['-log10p'], color=color, s=MARKER_SIZE, alpha=0.8, rasterized=True)


# --- Customize Layout and Axes (Replicating Original Logic) ---
# Select which chromosome ticks to display: among the first NUM_CHROMS_TO_LABEL
# contigs, label the first one and then every LABEL_EVERY_NTH_CHR-th.
candidate_labels_df = ticks_df[ticks_df['Chr_order'] < NUM_CHROMS_TO_LABEL]
labels_df = candidate_labels_df[
    (candidate_labels_df['Chr_order'] == 0) | 
    ((candidate_labels_df['Chr_order'] + 1) % LABEL_EVERY_NTH_CHR == 0)
]
scaffolds_df = ticks_df[ticks_df['Chr_order'] >= NUM_CHROMS_TO_LABEL]

tick_vals = labels_df['tick_pos'].tolist()
tick_texts = labels_df['Chr'].tolist()

# Group remaining contigs into an "Other" category.
# The tick sits midway between the smallest scaffold offset and the end of the
# last scaffold row (largest offset + that row's length, given ticks_df order).
if not scaffolds_df.empty:
    scaffold_tick_pos = (scaffolds_df['offset'].min() + scaffolds_df['offset'].max() + scaffolds_df['length'].iloc[-1]) / 2
    tick_vals.append(scaffold_tick_pos)
    tick_texts.append("Other")

# Set all axis labels, ticks, and limits.
# Only the bottom panel shows x tick labels; the top panel hides them.
ax2.set_xlabel("Genomic Position", fontsize=16)
ax2.set_xticks(tick_vals)
ax2.set_xticklabels(tick_texts, rotation=45, ha='right')
ax1.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)

# Set fixed Y-axis to match the target image.
# NOTE(review): the limits are hard-coded, so any marker with -log10(p) above
# 10.5 would fall outside the axes — confirm this holds for the current data.
shared_max_y, min_y = 10.5, -0.5

ax1.set_ylabel("-log10(p) [RDS Status]", fontsize=14)
ax1.set_ylim(min_y, shared_max_y)
ax1.grid(axis='y', color='#e0e0e0', linestyle='-')

ax2.set_ylabel("-log10(p) [GV Status]", fontsize=14)
ax2.set_ylim(min_y, shared_max_y)
ax2.grid(axis='y', color='#e0e0e0', linestyle='-')

# Add panel labels ("a", "b") without parentheses
ax1.text(0.02, 0.9, "a", transform=ax1.transAxes, fontsize=24, fontweight='bold', va='top')
ax2.text(0.02, 0.9, "b", transform=ax2.transAxes, fontsize=24, fontweight='bold', va='top')

# --- Save and Display the Figure ---
# Both formats are written on every run (relative to the working directory);
# comment out whichever one is not needed.
plt.savefig("manhattan_matplotlib_output.png", dpi=300, bbox_inches='tight')
plt.savefig("manhattan_matplotlib_output.svg", bbox_inches='tight', format='svg')

plt.show()

In [None]:
import polars as pl
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# --- Configuration ---
# NOTE: these rebind the module-level names also used by the matplotlib
# Manhattan cell (MARKER_SIZE is 6 there), so re-running that cell after this
# one picks up the values below.
P_VALUE_CUTOFF = 0.0
MARKER_SIZE = 3
GAP_FACTOR = 0.05
LABEL_EVERY_NTH_CHR = 3 # Label every 3rd chromosome to prevent overlap

# --- Inputs expected from earlier cells (nothing is defined here) ---
# long_chrs = ...
# gwas_df = ...
# gv_gwas_df = ...

# --- Data Preparation Function (Corrected and Final) ---
def prepare_manhattan_data(df, chr_lengths_dict, p_cutoff=0, gap_factor=0.05):
    """
    Build the plotting table and per-contig tick table for a Manhattan plot (polars).

    Args:
        df: polars DataFrame with 'Chr', 'Pos' and 'P' columns.
        chr_lengths_dict: Mapping of contig name -> length; contigs are ordered
            from longest to shortest.
        p_cutoff: Minimum -log10(p) a marker must reach to be kept.
        gap_factor: Gap added after each contig, as a fraction of the mean length
            of the contigs present in the filtered data.

    Returns:
        (plot_df, ticks_df): plot_df carries '-log10p', 'Chr_label', 'Chr_order',
        'offset' and 'x' (= Pos + offset), sorted by contig order then position;
        ticks_df has one row per plotted contig with its 'length', 'gap',
        'offset', 'Chr_label', 'Chr_order' and 'tick_pos' (contig midpoint).
    """
    # Rank of every known contig, longest first.
    chrom_order_map = {
        chrom: rank
        for rank, chrom in enumerate(
            sorted(chr_lengths_dict, key=chr_lengths_dict.get, reverse=True)
        )
    }

    neg_log10p = (-pl.col("P").log10()).alias("-log10p")
    # First run of digits in the contig name, or 999 when there is none.
    chr_label = pl.col("Chr").str.extract(r"(\d+)", 1).cast(pl.Int64).fill_null(999).alias("Chr_label")
    chr_order = pl.col("Chr").replace(chrom_order_map).cast(pl.Int64).alias("Chr_order")

    plot_df = (
        df.clone()
        .with_columns(neg_log10p)
        .filter(pl.col("-log10p") >= p_cutoff)
        .with_columns([chr_label, chr_order])
        .sort("Chr_order", "Pos")
    )

    # Contigs that survived the filter, in plotting order, with their lengths.
    sorted_chroms_present = (
        plot_df.select("Chr", "Chr_order").unique().sort("Chr_order")["Chr"].to_list()
    )
    offsets_df = pl.DataFrame({
        "Chr": sorted_chroms_present,
        "length": [chr_lengths_dict.get(c, 0) for c in sorted_chroms_present],
    })

    # Each contig starts where the previous contigs (plus their gaps) end.
    gap_expr = (pl.col("length").mean() * gap_factor).cast(pl.Int64).alias("gap")
    offset_expr = (
        (pl.col("length") + pl.col("gap")).cum_sum().shift(1).fill_null(0).alias("offset")
    )
    offsets_df = offsets_df.with_columns(gap_expr).with_columns(offset_expr)

    plot_df = plot_df.join(offsets_df.select("Chr", "offset"), "Chr", how="left")
    plot_df = plot_df.with_columns((pl.col("Pos") + pl.col("offset")).alias("x"))

    # One label/order pair per contig, attached to the offsets for tick placement.
    per_chrom = plot_df.group_by("Chr").agg(
        pl.col("Chr_label").first(), pl.col("Chr_order").first()
    )
    ticks_df = (
        offsets_df.join(per_chrom, "Chr")
        .with_columns((pl.col("offset") + pl.col("length") / 2).alias("tick_pos"))
        .sort("Chr_order")
    )
    return plot_df, ticks_df

# --- Prepare data for both phenotypes (RDS status and GV status) ---
# Only the ticks from the RDS table are kept; they drive the contig loop below.
plot_df_whole, ticks_df = prepare_manhattan_data(gwas_df, long_chrs, P_VALUE_CUTOFF, gap_factor=GAP_FACTOR)
plot_df_gv, _ = prepare_manhattan_data(gv_gwas_df, long_chrs, P_VALUE_CUTOFF, gap_factor=GAP_FACTOR)

# --- Plotting ---
# Two stacked rows of equal height sharing the x axis.
fig = make_subplots(
    rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.05, row_heights=[0.5, 0.5]
)

# Project palette; the two entries picked below alternate between consecutive contigs.
#  # c("#CABEE9", "#7C7189", "#FAE093", "#D04E59", "#BC8E7D", "#2F3D70")
our_colors = ["#CABEE9", "#7C7189", "#FAE093", "#D04E59", "#BC8E7D", "#2F3D70"]
colors = [our_colors[1], our_colors[4]]
for i, chrom_row in enumerate(ticks_df.iter_rows(named=True)):
    chrom = chrom_row['Chr']
    color = colors[i % len(colors)]
    # Row 1: RDS status markers on this contig.
    df_subset_whole = plot_df_whole.filter(pl.col("Chr") == chrom)
    if not df_subset_whole.is_empty():
        fig.add_trace(go.Scatter(x=df_subset_whole["x"], y=df_subset_whole["-log10p"], mode='markers', marker=dict(color=color, size=MARKER_SIZE), name=chrom, hovertext=df_subset_whole["Marker"], showlegend=False), row=1, col=1)
    # Row 2: GV status markers on this contig (despite the "northern" variable
    # name, the data comes from plot_df_gv).
    df_subset_northern = plot_df_gv.filter(pl.col("Chr") == chrom)
    if not df_subset_northern.is_empty():
        fig.add_trace(go.Scatter(x=df_subset_northern["x"], y=df_subset_northern["-log10p"], mode='markers', marker=dict(color=color, size=MARKER_SIZE), name=chrom, hovertext=df_subset_northern["Marker"], showlegend=False), row=2, col=1)
        
# --- Customize Layout and Axes ---
# Among the first NUM_CHROMS_TO_LABEL contigs, label the first one and then
# every LABEL_EVERY_NTH_CHR-th; everything after that shares one "Other" tick.
NUM_CHROMS_TO_LABEL = 30
candidate_labels_df = ticks_df.filter(pl.col("Chr_order") < NUM_CHROMS_TO_LABEL)
labels_df = candidate_labels_df.filter(
    (pl.col("Chr_order") == 0) | ((pl.col("Chr_order") + 1) % LABEL_EVERY_NTH_CHR == 0)
)
scaffolds_df = ticks_df.filter(pl.col("Chr_order") >= NUM_CHROMS_TO_LABEL)
tick_vals = labels_df["tick_pos"].to_list()
# Tick text is the 1-based length rank of the contig, not its name.
tick_texts = (labels_df["Chr_order"] + 1).to_list()
if not scaffolds_df.is_empty():
    # Midpoint between the first and last scaffold tick positions.
    scaffold_tick_pos = (scaffolds_df["tick_pos"].min() + scaffolds_df["tick_pos"].max()) / 2
    tick_vals.append(scaffold_tick_pos)
    tick_texts.append("Other")

# Tick labels are applied to the bottom row only.
fig.update_xaxes(
    title_text="Genomic Position",
    tickvals=tick_vals, ticktext=tick_texts, tickangle=45, row=2, col=1
)

# Upper y limit of each panel: its own maximum plus a little headroom.
max_y_whole = plot_df_whole["-log10p"].max() + 0.5
max_y_gv = plot_df_gv["-log10p"].max() + 0.5
min_y = P_VALUE_CUTOFF - 0.5

# Use one shared upper limit for BOTH panels so peak heights are directly
# comparable between the stacked plots. Previously only the GV panel was raised
# to the shared maximum, so the two panels could end up on different scales
# whenever the GV maximum exceeded the RDS maximum.
max_y_gv = max(max_y_whole, max_y_gv)
max_y_whole = max_y_gv

fig.update_yaxes(
    title_text="-log10(p) [RDS Status]", showgrid=True, gridwidth=1, gridcolor='#f0f0f0', 
    row=1, col=1, range=[min_y, max_y_whole]
)
fig.update_yaxes(
    title_text="-log10(p) [GV Status]", showgrid=True, gridwidth=1, gridcolor='#f0f0f0', 
    row=2, col=1, range=[min_y, max_y_gv]
)
# No vertical grid lines on either panel.
fig.update_xaxes(showgrid=False)

fig.update_layout(
    #title_text="Stacked Manhattan Plots for Respiratory Disease Syndrome and Gyrovirus Presence", height=800, template="plotly_white",
    #title_font_size=20, xaxis_title_font_size=16, yaxis_title_font_size=14,
    # Disable title
    title_text="", height=600, template="plotly_white",
    margin=dict(l=50, r=30, t=90, b=80),
    font=dict(
        family="Helvetica, Arial, sans-serif",
        size=12,  # Sets a base font size for elements like tick labels
        color="black"
    ),
)


#fig.show()
# Save the figure as PNG at width=720, height=480 with scale=3.
# NOTE(review): an earlier note here said 1200x800 px at 300 dpi, which does not
# match the arguments actually passed — confirm the intended export size.
fig.write_image("manhattan_rds_status_gv_status.png", scale=3, width=720, height=480)
#fig.write_image("manhattan_rds_status_gv_status.svg", scale=3, width=1200, height=800)

In [None]:
# Export the same figure as SVG (width=1200, height=800, scale=3).
# NOTE(review): this size differs from the PNG export in the previous cell — confirm intended.
fig.write_image("manhattan_rds_status_gv_status.svg", scale=3, width=1200, height=800)

In [None]:
from gff_eggnog_util import *

# Annotation inputs: gene models (GFF3) and eggNOG-mapper functional annotations.
# NOTE(review): absolute, machine-specific paths.
GFF_PATH = "/mnt/data/development/hoiho_working/galba/galba.converted.gff3"
EGGNOG_PATH = "/mnt/data/development/hoiho_working/galba/a9.emapper.annotations"

# Initialize
# init_gff_index / init_eggnog_map are not defined in this notebook; presumably
# they come from the `gff_eggnog_util` star import above — an explicit import
# list would make that visible.
GFF_IDX = init_gff_index(GFF_PATH, feature_types=("gene","mRNA"))
EGG_MAP, EGG_ALIAS = init_eggnog_map(EGGNOG_PATH)

In [None]:
# Select the strongest associations by a fixed -log10(p) threshold; genes near
# these markers are looked up in the next cell.
#top_snps = gwas_df.filter(pl.col("-log10p") >= gwas_df.select(pl.col("-log10p").quantile(0.9995)).item())

# Minimum -log10(p) for a marker to count as a top hit.
TOP_SNP_MIN_LOG10P = 5.0  # 4 for indels, 5 for SNPs

# RDS
#top_snps_source = gwas_df

# GV
top_snps_source = gv_gwas_df

top_snps = top_snps_source.filter(pl.col("-log10p") >= TOP_SNP_MIN_LOG10P)

# The percentage is relative to the table the hits were drawn from. It was
# previously divided by gwas_df.height even when the hits came from gv_gwas_df.
top_snps_pct = (
    top_snps.height / top_snps_source.height * 100 if top_snps_source.height else 0.0
)
print(f"Number of top SNPs ({top_snps_pct:.6f}% of tested markers): {top_snps.height}")

# Weakest signal among the selected hits; None when nothing passed the threshold.
log10p_cutoff = top_snps.select(pl.col("-log10p").min()).item()
if log10p_cutoff is None:
    print(f"No SNPs reached -log10(p) >= {TOP_SNP_MIN_LOG10P}")
else:
    print(f"Lowest -log10(p) among selected top SNPs: {log10p_cutoff:.3f}")

In [None]:
# Rank the hits (1, 2, 3, ... in the current row order of top_snps) so every
# gene row printed below can be traced back to the SNP it sits next to.
top_snps = top_snps.with_columns(
    pl.arange(1, top_snps.height + 1).alias("Rank")
)

# Half-width of the search window around each SNP, in bp (i.e. +/- 20 kb).
GENE_WINDOW_BP = 20_000

# Output is tab-separated for pasting into a spreadsheet (supplemental table):
# Chr, Pos, Rank, -log10(p), Gene ID, Preferred Name, Description, GO Terms
for snp in top_snps.iter_rows(named=True):
    chrom = snp["Chr"]
    pos = snp["Pos"]
    pval = snp["P"]
    log10p = snp["-log10p"]
    rank = snp["Rank"]
    marker = snp["Marker"]

    # Row emitted when a SNP has no reportable gene.
    no_gene_row = f"{chrom}\t{pos}\t{rank}\t{log10p}\tN/A\tN/A\tN/A\tN/A"

    # Gene and mRNA features overlapping the window around the SNP.
    nearby_genes = query_region(chrom, pos - GENE_WINDOW_BP, pos + GENE_WINDOW_BP, feature_types=("gene", "mRNA"), how="overlap")

    if nearby_genes.shape[0] == 0:
        print(no_gene_row)
        continue

    # print(f"SNP {marker} at {chrom}:{pos} (p={pval:.3e}, -log10p={log10p:.3f}) has {len(nearby_genes)} nearby genes:")
    nearby_genes = pl.from_pandas(nearby_genes)
    reported_any = False
    # A distinct loop variable is used here; the inner loop previously reused
    # `row`, shadowing the SNP row of the outer loop.
    for gene_row in nearby_genes.iter_rows(named=True):
        # Columns: contig	start	end	strand	type	id	query_key	preferred_name	description	go_terms
        gene_id = gene_row["id"]

        # Only keep the primary transcript
        if not gene_id.endswith(".t1"):
            continue

        # Missing annotation fields are written as "N/A".
        preferred_name = gene_row["preferred_name"] if gene_row["preferred_name"] is not None else "N/A"
        description = gene_row["description"] if gene_row["description"] is not None else "N/A"
        go_terms = gene_row["go_terms"] if gene_row["go_terms"] is not None else "N/A"
        print(f"{chrom}\t{pos}\t{rank}\t{log10p}\t{gene_id}\t{preferred_name}\t{description}\t{go_terms}")
        reported_any = True

    # Features overlapped the window but none was a primary (.t1) transcript:
    # still emit the SNP so it does not silently vanish from the table.
    if not reported_any:
        print(no_gene_row)


In [None]:
# Display the features found for the last SNP processed by the loop above.
nearby_genes

# Look for Sweeps

In [None]:
# Take top 1.5% of SNPs by -log10p.
# head() just takes the first rows, so this relies on gwas_df having been sorted
# by "-log10p" (descending) in an earlier cell.
top_percent_threshold = gwas_df.height * 0.015
top_snps_df = gwas_df.head(int(top_percent_threshold))
# Print shape
print("Top SNPs DF shape:", top_snps_df.shape)

In [None]:
# Load the divergent-region summaries (individual populations and grouped populations).
divergence_indiv_pops = pl.read_csv("../ARG/tsinfer/divergent_regions_summary_indiv_pops.csv")
divergence_grouped_pops = pl.read_csv("../ARG/tsinfer/divergent_regions_summary_grouped_pops.csv")
#divergence_indiv_pops.head()

In [None]:
import pyranges as pr
# Wrap the divergent regions as PyRanges (Start_bp/End_bp renamed to Start/End).
# presumably these tables already carry a "Chromosome" column — verify against the CSVs.
divergence_pr = pr.PyRanges(divergence_indiv_pops.rename({"Start_bp": "Start", "End_bp": "End"}).to_pandas())
divergence_grouped_pr = pr.PyRanges(divergence_grouped_pops.rename({"Start_bp": "Start", "End_bp": "End"}).to_pandas())
# Each top SNP becomes a 1-bp interval [Pos, Pos + 1).
# NOTE(review): if "Pos" is 1-based and PyRanges is treated as 0-based half-open,
# the interval would be [Pos - 1, Pos) instead — confirm both coordinate conventions.
top_snps_pr = pr.PyRanges(top_snps_df.rename({"Chr": "Chromosome", "Pos": "Start"}).with_columns((pl.col("Start") + 1).alias("End")).to_pandas())
# Join regions with the SNP intervals that overlap them, for both region sets.
overlaps = divergence_pr.join(top_snps_pr)
overlaps_df = pl.from_pandas(overlaps.df)
overlaps_grouped = divergence_grouped_pr.join(top_snps_pr)
overlaps_grouped_df = pl.from_pandas(overlaps_grouped.df)
overlaps_df


In [None]:
# Display the overlaps between grouped-population divergent regions and top SNPs.
overlaps_grouped_df

# Older code (but still to be used!)

In [None]:
# Let's plot a qq plot of orig_df and northern_pop_only_df
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
import matplotlib.cm as cm
from matplotlib.colors import Normalize
import matplotlib.lines as mlines

# Legacy QQ plot overlaying two result tables on one axis.
# NOTE(review): `orig_df` is not defined in any cell visible here, and both
# frames are read through a lowercase "p" column while the tables loaded above
# are accessed via "P" — this cell presumably fails on a fresh kernel; confirm
# the expected inputs before reuse.
# Create a new figure
fig, ax = plt.subplots(figsize=(8, 8))

colors = ["red", "blue", "orange", "yellow"]
y_max = 0.0

for i, qq_df in enumerate([orig_df, gwas_df]):
    # Create a QQ plot
    # Calculate the expected p-values
    
    # Calculate the observed p-values
    observed_pvals = np.sort(qq_df["p"].to_numpy())
    # Remove NaN's
    observed_pvals = observed_pvals[~np.isnan(observed_pvals)]
    observed_pvals = np.sort(observed_pvals)

    # Expected quantiles are 1/n ... 1 (the leading 0 of the grid is skipped below).
    n = len(observed_pvals)
    expected_pvals = np.linspace(0, 1, n+1)
    # Calculate the -log10 of the expected and observed p-values
    expected_neg_log10_pvals = -np.log10(expected_pvals[1:])
    observed_neg_log10_pvals = -np.log10(observed_pvals[0:])
    y_max = max(y_max, observed_neg_log10_pvals.max())

    # Create a scatter plot
    ax.scatter(
        expected_neg_log10_pvals,
        observed_neg_log10_pvals,
        alpha=0.5,
        color=colors[i]
    )

# Add a diagonal line.
# Its extent uses the expected values left over from the LAST table in the loop.
x = np.linspace(0, max(expected_neg_log10_pvals), 100)
y = x
ax.plot(
    x,
    y,
    color="red",
    linestyle="--",
    label="y=x",

)

# Add a legend
#ax.legend()
# Add labels
ax.set_xlabel("-log10(Expected p-value)")
ax.set_ylabel("-log10(Observed p-value)")
# Add a title
ax.set_title("QQ Plot of GWAS p-values")

# Largest observed -log10(p) across the plotted tables.
print(y_max)


In [None]:
# Genomic inflation factor (lambda GC) for the legacy result tables.
# The denominator is the exact median of the 1-df chi-squared distribution
# (about 0.4549), replacing the rounded constant 0.456 so the value agrees with
# process_gwas_for_qq.
null_median_chisq = chi2.ppf(0.5, df=1)

# NOTE(review): `orig_df` is not defined in any cell visible here, and both
# frames are read through a lowercase "p" column — confirm the expected inputs.
for legacy_df in (gwas_df, orig_df):
    pvals = np.array(np.sort(legacy_df["p"].to_numpy()))
    # Remove NaNs
    pvals = pvals[~np.isnan(pvals)]
    # p-values -> 1-df chi-squared statistics; lambda GC is the ratio of medians.
    chisq = chi2.isf(pvals, df=1)
    lambda_gc = np.median(chisq) / null_median_chisq
    print("λGC:", lambda_gc)

In [None]:
# Display the current gwas_df table.
gwas_df

In [None]:
# Convert the legacy table to pandas and drop rows whose p-value is NaN.
# NOTE(review): `orig_df` is not defined in any cell visible here — confirm its source.
orig_pdf = orig_df.to_pandas()
orig_pdf = orig_pdf[~orig_pdf["p"].isna()]

In [None]:
# Display the NaN-filtered legacy table.
orig_pdf