In [None]:
import polars as pl
import plotly.express as px
from scipy.stats import chi2
import numpy as np  

In [None]:
# Parse the whitespace-aligned PLINK ROH summary into rows of string tokens.
# str.split() with no argument collapses runs of whitespace and ignores
# leading/trailing padding, so no separate clean-up pass is required.
# Blank lines are skipped: they would otherwise turn into one-element rows
# and leave the columns with unequal lengths when the table is built.
with open("plink_roh.hom.indiv", "r") as f:
    data = [line.split() for line in f if line.strip()]

# First row (expected to be the header)
data[0]

In [None]:
# Turn the tokenised rows in `data` into a two-column frame: FID and FROH.
#
# Expected header of the file (first row of `data`):
#   ['FID', 'IID', 'PHE', 'NSEG', 'KB', 'KBAVG']
columns = ["FID", "IID", "PHE", "NSEG", "KB", "KBAVG"]

# Divisor that converts total ROH length (KB column) into FROH.
# NOTE(review): presumably the genome length in kb — confirm the value.
GENOME_LENGTH_KB = 1_360_000

header, *rows = data

# Fail loudly if the file layout is not what the column names above assume;
# otherwise values would be silently assigned to the wrong columns.
if header != columns:
    raise ValueError(f"Unexpected header in plink_roh.hom.indiv: {header!r}")

malformed = [row for row in rows if len(row) != len(columns)]
if malformed:
    raise ValueError(
        f"{len(malformed)} row(s) do not have {len(columns)} fields; first: {malformed[0]!r}"
    )

# Every value is still a string at this point; KB is cast below.
plink_roh_df = pl.DataFrame(rows, schema=columns, orient="row")

# Keep only FID, plus FROH = KB / GENOME_LENGTH_KB
plink_roh_df = plink_roh_df.select(
    pl.col("FID"),
    (pl.col("KB").cast(pl.Float64) / GENOME_LENGTH_KB).alias("FROH"),
)
plink_roh_df.head()


In [None]:
# Load the sample sheet. The file carries a .csv extension but is read as
# tab-delimited.
phenos_df = pl.read_csv(
    "../Hoiho_Genomes_Cleaned_Sep11_25.csv",
    separator="\t",
)
phenos_df.head()

In [None]:
# Count "RDS Status" values.
# pl.len() replaces the deprecated no-argument pl.count(); the alias keeps the
# result column named "count" as before.
phenos_df.group_by("RDS Status").agg(pl.len().alias("count")).sort("RDS Status")

In [None]:
# How many in just the northern pop?
# The population label is matched with its trailing space ("Northern ").
# pl.len() replaces the deprecated no-argument pl.count(); the alias keeps the
# result column named "count" as before.
(
    phenos_df
    .filter(pl.col("Population (new)") == "Northern ")
    .group_by("RDS Status")
    .agg(pl.len().alias("count"))
    .sort("RDS Status")
)

In [None]:
# Build the covariate table from the sample sheet: dummy-coded population,
# sex and season, joined with FROH, then saved as TSV.
#
# phenos_df['Population (new)'].unique() was noted as:
#   "Northern "   (matched literally, trailing space included)
#   "Campbell"
#   "Enderby"
#
# Reference levels (no dummy column): Enderby, female, season S1.

# (source column, level that maps to 1, name of the dummy column)
dummy_specs = [
    ("Population (new)", "Northern ", "is_northern"),
    ("Population (new)", "Campbell", "is_campbell"),
    ("Sex (genetics)", "Male", "is_male"),
    ("Season", "S2", "is_s2"),
    ("Season", "S3", "is_s3"),
    ("Season", "S4", "is_s4"),
]

dummy_exprs = [
    (pl.col(source) == level).cast(pl.Int8).alias(name)
    for source, level, name in dummy_specs
]

covars_df = (
    phenos_df
    .with_columns(dummy_exprs)
    # Keep only the ID and the covariate dummies
    .select(["ID"] + [name for _, _, name in dummy_specs])
    # Attach FROH; the sheet's ID is matched against the PLINK FID
    .join(plink_roh_df, left_on="ID", right_on="FID", how="left")
)

# Persist as TSV
covars_df.write_csv("hoiho_covariates_15Sept2025.tsv", separator="\t")

covars_df.head()

In [None]:
# Reduced covariate set: population dummies and FROH only
# (sex and the season dummies s2/s3/s4 are left out).
reduced_covariates = ["ID", "is_northern", "is_campbell", "FROH"]
covars_df = covars_df.select(reduced_covariates)
covars_df.write_csv(
    "hoiho_covariates_24Sept2025_islands_froh.tsv",
    separator="\t",
)

In [None]:
# Recode the phenotype columns and keep only ID plus the recoded columns.
#
# Mappings:
#   RDS Status:                      RDS -> 1, No -> 0, anything else -> null
#   DS Status:                       Y -> 1,   N -> 0,  anything else -> null
#   GV Status (P2 P3 primers):       Pos -> 1, Neg -> 0, anything else -> null
#   RDS Severity Score (out of 10):  kept quantitative, cast to Float32
#
# NOTE(review): an earlier note here said "Unknown" was not being converted
# to nan. The otherwise(None) branch sends every unmatched value to null
# (not NaN) — confirm that null is what downstream tools expect.


def _binary_status(column, positive, negative):
    """Int8 expression: 1 where `column` == `positive`, 0 where it == `negative`, null otherwise."""
    source = pl.col(column)
    return (
        pl.when(source == positive)
        .then(1)
        .when(source == negative)
        .then(0)
        .otherwise(None)
        .cast(pl.Int8)
    )


phenos_df = phenos_df.select(
    "ID",
    _binary_status("RDS Status", "RDS", "No").alias("rds_status"),
    _binary_status("DS Status", "Y", "N").alias("ds_status"),
    pl.col("RDS Severity Score (out of 10)").cast(pl.Float32).alias("rds_severity"),
    _binary_status("GV Status (P2 P3 primers)", "Pos", "Neg").alias("gv_status"),
)

phenos_df.head()


In [None]:
# Histogram of the raw severity scores; rows with a null score are left out.
phenos_df_filtered = phenos_df.filter(
    pl.col("rds_severity").is_not_null()
)
fig = px.histogram(
    phenos_df_filtered.to_pandas(),
    x="rds_severity",
    nbins=20,
    title="RDS Severity Distribution",
)
fig.show()


In [None]:
from scipy.special import erfinv
from scipy.stats import norm
import pandas as pd

def inverse_normal_transform_scipy(series: pd.Series, c: float = 0.5) -> np.ndarray:
    """
    Apply a rank-based Inverse Normal Transformation to a pandas Series.

    Ranks are converted to quantiles with ``(rank - c) / (n - 2*c + 1)``,
    where ``n`` is the number of non-missing values, and the quantiles are
    passed through the standard normal quantile function.

    Parameters
    ----------
    series : pd.Series
        Values to transform. Tied values receive the average of their ranks.
        NaNs are excluded from the ranking and remain NaN in the output.
    c : float, default 0.5
        Rank offset. The default reproduces ``(rank - 0.5) / n``;
        ``c=3/8`` gives the Blom offset.

    Returns
    -------
    np.ndarray
        Transformed values, in the same order as ``series``. This is the
        NumPy array produced by ``norm.ppf``, not a pandas Series.
    """
    # Rank the data. Higher values get higher ranks.
    # NaNs are kept as NaNs and don't affect the ranking of other values.
    ranked = series.rank(method='average', na_option='keep')

    # Number of non-missing values
    n = series.notna().sum()

    # Convert ranks to quantiles; with c = 0.5 the denominator is exactly n
    quantiles = (ranked - c) / (n - 2 * c + 1)

    # Standard normal quantile function
    return norm.ppf(quantiles)

# Add the transformed severity as a new column, rds_severity_int.
# The score column goes through pandas because the transform ranks a pandas Series.
phenos_df = phenos_df.with_columns(
    pl.Series(
        name="rds_severity_int",
        values=inverse_normal_transform_scipy(
            phenos_df.get_column("rds_severity").to_pandas()
        ),
    )
)


In [None]:
# Let's visualize rds_severity_int (the inverse-normal-transformed score).
# Rows are still restricted to those with a raw severity score.
phenos_df_filtered = phenos_df.filter(pl.col("rds_severity").is_not_null())
fig = px.histogram(
    phenos_df_filtered.to_pandas(),
    x="rds_severity_int",
    nbins=20,
    # The title previously repeated the raw-score plot's title.
    title="RDS Severity Distribution (Inverse Normal Transformed)",
)
fig.show()


In [None]:
import pandas as pd
import polars as pl
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

# Q-Q plot of the transformed severity against a standard normal.
# Requires phenos_df to already carry the 'rds_severity_int' column.

# Pull the column out as a pandas Series and drop missing values before plotting.
y = phenos_df["rds_severity_int"].to_pandas().dropna()

# Optional styling; applied before the figure is created.
plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 6))

# probplot draws the sample quantiles against the theoretical ones on `ax`.
stats.probplot(y, dist="norm", plot=ax)

# Labels
ax.set_title(
    "Q-Q Plot of Inverse Normal Transformed Severity",
    fontsize=16,
)
ax.set_xlabel(
    "Theoretical Quantiles (Standard Normal)",
    fontsize=12,
)
ax.set_ylabel(
    "Sample Quantiles (rds_severity_int)",
    fontsize=12,
)

plt.show()

In [None]:
# Kernel density estimate of the transformed severity.
plt.figure(figsize=(10, 6))
density_ax = sns.kdeplot(
    data=phenos_df.to_pandas(),
    x="rds_severity_int",
    fill=True,
)
# kdeplot returns the axes it drew on; label that axes directly.
density_ax.set_title("Density Plot of Transformed Severity", fontsize=16)
density_ax.set_xlabel("rds_severity_int", fontsize=12)
density_ax.set_ylabel("Density", fontsize=12)
density_ax.grid(True)
plt.show()

In [None]:
# Drop rds_severity as we have the int version now.
# Guarded so that re-running this cell, after the column has already been
# removed, does not raise on the missing column.
if "rds_severity" in phenos_df.columns:
    phenos_df = phenos_df.drop("rds_severity")

# Save to file, as TSV
phenos_df.write_csv("hoiho_phenotypes_15Sept2025.tsv", separator="\t")

In [None]:
# How many RDS status vs not?
# pl.len() replaces the deprecated no-argument pl.count(); the alias keeps the
# result column named "count" as before.
phenos_df.group_by("rds_status").agg(pl.len().alias("count")).sort("rds_status")
