In [1]:
import polars as pl
import pandas as pd
import re
import numpy as np
from collections import defaultdict
import portion as P
import re

In [2]:
gff3_columns = ["Landmark", "Source", "Type", "Start", "End", "Strand", "Attributes"]

gff3 = pl.read_csv("cleaned.gff3", comment_char = "#", separator="\t", has_header=False, new_columns=gff3_columns, columns=[0, 1, 2, 3, 4, 6, 8])
genes = gff3.filter(pl.col("Type") == "gene")
cds = gff3.filter(pl.col("Type") == "CDS")

# Get the gene ID
matcher = re.compile(r'GeneID:([\d]+)[,;]')

genes.head()

Landmark,Source,Type,Start,End,Strand,Attributes
str,str,str,i64,i64,str,str
"""S1""","""Gnomon""","""gene""",25222,29472,"""+""","""ID=gene-LOC115..."
"""S1""","""Gnomon""","""gene""",32484,39309,"""+""","""ID=gene-CCDC16..."
"""S1""","""Gnomon""","""gene""",41685,71311,"""+""","""ID=gene-MAPK15..."
"""S1""","""Gnomon""","""gene""",65399,90205,"""-""","""ID=gene-FAM83H..."
"""S1""","""Gnomon""","""gene""",110408,211712,"""-""","""ID=gene-SCRIB;..."


In [3]:
# Add Gene ID as a separate column

def extract(s):
    match = matcher.search(s)
    if match:
        return match.group(1)
    else:
        return ""

gene_ids = genes.apply(lambda r: extract(r[6]))
genes = genes.with_columns([
    (gene_ids[:, 0]).alias("ID"),
])
genes.head()

Landmark,Source,Type,Start,End,Strand,Attributes,ID
str,str,str,i64,i64,str,str,str
"""S1""","""Gnomon""","""gene""",25222,29472,"""+""","""ID=gene-LOC115...","""115603991"""
"""S1""","""Gnomon""","""gene""",32484,39309,"""+""","""ID=gene-CCDC16...","""115603988"""
"""S1""","""Gnomon""","""gene""",41685,71311,"""+""","""ID=gene-MAPK15...","""115603977"""
"""S1""","""Gnomon""","""gene""",65399,90205,"""-""","""ID=gene-FAM83H...","""115603962"""
"""S1""","""Gnomon""","""gene""",110408,211712,"""-""","""ID=gene-SCRIB;...","""115603994"""


In [14]:
matcher_name = re.compile(r'protein_id=(XP_[\w\d\.]+)[,;\s$]?')#[\w\d\._]+)[,;\$\s]')

def extract_names(s):
    match = matcher_name.search(s)
    if match:
        return match.group(1)
    else:
        return ""

cds_ids = cds.apply(lambda r: extract(r[6]))
names = cds.apply(lambda r: extract_names(r[6]))

cds = cds.with_columns([
    (cds_ids[:, 0]).alias("ID"),
    (names[:, 0]).alias("Name"),
])
cds.head()
cds_map = cds.select((pl.col("ID"), pl.col("Name")))
geneid_to_xp = {k: v for k, v in cds_map.iter_rows()}
cds_map.head()

ID,Name
str,str
"""115603988""","""XP_030332433.1..."
"""115603977""","""XP_030332421.1..."
"""115603977""","""XP_030332421.1..."
"""115603977""","""XP_030332421.1..."
"""115603977""","""XP_030332421.1..."


# Load up MDS1 / MDS2 outlier regions from Local PCA Analysis
See ../ANALYSIS_localpca

In [15]:
mds1 = pl.read_csv("../ANALYSIS_localpca/mds1.highlight", sep="\t", has_header=False, new_columns=["Landmark", "Start", "End"])
mds2 = pl.read_csv("../ANALYSIS_localpca/mds2.highlight", sep="\t", has_header=False, new_columns=["Landmark", "Start", "End"])

# Process Gene Names for TASSEL Outputs

In [32]:
gff = pd.read_csv(
    "/mnt/data/Kakapo/converted.sorted.gff3", comment="#", sep="\t", header=None
)
intervals = defaultdict(list)
intervals_map = defaultdict(P.IntervalDict)

#gene_name = re.compile("Name=(.+?);")
gene_name = re.compile("GeneID:(\d+?);")

for n, row in gff.iterrows():
    if row[2] == "gene":
        matched = gene_name.findall(row[8])
        if matched != None:
            intervals_map[row[0]][P.open(row[3], row[4])] = matched[0]
        else:
            intervals_map[row[0]][P.open(row[3], row[4])] = "__"

In [34]:
def test_in_gene(landmark, start, end):
    interval = P.closed(start, end)
    
    for i in intervals_map[landmark]:
        intersects = i.intersection(interval)
        if len(intersects) > 0:
            return intervals_map[landmark][intersects]
#    if x["Pos"] in intervals_map[x['Chr']]:
#        return intervals_map[x['Chr']][x["Pos"]]
    return "Intergenic"

In [55]:
mds_outliers = list()

for landmark, start, end in mds1.iter_rows():
    intersections = test_in_gene(landmark, start, end)
    if intersections != "Intergenic":
        geneid = intersections.values()[0]
        if len(geneid) > 0 and geneid in geneid_to_xp:
            mds_outliers.append(geneid_to_xp[geneid])

In [59]:
with open("mds1_outliers.txt", "w") as fh:
    for i in list(set(mds_outliers)):
        fh.write(i)
        fh.write("\n")

In [60]:
mds_outliers = list()

for landmark, start, end in mds2.iter_rows():
    intersections = test_in_gene(landmark, start, end)
    if intersections != "Intergenic":
        geneid = intersections.values()[0]
        if len(geneid) > 0 and geneid in geneid_to_xp:
            mds_outliers.append(geneid_to_xp[geneid])

In [61]:
with open("mds2_outliers.txt", "w") as fh:
    for i in list(set(mds_outliers)):
        fh.write(i)
        fh.write("\n")

# Process TASSEL Outputs

In [176]:
# file = "gwas_p3d_off_11Oct/slow_crop_gwas_snps_filtered_minobs_60_minaf_0.02.hapmap_+_slow_crop_stats.txt"
# file = "gwas_p3d_off_11Oct/clutch_size_gwas_snps_filtered_minobs_60_minaf_0.02.hapmap_+_clutch_size_stats.txt"
# file = "gwas_p3d_off_11Oct/growth_curve_means_no_min_filter_no_mds_gwas_snps_filtered_minobs_60_minaf_0.02.hapmap_+_weights_stats.txt"
# file = "gwas_p3d_off_11Oct/asp_offspring_pheno_gwas_snps_filtered_minobs_60_minaf_0.02.hapmap_+_asp_offspring_pheno_stats.txt"
# file = "gwas_p3d_off_11Oct/asp_pheno_gwas_snps_filtered_minobs_60_minaf_0.02.hapmap_+_asp_pheno_stats.txt"
# file = "gwas_p3d_off_11Oct/slow_crop_embedding_gwas_snps_filtered_minobs_60_minaf_0.02.hapmap_+_slow_crop_embedding_stats.txt"
# file = "gwas_p3d_off_11Oct/surviving_embryos_gwas_snps_filtered_minobs_60_minaf_0.02.hapmap_+_FertileEggs_stats.txt"
# file = "gwas_p3d_off_11Oct/surviving_embryos_gwas_snps_filtered_minobs_60_minaf_0.02.hapmap_+_surviving_embryos_stats.txt"
# file = "gwas_p3d_off_11Oct/egg_shape_ratio_gwas_snps_filtered_minobs_60_minaf_0.02.hapmap_+_egg_shape_ratio_0centered_stats.txt"

# file = "clo_gwas_p3d_off/cloacitis_offspring_pheno_filtered_+_cloacitis_offspring_pheno_stats.txt"

# file = "gwas_p3d_off_31May2022/clutch_size_gwas_snps_filtered_minobs_60_minaf_0.02.hapmap_+_clutch_size_new_stats.txt"
#file = "gwas_p3d_off_9JuneMay2022/clutch_size_gwas_snps_filtered_minobs_60_minaf_0.02.hapmap_+_clutch_size_new_randomonly_stats.txt"

# FINAL ANALYSES!!
#file = "/home/josephguhlin/development/kakapo-phenos/gwas_p3d_off_11Oct/egg_shape_ratio_gwas_snps_filtered_minobs_60_minaf_0.02.hapmap_+_egg_shape_ratio_0centered_stats.txt"
#phenotype = "egg shape index"

#file = "/home/josephguhlin/development/kakapo-phenos/gwas_p3d_off_5AugMay2022/clutch_size_gwas_snps_filtered_minobs_60_minaf_0.02.hapmap_+_clutch_size_new_randomonly_stats.txt"
#phenotype = "clutch size"

#file = "gwas_p3d_off_19Aug/growth_curve_gwas_snps_filtered_minobs_60_minaf_0.02.hapmap_+_weights_stats.txt"
file = "/home/josephguhlin/development/kakapo-phenos/gwas_p3d_off_11Oct/growth_curve_means_no_min_filter_no_mds_gwas_snps_filtered_minobs_60_minaf_0.02.hapmap_+_weights_stats.txt"
phenotype = "growth traits"

#file = "/home/josephguhlin/development/kakapo-phenos/gwas_p3d_off_11Oct/fertile_eggs_gwas_snps_filtered_minobs_60_minaf_0.02.hapmap_+_FertileEggs_8Aug_stats.txt"
#phenotype = "fertile eggs"

#file = "/home/josephguhlin/development/kakapo-phenos/gwas_p3d_off_12Aug/asp_offspring_pheno_gwas_snps_filtered_minobs_60_minaf_0.02.hapmap_+_asp_pheno_stats.txt"
#phenotype = "aspergillosis"

#file = "/home/josephguhlin/development/kakapo-phenos/gwas_p3d_off_12Aug/surviving_embryos_gwas_snps_filtered_minobs_60_minaf_0.02.hapmap_+_surviving_embryos_new_stats.txt"
#phenotype = "surviving embryos"

odata = pl.read_csv(file, sep="\t").to_pandas()
data = odata

In [183]:
#data = odata[odata["Trait"] == "b1"]
#phenotype = "Gompertz_b1"

data = odata[odata["Trait"] == "M"]
phenotype = "Gompertz_M"

#data = odata[odata["Trait"] == "b0"]
#phenotype = "Gompertz_b0"

#data = odata

In [184]:
data["-log10p"] = -np.log10(data["p"])
data["dom-log10p"] = -np.log10(data["dom_p"])
data_filtered = data[data["-log10p"] >= 3.0]
data_filtered = data_filtered.sort_values(by="-log10p", ascending=False)
data_filtered["rank"] = range(len(data_filtered))
data_dom_filtered = data[data["dom-log10p"] >= 3.0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["-log10p"] = -np.log10(data["p"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["dom-log10p"] = -np.log10(data["dom_p"])


In [185]:
sorted_data = data_filtered.sort_values("-log10p", ascending=False)[0:1000]
sorted_data["gene"] = sorted_data.apply(test_in_gene, axis=1)

sorted_data = sorted_data[["rank", "gene", "Marker", "Chr", "Pos", "p", "add_effect", "add_p", "add_F", "dom_effect", "dom_p", "dom_F", "errordf", "MarkerR2", "Genetic Var", "Residual Var", "-2LnLikelihood"]]
# sorted_data.to_csv("{}_gwas_snps.csv".format(phenotype), sep="\t", index=False)


In [186]:
#sorted_data[0:50]

In [187]:
top1k_genes = list()

for n,row in sorted_data.iterrows():
    if row['gene'] != 'Intergenic' and row['gene'] in geneid_to_xp:
        top1k_genes.append(geneid_to_xp[row['gene']])
        
len(set(top1k_genes))

124

In [188]:
with open("{}_1ksnps.txt".format(phenotype), "w") as fh:
    for i in list(set(top1k_genes)):
        fh.write(i)
        fh.write("\n")
phenotype

'Gompertz_M'

In [189]:
# data_dom_filtered[data_dom_filtered['Chr'] == 'S8'].sort_values("dom-log10p")[-20:]
#data_filtered[data_filtered["Chr"] == "S20"].sort_values("-log10p")[-50:]
#v = data_filtered.sort_values("-log10p", ascending=False)
#v["gene"] = v.apply(test_in_gene, axis=1)
#v[["rank", "gene", "Marker", "Chr", "Pos", "p", "add_effect", "add_p", "dom_effect", "dom_p",]]

In [175]:
geneid_to_xp['115615659']

'XP_030359938.1'