In [2]:
import os
import re
import scanpy as sc
import pandas as pd
import numpy as np
from Bio import SeqIO as si

### This script creates a list of 1:1:1 orthologs gene between human, mouse and bat (Rousettus aegyptiacus).
### The script takes CDS fasta file and EggNOG's run results for each species and returns a dataframe containing all genes that have an identical EggNOG gene name.

#### Loading and Extracting genes data

In [3]:
# load mouse annotation file from Eggnog run results.
mouse_annotations = pd.read_csv(r"C:\Users\TzachiHNB6\Documents\eggnog\mouse_query_92_1.fa.emapper.annotations", delimiter="\t")

In [4]:
# load human annotation file from Eggnog run results.
human_annotations  = pd.read_csv(r"C:\Users\TzachiHNB6\Documents\eggnog\human_query_92_1.fa.emapper.annotations", delimiter="\t")

In [5]:
# load bat annotation file from Eggnog run results.
bat1k_annotations = pd.read_csv(r"C:\Users\TzachiHNB6\Documents\eggnog\bat1k_query_1.fa.emapper.annotations", delimiter="\t")

In [78]:
# This function goes over all cds record and extracts record id (that will be used for intersection with Eggnog annotations file)
# gene id and gene name (if exists)
def extract_cds_data(cds_path, gene_id_regex = None, gene_name_regex = None):
    gene_ids = []
    gene_names = []
    fasta_record_ids = []
    with open(cds_path, "r") as cds:
        sequences = si.parse(cds, "fasta")
        for fasta in sequences:
            fasta_record_ids.append(fasta.id)
            if gene_id_regex != None:
                gene_ids.append(re.search(gene_id_regex, fasta.description).group(1))
            if gene_name_regex != None:
                gene_names.append(re.search(gene_name_regex, fasta.description).group(1))
    if gene_id_regex != None and gene_name_regex != None:
        return pd.DataFrame({'record_id' : fasta_record_ids, 'gene_id': gene_ids, 'gene_name': gene_names})
    if gene_id_regex != None:
        return pd.DataFrame({'record_id' : fasta_record_ids, 'gene_id': gene_ids})
    return pd.DataFrame({'record_id' : fasta_record_ids, 'gene_name': gene_names})
    

            

In [92]:
cds_path = r"C:\Users\TzachiHNB6\Documents\eggnog\genoms\mouse_92_cds\Mus_musculus.GRCm38.cds.all.fa"
mouse_data = extract_cds_data(cds_path, "gene:([^ ]+)", "gene_symbol:([^ ]+)")

In [93]:
mouse_data

Unnamed: 0,record_id,gene_id,gene_name
0,ENSMUST00000196221.1,ENSMUSG00000096749.2,Trdd1
1,ENSMUST00000177564.1,ENSMUSG00000096176.1,Trdd2
2,ENSMUST00000178537.1,ENSMUSG00000095668.1,Trbd1
3,ENSMUST00000178862.1,ENSMUSG00000094569.1,Trbd2
4,ENSMUST00000179520.1,ENSMUSG00000094028.1,Ighd4-1
...,...,...,...
65465,ENSMUST00000201577.1,ENSMUSG00000107104.3,Nrxn2
65466,ENSMUST00000201188.3,ENSMUSG00000107104.3,Nrxn2
65467,ENSMUST00000200719.1,ENSMUSG00000107104.3,Nrxn2
65468,ENSMUST00000202867.3,ENSMUSG00000107099.3,Slc22a12


In [90]:
cds_path = r"C:\Users\TzachiHNB6\Documents\eggnog\genoms\human_92_cds\Homo_sapiens.GRCh38.cds.all.fa"
human_data = extract_cds_data(cds_path, "gene:([^ ]+)", "gene_symbol:([^ ]+)")

In [91]:
human_data

Unnamed: 0,record_id,gene_id,gene_name
0,ENST00000434970.2,ENSG00000237235.2,TRDD2
1,ENST00000448914.1,ENSG00000228985.1,TRDD3
2,ENST00000415118.1,ENSG00000223997.1,TRDD1
3,ENST00000632684.1,ENSG00000282431.1,TRBD1
4,ENST00000631435.1,ENSG00000282253.1,TRBD1
...,...,...,...
107839,ENST00000645885.1,ENSG00000285075.1,TPK1
107840,ENST00000642257.1,ENSG00000285075.1,TPK1
107841,ENST00000643965.1,ENSG00000285075.1,TPK1
107842,ENST00000642712.1,ENSG00000285114.1,GSDMC


In [87]:
cds_path = r"C:\Users\TzachiHNB6\Documents\eggnog\genoms\bat_cds\cds_from_genomic.fna"
bat_data = extract_cds_data(cds_path, gene_name_regex = "gene=([^\]]+)]")

In [88]:
bat_data

Unnamed: 0,record_id,gene_name
0,lcl|NW_023416284.1_cds_XP_016000028.1_1,SAMD11
1,lcl|NW_023416284.1_cds_XP_016000029.1_2,SAMD11
2,lcl|NW_023416284.1_cds_XP_016000030.1_3,SAMD11
3,lcl|NW_023416284.1_cds_XP_016000033.2_4,NOC2L
4,lcl|NW_023416284.1_cds_XP_016000032.2_5,NOC2L
...,...,...
56471,lcl|NC_007393.1_cds_YP_313613.1_49055,ND4L
56472,lcl|NC_007393.1_cds_YP_313614.1_49056,ND4
56473,lcl|NC_007393.1_cds_YP_313615.1_49057,ND5
56474,lcl|NC_007393.1_cds_YP_313616.1_49058,ND6


#### Combine with eggnog results

In [85]:
def combine_with_eggnog(data,annotations, gene_id_col_name = "record_id"):
    for index,row in data.iterrows():
        values = annotations.loc[annotations["query"] == row[gene_id_col_name], "Preferred_name"].values
        if len(values) > 0:
            data.loc[index,"eggnog_name"] =  values[0]

In [94]:
combine_with_eggnog(bat_data, bat1k_annotations)
combine_with_eggnog(human_data, human_annotations)
combine_with_eggnog(mouse_data, mouse_annotations)

In [103]:
bat_data


Unnamed: 0,record_id,gene_name,eggnog_name
0,lcl|NW_023416284.1_cds_XP_016000028.1_1,SAMD11,SAMD11
1,lcl|NW_023416284.1_cds_XP_016000029.1_2,SAMD11,SAMD11
2,lcl|NW_023416284.1_cds_XP_016000030.1_3,SAMD11,SAMD11
3,lcl|NW_023416284.1_cds_XP_016000033.2_4,NOC2L,NOC2L
4,lcl|NW_023416284.1_cds_XP_016000032.2_5,NOC2L,NOC2L
...,...,...,...
56471,lcl|NC_007393.1_cds_YP_313613.1_49055,ND4L,ND4L
56472,lcl|NC_007393.1_cds_YP_313614.1_49056,ND4,
56473,lcl|NC_007393.1_cds_YP_313615.1_49057,ND5,ND5
56474,lcl|NC_007393.1_cds_YP_313616.1_49058,ND6,ND6


In [211]:
human_data

Unnamed: 0,record_id,gene_id,gene_name,eggnog_name
0,ENST00000434970.2,ENSG00000237235.2,TRDD2,
1,ENST00000448914.1,ENSG00000228985.1,TRDD3,
2,ENST00000415118.1,ENSG00000223997.1,TRDD1,
3,ENST00000632684.1,ENSG00000282431.1,TRBD1,
4,ENST00000631435.1,ENSG00000282253.1,TRBD1,
...,...,...,...,...
107839,ENST00000645885.1,ENSG00000285075.1,TPK1,TPK1
107840,ENST00000642257.1,ENSG00000285075.1,TPK1,TPK1
107841,ENST00000643965.1,ENSG00000285075.1,TPK1,
107842,ENST00000642712.1,ENSG00000285114.1,GSDMC,GSDMC


In [212]:
mouse_data

Unnamed: 0,record_id,gene_id,gene_name,eggnog_name
0,ENSMUST00000196221.1,ENSMUSG00000096749.2,Trdd1,
1,ENSMUST00000177564.1,ENSMUSG00000096176.1,Trdd2,
2,ENSMUST00000178537.1,ENSMUSG00000095668.1,Trbd1,
3,ENSMUST00000178862.1,ENSMUSG00000094569.1,Trbd2,
4,ENSMUST00000179520.1,ENSMUSG00000094028.1,Ighd4-1,
...,...,...,...,...
65465,ENSMUST00000201577.1,ENSMUSG00000107104.3,Nrxn2,NRXN2
65466,ENSMUST00000201188.3,ENSMUSG00000107104.3,Nrxn2,NRXN2
65467,ENSMUST00000200719.1,ENSMUSG00000107104.3,Nrxn2,NRXN2
65468,ENSMUST00000202867.3,ENSMUSG00000107099.3,Slc22a12,SLC22A12


#### Clean data

In [188]:
bat_data_copy = bat_data.copy()
human_data_copy = human_data.copy()
mouse_data_copy = mouse_data.copy()

In [189]:
def clean_data(data):
    # run upper-case on gene names and remove redundent column
    data["eggnog_name"] = [x.upper() for x in data["eggnog_name"]]
    data["gene_name"] = [x.upper() for x in data["gene_name"]]
    data.drop(columns = ["record_id"], inplace=True)
    return data

In [190]:
bat_data_copy = clean_data(bat_data_copy)
mouse_data_copy = clean_data(mouse_data_copy)
human_data_copy = clean_data(human_data_copy)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["eggnog_name"] = [x.upper() for x in data["eggnog_name"]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["gene_name"] = [x.upper() for x in data["gene_name"]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [191]:
bat_data_copy

Unnamed: 0,gene_name,eggnog_name
0,SAMD11,SAMD11
1,SAMD11,SAMD11
2,SAMD11,SAMD11
3,NOC2L,NOC2L
4,NOC2L,NOC2L
...,...,...
56466,COX2,COX2
56468,ATP6,ATP6
56471,ND4L,ND4L
56473,ND5,ND5


In [192]:
# remove version from gene id in human and mouse
mouse_data_copy["gene_id"] = [x.split(".")[0] for x in mouse_data_copy["gene_id"]]
human_data_copy["gene_id"] = [x.split(".")[0] for x in human_data_copy["gene_id"]]

In [193]:
# remove any gene that has multiple gene ids or has no eggnog name
def remove_multiple_gene_ids(data, gene_id_col = "gene_id"):
    data = data[(data["eggnog_name"].notnull())& (data["eggnog_name"] != "-")]
    data_grouped = data.groupby("eggnog_name").agg(set).reset_index() # group by eggnog name
    data_grouped = data_grouped[data_grouped[gene_id_col].map(len) == 1] # remove genes with more than 1 eggnog name
    data_grouped[gene_id_col] = [list(x)[0] for x in data_grouped[gene_id_col]]
    return data_grouped

In [194]:
#remove_multiple_gene_ids(bat_data_copy,"gene_name")
bat_data_copy = remove_multiple_gene_ids(bat_data_copy,"gene_name")
human_data_copy = remove_multiple_gene_ids(human_data_copy,"gene_id")
mouse_data_copy  = remove_multiple_gene_ids(mouse_data_copy,"gene_id")

In [196]:
# change the gene name from a single string set to a single string
human_data_copy["gene_name"] = [list(x)[0] for x in human_data_copy["gene_name"]]
mouse_data_copy["gene_name"] = [list(x)[0] for x in mouse_data_copy["gene_name"]]

In [205]:
#change gene_name and gene_id column names to be species specific
bat_data_copy.rename(columns={"gene_name": "bat_gene_name"}, inplace=True)
human_data_copy.rename(columns={"gene_name": "human_gene_name", "gene_id":"human_gene_id"}, inplace=True)
mouse_data_copy.rename(columns={"gene_name": "mouse_gene_name", "gene_id":"mouse_gene_id"}, inplace=True)

In [208]:
mouse_data_copy

Unnamed: 0,eggnog_name,mouse_gene_id,mouse_gene_name
0,A1BG,ENSMUSG00000022347,A1BG
1,A1CF,ENSMUSG00000052595,A1CF
2,A2M,ENSMUSG00000030111,A2M
3,A3GALT2,ENSMUSG00000028794,A3GALT2
4,A4GALT,ENSMUSG00000047878,A4GALT
...,...,...,...
17466,ZYG11A,ENSMUSG00000034645,ZYG11A
17467,ZYG11B,ENSMUSG00000034636,ZYG11B
17468,ZYX,ENSMUSG00000029860,ZYX
17469,ZZEF1,ENSMUSG00000055670,ZZEF1


#### Creating 1:1:1 results file

In [209]:
# merge 3 genes datasets mkeeping only genes that contain an eggnog_name in all species, to get only genes that are 1:1:1 orthologs according to EggNOG DB
merged = pd.merge(pd.merge(bat_data_copy,mouse_data_copy,on='eggnog_name', how="inner"),human_data_copy,on='eggnog_name', how="inner")


In [210]:
merged

Unnamed: 0,eggnog_name,bat_gene_name,mouse_gene_id,mouse_gene_name,human_gene_id,human_gene_name
0,A1BG,A1BG,ENSMUSG00000022347,A1BG,ENSG00000121410,A1BG
1,A1CF,A1CF,ENSMUSG00000052595,A1CF,ENSG00000148584,A1CF
2,A2M,LOC107501662,ENSMUSG00000030111,A2M,ENSG00000175899,A2M
3,A3GALT2,A3GALT2,ENSMUSG00000028794,A3GALT2,ENSG00000184389,A3GALT2
4,A4GALT,A4GALT,ENSMUSG00000047878,A4GALT,ENSG00000128274,A4GALT
...,...,...,...,...,...,...
13947,ZXDC,ZXDC,ENSMUSG00000034430,ZXDC,ENSG00000070476,ZXDC
13948,ZYG11A,ZYG11A,ENSMUSG00000034645,ZYG11A,ENSG00000203995,ZYG11A
13949,ZYG11B,ZYG11B,ENSMUSG00000034636,ZYG11B,ENSG00000162378,ZYG11B
13950,ZZEF1,ZZEF1,ENSMUSG00000055670,ZZEF1,ENSG00000074755,ZZEF1


In [183]:
# save 1:1:1 orthologs to file
merged.to_csv(r"bat_mouse_human_integration\bat_mouse_human_orthologs.csv")