In [1]:
import pandas as pd
from ete3 import NCBITaxa

## get picked taxid with its genetic code

In [2]:
# Summary table of the picked assemblies (taxid, genetic_code, ftp_basename, ...).
filepath = "~/altorf/genome/speciespick/picked_assembly_summary_code.csv"
df = pd.read_csv(filepath)
print(df.shape)
df.head(5)

(1615, 8)


Unnamed: 0,asm_name,assembly_accession,domain,ftp_basename,ftp_path,organism_name,taxid,genetic_code
0,ASM2238v1,GCF_000022385.1,archaea,GCF_000022385.1_ASM2238v1,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,Sulfolobus islandicus L.S.2.15,429572,11
1,ASM97008v1,GCF_000970085.1,archaea,GCF_000970085.1_ASM97008v1,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,Methanosarcina siciliae T4/M,1434120,11
2,ASM734v1,GCF_000007345.1,archaea,GCF_000007345.1_ASM734v1,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,Methanosarcina acetivorans C2A,188937,11
3,ASM35030v1,GCF_000350305.1,archaea,GCF_000350305.1_ASM35030v1,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,Thermoplasmatales archaeon BRNA1,1054217,11
4,ASM96990v1,GCF_000969905.1,archaea,GCF_000969905.1_ASM96990v1,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,Methanosarcina vacuolata Z-761,1434123,11


## get corresponding parent taxids

In [3]:
def get_desired_ranks(taxid, desired_ranks):
    """Map each requested rank name to the taxid holding that rank in a lineage.

    Parameters
    ----------
    taxid
        Taxid whose lineage is looked up through the module-level ``ncbi`` handle.
    desired_ranks
        Iterable of rank names to report.

    Returns
    -------
    dict
        ``{rank: taxid_at_that_rank}`` with one entry per name in
        ``desired_ranks``; a rank that does not occur in the lineage maps to ``-1``.
    """
    rank_by_member = ncbi.get_rank(ncbi.get_lineage(taxid))

    # Invert member -> rank into rank -> member. If several lineage members
    # share a rank name, the one iterated last is kept.
    member_by_rank = {}
    for member, rank_name in rank_by_member.items():
        member_by_rank[rank_name] = member

    picked = {}
    for rank_name in desired_ranks:
        picked[rank_name] = member_by_rank.get(rank_name, -1)
    return picked

# Taxonomy handle from ete3; get_desired_ranks reads this module-level name.
ncbi = NCBITaxa()
# NOTE(review): the recorded output shows -1 for every 'kingdom' value, i.e. no
# lineage here carries a rank literally named 'kingdom' -- confirm whether a
# different top-level rank name was intended.
desired_ranks = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']

# One record per picked taxid: its rank -> ancestor-taxid mapping plus the taxid itself.
dct_lst = [
    dict(get_desired_ranks(taxid, desired_ranks), taxid=taxid)
    for taxid in df["taxid"]
]
tax_df = pd.DataFrame(dct_lst)[["taxid"] + desired_ranks]
print(tax_df.shape)
tax_df.head()

(1615, 8)


Unnamed: 0,taxid,kingdom,phylum,class,order,family,genus,species
0,429572,-1,28889,183924,2281,118883,2284,43080
1,1434120,-1,28890,224756,94695,2206,2207,38027
2,188937,-1,28890,224756,94695,2206,2207,2214
3,1054217,-1,28890,183967,2301,-1,-1,1054217
4,1434123,-1,28890,224756,94695,2206,2207,2215


In [5]:
# Distinct values per column; the -1 placeholder for a missing rank counts as a value.
for rank in ["taxid"] + desired_ranks:
    n_distinct = len(tax_df[rank].unique())
    print("{0}: {1}".format(rank, n_distinct))

taxid: 1615
kingdom: 1
phylum: 37
class: 69
order: 155
family: 314
genus: 794
species: 1577


Duplicates exist even in the species column (1577 distinct values for 1615 taxids); this is because taxids were previously not assigned uniquely per species.

## get AveNSRF information

In [6]:
# Per-taxid table with the count_real and count_sim columns used for the "diff" below.
countFilepath = "~/altorf/genome/patternanalyze/summarize_count.csv"
count_df = pd.read_csv(countFilepath)
print(count_df.shape)
count_df.head(5)

(1615, 3)


Unnamed: 0,taxid,count_real,count_sim
0,429572,0.425893,0.427635
1,1434120,0.463257,0.442056
2,188937,0.448898,0.429019
3,1054217,1.094285,0.843878
4,1434123,0.410412,0.401882


## get compositional information

In [7]:
compFilepath = "~/altorf/genome/preprocess/out/summary_composition.csv"
comp_df = pd.read_csv(compFilepath)
# Retain only the rows labelled "chromosome" (i.e. exclude plasmid records).
is_chromosome = comp_df["dna_type"] == "chromosome"
comp_df = comp_df.loc[is_chromosome]
print(comp_df.shape)
comp_df.head()

(1615, 100)


Unnamed: 0,ftp_basename,dna_type,G+C,num_ambiguous,per_ambiguous,A,C,G,T,B,...,TCG,TCT,TGA,TGC,TGG,TGT,TTA,TTC,TTG,TTT
0,GCF_000022385.1_ASM2238v1,chromosome,0.35111,0,0.0,888038,475131,485602,887501,,...,14898,59802,45239,25365,35234,33745,95524,58476,47604,96167
1,GCF_000970085.1_ASM97008v1,chromosome,0.429404,0,0.0,1420708,1073050,1081510,1442290,,...,52223,96142,93249,75156,63340,72350,91433,129173,89002,192140
2,GCF_000007345.1_ASM734v1,chromosome,0.426809,0,0.0,1638004,1228410,1226378,1658700,,...,60400,112816,108481,86152,72519,82104,104484,149528,102469,220277
3,GCF_000350305.1_ASM35030v1,chromosome,0.58303,0,0.0,306724,427580,424288,302513,,...,37083,22944,22468,21107,24589,17132,5507,26971,14986,9466
4,GCF_000969905.1_ASM96990v1,chromosome,0.397795,0,0.0,1340341,902862,889505,1373044,,...,39862,94306,85991,64803,56747,65669,99640,117838,85862,191079


## merge the dataframes into out_df

In [8]:
# Attach the per-taxid counts to the taxonomy table and derive their difference.
out_df = pd.merge(tax_df, count_df, on="taxid", how="left")
out_df["diff"] = out_df["count_real"] - out_df["count_sim"]

# comp_df is keyed by ftp_basename rather than taxid, so ftp_basename is first
# brought in from df and then used as the join key for the G+C column
# (the original author flagged this two-step join as "!TBI!").
out_df = pd.merge(
    out_df,
    df[["taxid", "ftp_basename", "organism_name", "genetic_code"]],
    on="taxid",
    how="left",
)
out_df = pd.merge(out_df, comp_df[["ftp_basename", "G+C"]], on="ftp_basename", how="left")

# A left merge repeats a left-hand row once per matching right-hand row, so a
# duplicated key in any right-hand table would silently inflate out_df and bias
# every statistic computed from it. Fail loudly instead of carrying on.
assert len(out_df) == len(tax_df), (
    "merge changed the row count: {0} -> {1}; check for duplicated "
    "taxid / ftp_basename keys".format(len(tax_df), len(out_df))
)
print(out_df.shape)
out_df.head()

(1615, 15)


Unnamed: 0,taxid,kingdom,phylum,class,order,family,genus,species,count_real,count_sim,diff,ftp_basename,organism_name,genetic_code,G+C
0,429572,-1,28889,183924,2281,118883,2284,43080,0.425893,0.427635,-0.001742,GCF_000022385.1_ASM2238v1,Sulfolobus islandicus L.S.2.15,11,0.35111
1,1434120,-1,28890,224756,94695,2206,2207,38027,0.463257,0.442056,0.021201,GCF_000970085.1_ASM97008v1,Methanosarcina siciliae T4/M,11,0.429404
2,188937,-1,28890,224756,94695,2206,2207,2214,0.448898,0.429019,0.019879,GCF_000007345.1_ASM734v1,Methanosarcina acetivorans C2A,11,0.426809
3,1054217,-1,28890,183967,2301,-1,-1,1054217,1.094285,0.843878,0.250406,GCF_000350305.1_ASM35030v1,Thermoplasmatales archaeon BRNA1,11,0.58303
4,1434123,-1,28890,224756,94695,2206,2207,2215,0.410412,0.401882,0.008531,GCF_000969905.1_ASM96990v1,Methanosarcina vacuolata Z-761,11,0.397795


# summarize information on each genus

In [9]:
# Build one summary record per distinct genus taxid found in tax_df.
genus_lst = list(set(tax_df["genus"]))
dct_lst = []
for genus in genus_lst:
    if genus == -1:
        # -1 is the placeholder used when a lineage has no genus-rank member.
        continue
    filtered_df = out_df[out_df["genus"] == genus]
    dct = {
        "genus": genus,
        "name": ncbi.get_taxid_translator([genus])[genus],
        "#": len(filtered_df),
        "mean_diff": filtered_df["diff"].mean(),
        "mean_G+C": filtered_df["G+C"].mean(),
        # 11 only when every member of the genus uses genetic code 11; otherwise -1.
        "genetic_code": 11 if (filtered_df["genetic_code"] == 11).all() else -1,
    }
    dct_lst.append(dct)
genus_df = pd.DataFrame(dct_lst)
genus_df.head()

Unnamed: 0,#,genetic_code,genus,mean_G+C,mean_diff,name
0,1,11,2050,0.554133,0.044594,Mobiluncus
1,3,11,2053,0.673006,0.142491,Gordonia
2,1,11,6,0.67319,0.099512,Azorhizobium
3,1,11,2056,0.681094,0.033561,Sphaerobacter
4,1,11,10,0.519912,0.043912,Cellvibrio


In [11]:
# Largest genera first, ties broken by descending mean_diff; then fix the column order.
genus_df = (
    genus_df
    .sort_values(by=["#", "mean_diff"], ascending=False)
    .loc[:, ["genus", "name", "#", "genetic_code", "mean_G+C", "mean_diff"]]
)
genus_df.head(20)

Unnamed: 0,genus,name,#,genetic_code,mean_G+C,mean_diff
687,1716,Corynebacterium,35,11,0.609289,0.072668
20,2093,Mycoplasma,30,-1,0.287322,2e-05
641,1578,Lactobacillus,27,11,0.410088,-0.000174
753,1883,Streptomyces,21,11,0.722731,0.170975
568,1386,Bacillus,21,11,0.40967,0.003619
607,1485,Clostridium,20,11,0.309281,0.004572
147,286,Pseudomonas,18,11,0.629517,0.141646
706,1763,Mycobacterium,18,11,0.670261,0.103016
543,1301,Streptococcus,15,11,0.39005,-0.00012
482,1129,Synechococcus,14,11,0.555711,0.041626


In [12]:
# Persist the sorted per-genus summary without the row index.
genus_df.to_csv(
    "../out/genus.csv",
    index=False,
)

In [15]:
# Keep genera whose "#" (number of rows in out_df for that genus) is at least 5.
has_enough_members = genus_df["#"] >= 5
targetedGenus_df = genus_df.loc[has_enough_members]
print(targetedGenus_df.shape)
targetedGenus_df.head()

(59, 6)


Unnamed: 0,genus,name,#,genetic_code,mean_G+C,mean_diff
687,1716,Corynebacterium,35,11,0.609289,0.072668
20,2093,Mycoplasma,30,-1,0.287322,2e-05
641,1578,Lactobacillus,27,11,0.410088,-0.000174
753,1883,Streptomyces,21,11,0.722731,0.170975
568,1386,Bacillus,21,11,0.40967,0.003619


In [16]:
# Write the out_df rows of each targeted genus to its own file, named by genus taxid.
for genus in targetedGenus_df["genus"]:
    genus_rows = out_df.loc[out_df["genus"] == genus]
    genus_rows.to_csv("../out/genus_{}.txt".format(genus), index=False)

In [17]:
# Plain one-ID-per-line list of the targeted genus taxids. header=False is
# spelled out because the default `header` of Series.to_csv differs between
# pandas releases; relying on the default can put a "genus" header line into
# the list, which could be mistaken for an ID by whatever reads the file.
targetedGenus_df["genus"].to_csv("target_genus.list", index=False, header=False)

genus_id = 872, 286, and 1678 seem good for test-case analysis.