# Location to gene

## Preparation

In [1]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np

import json
import gffutils

In [3]:
project_path = Path().resolve().parent

## Load pangenome file

In [4]:
path_pangenome = project_path / "data" / "pangenome.tsv"

In [5]:
pangenome = pd.read_csv(path_pangenome, delimiter = "\t", header = None)
pangenome.columns = ["gene", "genome", "orthogroup"]
pangenome

Unnamed: 0,gene,genome,orthogroup
0,CP002034.1_1719,GCA_000143435.1,F00691_06
1,AEBA01000134.1_40,GCA_000179475.1,F00691_06
2,NBEP01000018.1_44,GCA_002079655.1,F00691_06
3,DONQ01000011.1_12,GCA_003513805.1,F00691_06
4,AFMN01000001.1_591,GCA_000215465.2,F00691_06
...,...,...,...
10096366,CABGJU010000006.1_8,GCA_902159415.1,F05974_2
10096367,CABGJU010000006.1_9,GCA_902159415.1,F10537_1
10096368,CABGJU010000007.1_1,GCA_902159415.1,F00820_1
10096369,CABGJU010000009.1_1,GCA_902159415.1,F21391_1


## Mobility frame

### Generate gene trees

In [6]:
pangenome = pangenome.iloc[0:120,:]
pangenome

Unnamed: 0,gene,genome,orthogroup
0,CP002034.1_1719,GCA_000143435.1,F00691_06
1,AEBA01000134.1_40,GCA_000179475.1,F00691_06
2,NBEP01000018.1_44,GCA_002079655.1,F00691_06
3,DONQ01000011.1_12,GCA_003513805.1,F00691_06
4,AFMN01000001.1_591,GCA_000215465.2,F00691_06
...,...,...,...
115,QFAS01000008.1_116,GCA_003129685.1,F00268_1
116,QSTB01000004.1_26,GCA_003438595.1,F00268_1
117,QSLH01000009.1_83,GCA_003470145.1,F00268_1
118,QRMK01000024.1_24,GCA_003472615.1,F00268_1


In [7]:
gff= "/Users/lorevansantvliet/Desktop/Thesis/Project/data/legen_v4_dereplicated_gffs/GCA_000006785.2.gff"

In [7]:
for orthogroup in pangenome.orthogroup.drop_duplicates():
    ortho=pangenome.loc[pangenome.orthogroup==orthogroup]
    for genome in ortho.genome.drop_duplicates():
        print(genome)
    print(ortho)

GCA_000143435.1
GCA_000179475.1
GCA_002079655.1
GCA_003513805.1
GCA_000215465.2
GCA_000217735.2
GCA_000260335.1
GCA_000529385.1
GCA_000758365.1
GCA_001011095.1
GCA_001063855.1
GCA_001067265.1
GCA_001435955.1
GCA_001723525.1
GCA_002079325.1
GCA_002079335.1
GCA_002079365.1
GCA_002079405.1
GCA_002079425.1
GCA_002079435.1
GCA_002079465.1
GCA_002079525.1
GCA_002079595.1
GCA_002079645.1
GCA_002079705.1
GCA_002079715.1
GCA_002079745.1
GCA_002079765.1
GCA_002079785.1
GCA_002079795.1
GCA_002079905.1
GCA_002079925.1
GCA_002135095.1
GCA_002159345.1
GCA_002160855.1
GCA_002161265.1
GCA_002162055.1
GCA_002250405.1
GCA_002289615.1
GCA_002289685.1
GCA_002289725.1
GCA_002289745.1
GCA_002289795.1
GCA_002289875.1
GCA_002553925.1
GCA_002554075.1
GCA_002738245.1
GCA_002848245.1
GCA_003129685.1
GCA_003316955.1
GCA_003438595.1
GCA_003470145.1
GCA_003472615.1
GCA_900094615.1
                  gene           genome orthogroup
0      CP002034.1_1719  GCA_000143435.1  F00691_06
1    AEBA01000134.1_40  GCA_000179

In [8]:
order = pangenome["gene"].str.split("_", n = 1, expand = True)
order[1] = pd.to_numeric(order[1])
order.columns = ["contig", "order"]
order

Unnamed: 0,contig,order
0,CP002034.1,1719
1,AEBA01000134.1,40
2,NBEP01000018.1,44
3,DONQ01000011.1,12
4,AFMN01000001.1,591
...,...,...
115,QFAS01000008.1,116
116,QSTB01000004.1,26
117,QSLH01000009.1,83
118,QRMK01000024.1,24


In [9]:
full_order = order.merge(pangenome, left_index= True, right_index= True)
full_order.sort_values(by=['contig','order'])

Unnamed: 0,contig,order,gene,genome,orthogroup
69,AEBA01000075.1,15,AEBA01000075.1_15,GCA_000179475.1,F00268_1
1,AEBA01000134.1,40,AEBA01000134.1_40,GCA_000179475.1,F00691_06
4,AFMN01000001.1,591,AFMN01000001.1_591,GCA_000215465.2,F00691_06
72,AFMN01000003.1,37,AFMN01000003.1_37,GCA_000215465.2,F00268_1
5,AFOI01000002.1,167,AFOI01000002.1_167,GCA_000217735.2,F00691_06
...,...,...,...,...,...
51,QSLH01000006.1,120,QSLH01000006.1_120,GCA_003470145.1,F00691_06
117,QSLH01000009.1,83,QSLH01000009.1_83,GCA_003470145.1,F00268_1
116,QSTB01000004.1,26,QSTB01000004.1_26,GCA_003438595.1,F00268_1
50,QSTB01000006.1,106,QSTB01000006.1_106,GCA_003438595.1,F00691_06


In [13]:
full_order.loc[full_order.genome=="GCA_002079745.1"]

Unnamed: 0,contig,order,gene,genome,orthogroup
26,NBEU01000019.1,14,NBEU01000019.1_14,GCA_002079745.1,F00691_06
93,NBEU01000021.1,107,NBEU01000021.1_107,GCA_002079745.1,F00268_1


In [12]:
gffutils.create_db(gff, "GCA_000006785.2_db")

OperationalError: table features already exists

In [13]:
db = gffutils.FeatureDB(dbfn="GCA_000006785.2_db")

In [14]:
df=pd.DataFrame(columns = ['contig', 'ID', 'start', 'end', 'strand'])
df

Unnamed: 0,contig,ID,start,end,strand


In [15]:
query = db.execute("select seqid,start,end,strand,attributes from features where featuretype = 'CDS'")
result = query.fetchall()

for each in result:
    #print(each['end'])
    #df=df.append({'ID':1, 'contig':2, 'end':3, 'start':4}, ignore_index=True)
    df=df.append({'ID':json.loads(each['attributes'])['ID'][0], 'contig':each['seqid'], 'end':each['end'], 'start':each['start'], 'strand':each['strand']}, ignore_index=True)
    #print(each['seqid'])
    #print(each['start'])
    #print(each['end'])
    #print(json.loads(each['attributes'])['ID'][0])

In [16]:
df[(df.start>=1773339) & (df.end<=1786888)]

Unnamed: 0,contig,ID,start,end,strand
1697,AE004092.2,1_1698,1773449,1774594,-
1698,AE004092.2,1_1699,1775853,1776620,-
1699,AE004092.2,1_1700,1776774,1776980,+
1700,AE004092.2,1_1701,1777014,1777781,+
1701,AE004092.2,1_1702,1777791,1778414,+
1702,AE004092.2,1_1703,1778414,1778683,+
1703,AE004092.2,1_1704,1778939,1779280,+
1704,AE004092.2,1_1705,1779267,1779488,+
1705,AE004092.2,1_1706,1779491,1779682,+
1706,AE004092.2,1_1707,1779694,1780023,+


### species tree

In [1]:
genome1 = "GCA_000006785.2"

In [None]:
!rm GCA_000006785.2_db