# ** whole genome exon v intron mut *without* tabix ** 

In [93]:
from tqdm import tqdm
import copy 
import json 
import time

In [2]:
!zcat Homo_sapiens.GRCh38.100.gff3.gz | head -n 10000 | tail -n 10 

1	.	biological_region	2228324	2228334	0.999	-	.	logic_name=eponine
1	.	biological_region	2228340	2228342	0.999	-	.	logic_name=eponine
1	.	biological_region	2228346	2228356	0.999	-	.	logic_name=eponine
1	.	biological_region	2228392	2228407	0.999	-	.	logic_name=eponine
1	.	biological_region	2228433	2228440	0.999	-	.	logic_name=eponine
1	.	biological_region	2228449	2228460	0.999	+	.	logic_name=eponine
1	.	biological_region	2228478	2228480	0.999	+	.	logic_name=eponine
1	.	biological_region	2228488	2228496	0.999	+	.	logic_name=eponine
1	.	biological_region	2228503	2228510	0.999	-	.	logic_name=eponine
1	.	biological_region	2228550	2228556	0.999	-	.	logic_name=eponine

gzip: stdout: Broken pipe


In [3]:
gff_lines = open("Homo_sapiens_sorted.GRCh38.100.gff3").readlines()

In [4]:
len(gff_lines)

2984635

In [5]:
print((gff_lines[330790].split()))

['1', 'havana', 'exon', '161217986', '161218077', '.', '+', '.', 'Parent=transcript:ENST00000367992;Name=ENSE00001041693;constitutive=0;ensembl_end_phase=0;ensembl_phase=1;exon_id=ENSE00001041693;rank=2;version=1']


In [6]:
line_types = []
for line in gff_lines: 
    if len(line.split("\t")) == 9: 
        line_type = line.split()[2]
        if line_type not in line_types: 
            line_types.append(line_type)

    

In [7]:
line_types

['biological_region',
 'scaffold',
 'CDS',
 'exon',
 'five_prime_UTR',
 'gene',
 'mRNA',
 'three_prime_UTR',
 'ncRNA',
 'ncRNA_gene',
 'snRNA',
 'rRNA',
 'pseudogene',
 'pseudogenic_transcript',
 'chromosome',
 'tRNA',
 'lnc_RNA',
 'scRNA',
 'snoRNA',
 'unconfirmed_transcript',
 'miRNA',
 'V_gene_segment',
 'C_gene_segment',
 'J_gene_segment',
 'vaultRNA_primary_transcript',
 'D_gene_segment']

In [8]:
!zcat Homo_sapiens.GRCh38.100.gff3.gz | grep "CDS" > CDS_GRCh38_fromOGfileUnsorted.txt

In [9]:
cds_lines = open("CDS_GRCh38_fromOGfileUnsorted.txt").readlines()

In [10]:
cds_lines[0]

'1\thavana\tCDS\t65565\t65573\t.\t+\t0\tID=CDS:ENSP00000493376;Parent=transcript:ENST00000641515;protein_id=ENSP00000493376\n'

In [42]:
cds_bounds = {}
for line in cds_lines: 
    line_split = line.split()
    cds_type = line_split[2]
    
    if cds_type == "CDS": 
        try: 
            int(line_split[0]) # we only want locations that have chromosome assignments 
        except: 
            None
        else: 
            cds_chrom = int(line_split[0])
            cds_start = int(line_split[3])
            cds_end = int(line_split[4])
            cds_id = line_split[8].split(";")[1]

            if cds_id not in cds_bounds.keys(): 
                cds_bounds[cds_id] = {"chrom":cds_chrom, "bounds":[[cds_start, cds_end]]}
            else: 
                cds_bounds[cds_id]["bounds"].append([cds_start, cds_end])


In [46]:
print(cds_bounds["Parent=transcript:ENST00000641515"], "total dict len = "+str(len(cds_bounds)), )

{'bounds': [[65565, 65573], [69037, 70008]], 'chrom': 1} total dict len = 97015


** testing for overlap within trnascripts** 

In [47]:
for transcript_name, cds_dict in tqdm(cds_bounds.items()): 
    for index, bound in enumerate(cds_dict["bounds"]): 
        if index != 0: 
            cds_start = (bound[0])
            cds_end = (bound[1])

            other_cds_bounds = copy.copy(cds_dict["bounds"])
            other_cds_bounds.remove(bound)
            
            for other_bound in other_cds_bounds: 
                if other_bound[0] <= cds_start <= other_bound[1]: 
                    print(bound,other_bound)
                elif other_bound[0] <= cds_end <= other_bound[1]:
                    print(bound,other_bound)
                

100%|██████████| 97015/97015 [00:04<00:00, 22088.29it/s]


In [16]:
# --> ok so no overlap within trnascript IDs... 

** adding max bounds for overlap comp later ** 

In [52]:
for cds_dict_value in cds_bounds.values(): 
    max_bound = max(cds_dict_value["bounds"])[1]
    min_bound = min(cds_dict_value["bounds"])[0]
    cds_dict_value["total_bounds"] = [min_bound, max_bound]

In [53]:
print(cds_bounds["Parent=transcript:ENST00000641515"], "total dict len = "+str(len(cds_bounds)), )

{'total_bounds': [65565, 70008], 'bounds': [[65565, 65573], [69037, 70008]], 'chrom': 1} total dict len = 97015


** finding overlapping cds names  ** 

In [None]:
matching_name_list = []
listed_already_covered_cds_names = []

for cds_name, cds_dict in tqdm(cds_bounds.items()): 
    if cds_name not in listed_already_covered_cds_names: 
        total_start = cds_dict["total_bounds"][0]
        total_end = cds_dict["total_bounds"][1]
        list_cdsName_matches = []
        
        for other_cds_name, other_cds_dict in cds_bounds.items(): 
            other_total_start = other_cds_dict["total_bounds"][0]
            other_total_end = other_cds_dict["total_bounds"][1]
            
            if cds_dict["chrom"] == other_cds_dict["chrom"]:

                if  other_total_start <= total_start <= other_total_end: 
                    list_cdsName_matches.append(other_cds_name)
                elif other_total_start <= total_end <= other_total_end:
                    list_cdsName_matches.append(other_cds_name)
        
        matching_name_list.append(list_cdsName_matches) 
        listed_already_covered_cds_names.extend(list_cdsName_matches)
        
        
            

  0%|          | 48/97015 [00:07<3:59:31,  6.75it/s]

In [328]:
(listed_already_covered_cds_names)[0:10]

['Parent=transcript:ENST00000554741',
 'Parent=transcript:ENST00000397528',
 'Parent=transcript:ENST00000556287',
 'Parent=transcript:ENST00000555959',
 'Parent=transcript:ENST00000555251',
 'Parent=transcript:ENST00000674313',
 'Parent=transcript:ENST00000557129',
 'Parent=transcript:ENST00000555702',
 'Parent=transcript:ENST00000397532',
 'Parent=transcript:ENST00000397529']

In [326]:
len(cds_bounds)

97015

In [321]:
print("the number of genes are: " + str(len(matching_name_list)))

the number of genes are: 27373


In [322]:
matching_name_list[0]

['Parent=transcript:ENST00000554741',
 'Parent=transcript:ENST00000397528',
 'Parent=transcript:ENST00000556287',
 'Parent=transcript:ENST00000555959',
 'Parent=transcript:ENST00000555251',
 'Parent=transcript:ENST00000674313',
 'Parent=transcript:ENST00000557129',
 'Parent=transcript:ENST00000555702',
 'Parent=transcript:ENST00000397532',
 'Parent=transcript:ENST00000397529',
 'Parent=transcript:ENST00000554758',
 'Parent=transcript:ENST00000557629',
 'Parent=transcript:ENST00000285850',
 'Parent=transcript:ENST00000488800',
 'Parent=transcript:ENST00000555911']

** for each matching group, find overlapping exons/cds and make the max bounds for each overlap item in new non-overlap dict ** 

In [171]:
gene_group_dict = {}


In [175]:
matching_name_list[100]

['Parent=transcript:ENST00000637002',
 'Parent=transcript:ENST00000347310',
 'Parent=transcript:ENST00000395227',
 'Parent=transcript:ENST00000425614']

In [176]:
gene_group_dict = {}

for group_gene_matches in tqdm(matching_name_list): 
    
    gene_matches = [] # will be alist of lists, where each element is a list of a group of overlapping cds/exons 
    match_names_tested = [] # want to collect the transcripts I test as I go along 

    for match_name in group_gene_matches: 

        match_names_tested.append(match_name) # addign currnet cds/transcript to the list of those laready tested 

        for bound in cds_bounds[match_name]["bounds"]: 
            bound_start = bound[0]
            bound_end = bound[1]

            current_matching_bounds = [bound]

            for other_match_name in group_gene_matches: 



                if other_match_name not in match_names_tested: 
    #                 print(match_name, bound, other_match_name, len(cds_bounds[other_match_name]["bounds"]))

                    match_found = False

                    for other_bound in cds_bounds[other_match_name]["bounds"]:

                        if match_found == False: # match_found out here as one match per match name 

                            if other_bound[0]<= bound[0] <= other_bound[1] : #here are the tests for overlap 
                                current_matching_bounds.append(other_bound)
                                match_found = True
                            elif other_bound[0]<= bound[1] <= other_bound[1] :
                                current_matching_bounds.append(other_bound)
                                match_found = True                            
                            elif bound[0]<= other_bound[0] <= bound[1] :
                                current_matching_bounds.append(other_bound)
                                match_found = True                            
                            elif bound[0]<= other_bound[0] <= bound[1] : 
                                current_matching_bounds.append(other_bound)
                                match_found = True

            gene_matches.append(current_matching_bounds)

        # then if found a match ina matchname then skip rest ---> 
        # ok need to not revist old pairs 
        # need if already in a match, then skip 
      # ok now need to store current_matching_bounds

    gene_group_dict[match_name] = gene_matches

    

100%|██████████| 27373/27373 [00:50<00:00, 540.43it/s] 


In [323]:
gene_group_dict['Parent=transcript:ENST00000425614']

[[[67200855, 67200897], [67200737, 67200897]],
 [[67206910, 67207055], [67206910, 67207055]],
 [[67219574, 67219730],
  [67219574, 67219730],
  [67219574, 67219730],
  [67219574, 67219730]],
 [[67236713, 67236802], [67236713, 67236802], [67236713, 67236802]],
 [[67240179, 67240281], [67240179, 67240281], [67240179, 67240281]],
 [[67255837, 67255927],
  [67255837, 67255862],
  [67255837, 67255927],
  [67255837, 67255927]],
 [[67258478, 67259128], [67258478, 67259128], [67258478, 67259128]],
 [[67207666, 67207698], [67207666, 67207698]],
 [[67219574, 67219730], [67219574, 67219730], [67219574, 67219730]],
 [[67255837, 67255862], [67255837, 67255927], [67255837, 67255927]],
 [[67168121, 67168190]],
 [[67169342, 67169638]],
 [[67182836, 67182959]],
 [[67200737, 67200897]],
 [[67206910, 67207055]],
 [[67219574, 67219730], [67219574, 67219730]],
 [[67236713, 67236802], [67236713, 67236802]],
 [[67240179, 67240281], [67240179, 67240281]],
 [[67255837, 67255927], [67255837, 67255927]],
 [[6725

## ** ok lets get non-overlap dict ** 

In [295]:
nonOverlap_dict = {}
f_ups = []
reversed_fups = []

for gene_name in tqdm(gene_group_dict.keys()):

    #turning list of bound lists into just one long sorted listed exon bounds 
    bound_lists = []
    for bound_list in gene_group_dict[gene_name]: 
        bound_lists.extend(bound_list)    
    bound_lists = sorted(bound_lists , key=lambda k: [k[1], k[0]]) # sort 

    #setting loop variables for this gene 
    nonOverlap_list = []
    chrom = cds_bounds[gene_name]["chrom"]

    #this is the bound start/end that the rest compare to (easier to set this as the [0] outside the loop)
    bound_to_keep_start = bound_lists[0][0]
    bound_to_keep_end = bound_lists[0][1]

    #ooping through the rest fo the bounds to collect non-overlaps and extend 
    for i,bound in enumerate(bound_lists): 
        bound_start = bound[0]
        bound_end = bound[1]

        #there are some fuckups where the bound start is after the end (skip)
        if bound_start <= bound_end: 
            reversed_fups.append([gene_name, bound])

        #if the current bound_start is after the bound_to_keep_end then its a new non-overlap
        #-->need to add the prev to_keep to the list, and start another 
        elif bound_start > bound_to_keep_end: #if the 
            nonOverlap_list.append([bound_to_keep_start, bound_to_keep_end])
            bound_to_keep_start = bound_start
            bound_to_keep_end = bound_end

        #if the start isnt after, but the end is after, then need to extend the current to_keep until the end of this curr end 
        elif bound_end > bound_to_keep_end: 
            bound_to_keep_end = bound_end    

        #the only toher case there should be are repeat exon coords 
        else:
            #if not a repeat, send to error list 
            if bound_start != bound_to_keep_start or bound_end != bound_to_keep_end: 
                f_ups.append(bound)

    #once reaches end, need to add the last exon to the list             
    nonOverlap_list.append([bound_to_keep_start, bound_to_keep_end]) 
    #add all the list and chom to the dict 
    nonOverlap_dict[gene_name] = {"chrom": chrom, "exons": nonOverlap_list}

100%|██████████| 22296/22296 [00:09<00:00, 2406.97it/s]


In [309]:
len(nonOverlap_dict)

22296

In [297]:
len(f_ups)

0

In [298]:
len(reversed_fups)

3284793

In [299]:
total_exons = 0
for gene_name, gene_dict in nonOverlap_dict.items(): 
    total_exons += len(gene_dict["exons"])

In [310]:
total_exons

22296

In [308]:
(nonOverlap_dict['Parent=transcript:ENST00000425614']["exons"])

[[67168121, 67168190]]

In [311]:
list(nonOverlap_dict.keys())[190:191]

['Parent=transcript:ENST00000377745']

In [314]:
list(cds_bounds.keys())[0:10]

['Parent=transcript:ENST00000554741',
 'Parent=transcript:ENST00000296370',
 'Parent=transcript:ENST00000374726',
 'Parent=transcript:ENST00000358465',
 'Parent=transcript:ENST00000648156',
 'Parent=transcript:ENST00000532317',
 'Parent=transcript:ENST00000330843',
 'Parent=transcript:ENST00000552509',
 'Parent=transcript:ENST00000556483',
 'Parent=transcript:ENST00000534397']

In [324]:
gene_group_dict['Parent=transcript:ENST00000296370']

[[[6693933, 6694070]], [[6696893, 6697042]]]

In [317]:
nonOverlap_dict['Parent=transcript:ENST00000296370']

{'chrom': 4, 'exons': [[6693933, 6694070]]}

** ok now print the dict to file** 

In [234]:
text_file = open("CDS_maxBounds_dict.txt", "w")
n = text_file.write(json.dumps(cds_max_bounds))
text_file.close()

# ** APPENDIX !!!!!!!!!!!!!!!!!!!!!** 
___

In [106]:
tmp_list = genes_dict['Parent=transcript:ENST00000546337'][1:]
for index, element in enumerate(tmp_list): 
    if index != 0: 
        print(element[0]-tmp_list[index-1][1])

73
109
817
117


In [105]:
genes_dict['Parent=transcript:ENST00000546337'][1:]

[[143792609, 143792635],
 [143792708, 143792867],
 [143792976, 143793083],
 [143793900, 143793962],
 [143794079, 143794236]]

In [96]:
for index, element in enumerate([1,2,3,4,5][1:]): 
    print(index)

0
1
2
3


In [87]:
tmp_list_copy

[1, 4, 3, 5, 6]

In [93]:
for index, item in genes_dict.items(): 
    print(len(item))
    if index == 5: 
        break 

ValueError: too many values to unpack (expected 2)

## ** when i tried to use exons (intron and integenic comp ) ** 

In [20]:
!zcat Homo_sapiens.GRCh38.100.gff3.gz | grep "exon" > Exons_GRCh38_fromOGfileUnsorted.txt

In [130]:
!head -n 1500 Exons_GRCh38_fromOGfileUnsorted.txt | tail -n 10

1	havana	exon	1256045	1256125	.	-	.	Parent=transcript:ENST00000473215;Name=ENSE00003466253;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003466253;rank=6;version=1
1	havana	exon	1256992	1257130	.	-	.	Parent=transcript:ENST00000473215;Name=ENSE00003609795;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003609795;rank=5;version=1
1	havana	exon	1257208	1257310	.	-	.	Parent=transcript:ENST00000473215;Name=ENSE00003628299;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003628299;rank=4;version=1
1	havana	exon	1263346	1263386	.	-	.	Parent=transcript:ENST00000473215;Name=ENSE00003572926;constitutive=0;ensembl_end_phase=-1;ensembl_phase=-1;exon_id=ENSE00003572926;rank=3;version=1
1	havana	exon	1267733	1267992	.	-	.	Parent=transcript:ENST00000473215;Name=ENSE00003759840;constitutive=0;ensembl_end_phase=-1;ensembl_phase=0;exon_id=ENSE00003759840;rank=2;version=1
1	havana	exon	1273666	1273849	.	-	.	Parent=transcript:ENST00000473215;Name=E

In [23]:
exon_lines = open("Exons_GRCh38_fromOGfileUnsorted.txt").readlines()

In [27]:
exon_chrom = exon_lines[0].split()[0]
exon_start = exon_lines[0].split()[3]
exon_end = exon_lines[0].split()[4]
exon_id = exon_lines[0].split()[8].split(";")[0]

'Parent=transcript:ENST00000456328'

In [63]:
list_fuck_ups = []
for line in tqdm(exon_lines): 
    if line.split()[2] != "exon": 
        list_fuck_ups.append(line)
    else: 
        try: 
            int(line.split()[0])
        except: 
            list_fuck_ups.append(line)

100%|██████████| 1378303/1378303 [00:02<00:00, 533068.20it/s]


In [60]:
try:
    int("1")
except: 
    None
else: 
    print("yaya")



yaya


In [64]:
len((list_fuck_ups))

47289

In [128]:
genes_dict = {}
#adding exon coords to gene dict 
for line in tqdm(exon_lines): 
    line_split = line.split()
    if line_split[2] == "exon": 
        try: 
            int(line.split()[0]) # we only want locations that have chromosome assignments 
        except: 
            None
        else: 
            exon_id = line_split[8].split(";")[0] # ---> using the item ID in the final coloumn... suppposed to be "unique" but 200k of them 
            exon_start = int(line_split[3])
            exon_end = int(line_split[4])

            if exon_id in genes_dict.keys(): 
                genes_dict[exon_id]["exons"].append([exon_start, exon_end])
            else: 
                exon_chrom = int(line_split[0])
                genes_dict[exon_id] = {"chrom": exon_chrom, "exons":[[exon_start, exon_end]]}

#adding intron coords and start/stop of whole gene 
for curr_gene_dict in tqdm(genes_dict.values()): 
    
    #start stop coords for whole gene 
    gene_start = curr_gene_dict["exons"][0][0]
    gene_end = curr_gene_dict["exons"][-1][1]
    curr_gene_dict["gene_coords"] = [gene_start, gene_end]
    
    #adding intron coords
    for index, curr_exon_coords in enumerate(curr_gene_dict["exons"]): 
        if index != 0: 
            intron_start = int(curr_gene_dict["exons"][index-1][1])+1
            intron_end = int(curr_exon_coords[0]) -1
            
            if "introns" in curr_gene_dict.keys(): 
                curr_gene_dict["introns"].append([intron_start, intron_end])
            else: 
                curr_gene_dict["introns"] = [[intron_start, intron_end]]

100%|██████████| 1378303/1378303 [00:11<00:00, 117307.93it/s]
100%|██████████| 220011/220011 [00:04<00:00, 51165.92it/s]


In [125]:
len(genes_dict)

220011

In [131]:
genes_dict["Parent=transcript:ENST00000473215"]

{'chrom': 1,
 'exons': [[1255268, 1255487],
  [1256045, 1256125],
  [1256992, 1257130],
  [1257208, 1257310],
  [1263346, 1263386],
  [1267733, 1267992],
  [1273666, 1273849]],
 'gene_coords': [1255268, 1273849],
 'introns': [[1255488, 1256044],
  [1256126, 1256991],
  [1257131, 1257207],
  [1257311, 1263345],
  [1263387, 1267732],
  [1267993, 1273665]]}

** testing if there are overlapping exons ** 

In [136]:
#tmp dict to test overlap check 
#tmp_gene_dict = {"gene_1":{"exons":[[121,154], [151,200], [220,280]]}, "gene_2":{"exons":[[121,154], [155,200], [220,280]]}}

list_out_of_order = []
for gene_name, curr_gene_dict in tqdm(genes_dict.items()): 
    curr_gene_coords = []
    for exon_coords in curr_gene_dict["exons"]: 
        exon_start = exon_coords[0]
        exon_end = exon_coords[1]
        curr_gene_coords.extend([exon_end, exon_start])
    
    #checking if in order 
    order_check = 0
    for index, element in enumerate(curr_gene_dict["exons"]): 
        if index != 0: 
            if element[0] < curr_gene_dict["exons"][index-1][1]: 
                order_check += 1
    if order_check > 0 : 
        list_out_of_order.append(gene_name)

print("number of genes with atleast one overlapping exon is " +str(len(list_out_of_order)))

100%|██████████| 220011/220011 [00:01<00:00, 120995.55it/s]

number of genes with atleast one overlapping exon is 0





** analyzing gaps b/t genes ** 

In [140]:
tmp_genes_startEnd = {}
for curr_gene_dict in tqdm(genes_dict.values()): 
    curr_chrom = str(curr_gene_dict["chrom"])
    curr_gene_start = curr_gene_dict["gene_coords"][0]
    curr_gene_end = curr_gene_dict["gene_coords"][1]
    if curr_chrom in tmp_genes_startEnd.keys(): 
        tmp_genes_startEnd[curr_chrom].extend([[curr_gene_start, curr_gene_end]])
    else: 
        tmp_genes_startEnd[curr_chrom] = [[curr_gene_start, curr_gene_end]]
        

100%|██████████| 220011/220011 [00:02<00:00, 98242.05it/s] 


In [141]:
tmp_genes_startEnd["1"][0:10]

[[159178473, 159178559],
 [101639548, 101787382],
 [65148203, 65232145],
 [168464121, 168495650],
 [161505457, 161509899],
 [109656117, 109657671],
 [85133794, 85133873],
 [182899865, 182953166],
 [185518651, 185520986],
 [1399527, 1402046]]

In [161]:
for gene_coords in tmp_genes_startEnd["1"]: 
    gene_start = gene_coords[0]
    gene_end = gene_coords[1]
    other_gene_coords = copy.copy(tmp_genes_startEnd["1"])
    other_gene_coords.remove(gene_coords)
    for other_coords in other_gene_coords: 
        if other_coords[0] <= gene_start <= other_coords[1]: 
            #print(gene_coords,other_coords)
        elif other_coords[0] <= gene_start <= other_coords[1]: 
            #print(gene_coords,other_coords)
            

IndentationError: expected an indented block (<ipython-input-161-fa27a30cd73f>, line 9)

** checking out wtf the exon/gene/cds labels mean ** 

In [149]:
!head CDS_GRCh38_fromOGfileUnsorted.txt 

1	havana	CDS	65565	65573	.	+	0	ID=CDS:ENSP00000493376;Parent=transcript:ENST00000641515;protein_id=ENSP00000493376
1	havana	CDS	69037	70008	.	+	0	ID=CDS:ENSP00000493376;Parent=transcript:ENST00000641515;protein_id=ENSP00000493376
1	ensembl	mRNA	69055	70108	.	+	.	ID=transcript:ENST00000335137;Parent=gene:ENSG00000186092;Name=OR4F5-201;biotype=protein_coding;ccdsid=CCDS30547.1;tag=basic;transcript_id=ENST00000335137;transcript_support_level=NA (assigned to previous version 3);version=4
1	ensembl	CDS	69091	70008	.	+	0	ID=CDS:ENSP00000334393;Parent=transcript:ENST00000335137;protein_id=ENSP00000334393
1	ensembl_havana	mRNA	450703	451697	.	-	.	ID=transcript:ENST00000426406;Parent=gene:ENSG00000284733;Name=OR4F29-201;biotype=protein_coding;ccdsid=CCDS72675.1;tag=basic;transcript_id=ENST00000426406;transcript_support_level=NA (assigned to previous version 2);version=3
1	ensembl_havana	CDS	450740	451678	.	-	0	ID=CDS:ENSP00000409316;Parent=transcript:ENST00000426406;protein_id=ENSP00000409316
1

In [150]:
cds_lines = open("CDS_GRCh38_fromOGfileUnsorted.txt").readlines()

In [151]:
len(cds_lines)

807396

In [160]:
!zcat Homo_sapiens.GRCh38.100.gff3.gz | grep "ENSP00000411579"

1	havana	CDS	924432	924948	.	+	0	ID=CDS:ENSP00000411579;Parent=transcript:ENST00000420190;protein_id=ENSP00000411579
1	havana	CDS	925922	926013	.	+	2	ID=CDS:ENSP00000411579;Parent=transcript:ENST00000420190;protein_id=ENSP00000411579
1	havana	CDS	930155	930336	.	+	0	ID=CDS:ENSP00000411579;Parent=transcript:ENST00000420190;protein_id=ENSP00000411579
1	havana	CDS	931039	931089	.	+	1	ID=CDS:ENSP00000411579;Parent=transcript:ENST00000420190;protein_id=ENSP00000411579
1	havana	CDS	935772	935896	.	+	1	ID=CDS:ENSP00000411579;Parent=transcript:ENST00000420190;protein_id=ENSP00000411579
1	havana	CDS	939040	939129	.	+	2	ID=CDS:ENSP00000411579;Parent=transcript:ENST00000420190;protein_id=ENSP00000411579
1	havana	CDS	939275	939291	.	+	2	ID=CDS:ENSP00000411579;Parent=transcript:ENST00000420190;protein_id=ENSP00000411579


In [158]:
!zcat Homo_sapiens.GRCh38.100.gff3.gz | grep "ENST00000420190"

1	havana	mRNA	923928	939291	.	+	.	ID=transcript:ENST00000420190;Parent=gene:ENSG00000187634;Name=SAMD11-203;biotype=protein_coding;transcript_id=ENST00000420190;transcript_support_level=3;version=6
1	havana	five_prime_UTR	923928	924431	.	+	.	Parent=transcript:ENST00000420190
1	havana	exon	923928	924948	.	+	.	Parent=transcript:ENST00000420190;Name=ENSE00001637883;constitutive=0;ensembl_end_phase=1;ensembl_phase=-1;exon_id=ENSE00001637883;rank=1;version=2
1	havana	CDS	924432	924948	.	+	0	ID=CDS:ENSP00000411579;Parent=transcript:ENST00000420190;protein_id=ENSP00000411579
1	havana	exon	925922	926013	.	+	.	Parent=transcript:ENST00000420190;Name=ENSE00003794726;constitutive=0;ensembl_end_phase=0;ensembl_phase=1;exon_id=ENSE00003794726;rank=2;version=1
1	havana	CDS	925922	926013	.	+	2	ID=CDS:ENSP00000411579;Parent=transcript:ENST00000420190;protein_id=ENSP00000411579
1	havana	exon	930155	930336	.	+	.	Parent=transcript:ENST00000420190;Name=ENSE00002727207;constitutive=0;ensembl_end_phase=2;ens

In [159]:
!zcat Homo_sapiens.GRCh38.100.gff3.gz | grep "ENSG00000187634"

1	ensembl_havana	gene	923928	944581	.	+	.	ID=gene:ENSG00000187634;Name=SAMD11;biotype=protein_coding;description=sterile alpha motif domain containing 11 [Source:HGNC Symbol%3BAcc:HGNC:28706];gene_id=ENSG00000187634;logic_name=ensembl_havana_gene_homo_sapiens;version=12
1	havana	mRNA	923928	939291	.	+	.	ID=transcript:ENST00000420190;Parent=gene:ENSG00000187634;Name=SAMD11-203;biotype=protein_coding;transcript_id=ENST00000420190;transcript_support_level=3;version=6
1	havana	mRNA	925150	935793	.	+	.	ID=transcript:ENST00000437963;Parent=gene:ENSG00000187634;Name=SAMD11-204;biotype=protein_coding;transcript_id=ENST00000437963;transcript_support_level=5;version=5
1	havana	mRNA	925731	944574	.	+	.	ID=transcript:ENST00000342066;Parent=gene:ENSG00000187634;Name=SAMD11-202;biotype=protein_coding;ccdsid=CCDS2.2;tag=basic;transcript_id=ENST00000342066;transcript_support_level=5 (assigned to previous version 7);version=8
1	ensembl	mRNA	925741	944581	.	+	.	ID=transcript:ENST00000618181;Parent=gene:

In [169]:
for bound in cds_bounds: 
    cds_start = (bound[0])
    cds_end = (bound[1])
    
    other_cds_bounds = copy.copy(cds_bounds)
    other_cds_bounds.remove(bound)
    
    for other_bound in other_cds_bounds: 
        if other_bound[0] <= cds_start <= other_bound[1]: 
            print(bound,other_bound)
        elif other_bound[0] <= cds_end <= other_bound[1]:
            print(bound,other_bound)

    
    
# for gene_coords in tmp_genes_startEnd["1"]: 
#     gene_start = gene_coords[0]
#     gene_end = gene_coords[1]
#     other_gene_coords = copy.copy(tmp_genes_startEnd["1"])
#     other_gene_coords.remove(gene_coords)
#     for other_coords in other_gene_coords: 
#         if other_coords[0] <= gene_start <= other_coords[1]: 
#             #print(gene_coords,other_coords)
#         elif other_coords[0] <= gene_start <= other_coords[1]: 
#             #print(gene_coords,other_coords)
    

[69037, 70008] [69091, 70008]
[69091, 70008] [69037, 70008]
[685716, 686654] [685679, 685783]
[685716, 686654] [685679, 685783]
[685716, 686654] [685679, 685783]
[685716, 686654] [685716, 685922]
[685716, 686654] [685716, 685922]
[685716, 686654] [686532, 686690]
[685716, 686654] [686532, 686690]
[685716, 686654] [686532, 686690]
[685716, 686654] [686532, 686690]
[925922, 926013] [925942, 926013]
[925922, 926013] [925942, 926013]
[925922, 926013] [925942, 926013]
[925922, 926013] [925942, 926013]
[925922, 926013] [925942, 926013]
[925922, 926013] [925942, 926013]
[925922, 926013] [925942, 926013]
[925922, 926013] [925942, 926013]
[925922, 926013] [925942, 926013]
[925922, 926013] [925942, 926013]
[930155, 930336] [930155, 930336]
[930155, 930336] [930155, 930336]
[930155, 930336] [930155, 930336]
[930155, 930336] [930155, 930336]
[930155, 930336] [930155, 930336]
[930155, 930336] [930155, 930336]
[930155, 930336] [930155, 930336]
[930155, 930336] [930155, 930336]
[930155, 930336] [9301

KeyboardInterrupt: 

In [172]:
test_cds_bounds = {"name1":{"bounds":[[0,100], [200,300], [400,500]]}, 
                   "name2":{"bounds":[[101,190], [380,501]]}, 
                   "name3":{"bounds":[[0,50], [199,301], [430,460]]}, 
                  "name4":{"bounds":[[150,180]]}}
test_matching_names = ["name"+str(i) for i in range(1,5)]


gene_matches = [] # will be alist of lists, where each element is a list of a group of overlapping cds/exons 
match_names_tested = [] # want to collect the transcripts I test as I go along 

for match_name in tqdm(test_matching_names): 
#     print(match_name, cds_bounds[match_name]["bounds"]) 
    
      
    match_names_tested.append(match_name) # addign currnet cds/transcript to the list of those laready tested 
    
    for bound in test_cds_bounds[match_name]["bounds"]: 
        bound_start = bound[0]
        bound_end = bound[1]
        
        current_matching_bounds = [bound]
        
        for other_match_name in test_matching_names: 
            
            
            
            if other_match_name not in match_names_tested: 
#                 print(match_name, bound, other_match_name, len(cds_bounds[other_match_name]["bounds"]))
                
                match_found = False
                                
                for other_bound in test_cds_bounds[other_match_name]["bounds"]:
                    
                    if match_found == False: # match_found out here as one match per match name 
                        
                        if other_bound[0]<= bound[0] <= other_bound[1] : #here are the tests for overlap 
                            current_matching_bounds.append(other_bound)
                            match_found = True
                        elif other_bound[0]<= bound[1] <= other_bound[1] :
                            current_matching_bounds.append(other_bound)
                            match_found = True                            
                        elif bound[0]<= other_bound[0] <= bound[1] :
                            current_matching_bounds.append(other_bound)
                            match_found = True                            
                        elif bound[0]<= other_bound[0] <= bound[1] : 
                            current_matching_bounds.append(other_bound)
                            match_found = True
                            
        print(match_names_tested, current_matching_bounds)
        gene_matches.append(current_matching_bounds)
        
        
gene_group_dict[match_name] = gene_matches

100%|██████████| 4/4 [00:00<00:00, 1464.24it/s]

['name1'] [[0, 100], [0, 50]]
['name1'] [[200, 300], [199, 301]]
['name1'] [[400, 500], [380, 501], [430, 460]]
['name1', 'name2'] [[101, 190], [150, 180]]
['name1', 'name2'] [[380, 501], [430, 460]]
['name1', 'name2', 'name3'] [[0, 50]]
['name1', 'name2', 'name3'] [[199, 301]]
['name1', 'name2', 'name3'] [[430, 460]]
['name1', 'name2', 'name3', 'name4'] [[150, 180]]





In [165]:
test_cds_bounds

{'name1': {'bounds': [[0, 100], [200, 300], [400, 500]]},
 'name2': {'bounds': [[101, 190], [380, 501]]},
 'name3': {'bounds': [[0, 50], [199, 301], [430, 460]]},
 'name4': {'bounds': [[150, 180]]}}

In [154]:
[[1,2], [3,4]] == [[1,2], [3,4]]

True

In [157]:
matching_name_list[100]

['Parent=transcript:ENST00000637002',
 'Parent=transcript:ENST00000347310',
 'Parent=transcript:ENST00000395227',
 'Parent=transcript:ENST00000425614']

In [163]:
for group in gene_matches: 
    print((group))

[[0, 100], [0, 50]]
[[200, 300], [199, 301]]
[[400, 500], [380, 501], [430, 460]]
[[101, 190], [150, 180]]
[[380, 501], [430, 460]]
[[0, 50]]
[[199, 301]]
[[430, 460]]
[[150, 180]]


In [158]:
for match_name in matching_name_list[100]: 
     print(match_name, cds_bounds[match_name]["bounds"])

Parent=transcript:ENST00000637002 [[67200855, 67200897], [67206910, 67207055], [67219574, 67219730], [67236713, 67236802], [67240179, 67240281], [67255837, 67255927], [67258478, 67259128]]
Parent=transcript:ENST00000347310 [[67168121, 67168190], [67169342, 67169638], [67182836, 67182959], [67200737, 67200897], [67206910, 67207055], [67219574, 67219730], [67236713, 67236802], [67240179, 67240281], [67255837, 67255927], [67258478, 67259128]]
Parent=transcript:ENST00000395227 [[67255895, 67255927], [67258478, 67259128]]
Parent=transcript:ENST00000425614 [[67207666, 67207698], [67219574, 67219730], [67236713, 67236802], [67240179, 67240281], [67255837, 67255927], [67258478, 67259128]]


In [109]:
tmp_bound1 = [5,101]
tmp_bound2 = [10,100]
time1 = time.time()
if 5>= 10 >= 101 or 5>100>101 or 10>= 5>= 100 or 10>=101>=100: 
    print(yay)
time2 = time.time() - time1
print(time2)

8.654594421386719e-05


In [96]:
print(time1)

1603913787.1448567


In [97]:
time2

0.0001304149627685547

In [88]:
[67200855, 67200897] >= [61206910, 68207055]

True