-
Notifications
You must be signed in to change notification settings - Fork 0
/
graph_gene_generator.py
144 lines (113 loc) · 4.68 KB
/
graph_gene_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/usr/bin/env python
"""
This script groups Transcripts into Gene Units according to their genomic
overlap.
The input is a gff3 file with information on transcript overlap
(i.e. Cluster).
Upon this, graphs are generated through the NetworkX package. A graph
consists of all transcripts that cluster at some point and is considered a
Gene Unit.
The output is a gff3 file with the Gene Unit information stored as Gene.
The essential steps are:
1/ Identify all the Transcript IDs present on each
Cluster
2/ Intersect each Cluster with each other to see
which share Transcript IDs
3/ Build Graphs in which nodes are clusters and
edges represent the clusters (nodes) share
Transcript IDs
4/ Give to each Graph a unique Gene Name
"""
# This script should be placed in drac:
# $ scp RIBOSEQ/PYTHON_SCRIPTS/graph_gene_generator.py jpallares@161.116.70.109:SCRIPTS/
# Usage: graph_gene_generator.py file1.gff3
# Example: time SCRIPTS/graph_gene_generator.py DATA/INSECT_RIBOSEQ_DATA/CULEX_TARSALIS/CULEX_TARSALIS_GENOME/regular_cluster_unpacked_subset_culex_tarsalis.gff3 # This takes around 9 minutes
# Some tests: time SCRIPTS/graph_gene_generator.py DATA/INSECT_RIBOSEQ_DATA/CULEX_TARSALIS/CULEX_TARSALIS_GENOME/shuf_prova_gene_generator.gff3
# Imports #
import pyranges as pr
import extend_orfs as eo
import pandas as pd
import sys
import networkx as nx
import itertools as it
import matplotlib.pyplot as plt
# Definition #
def id_to_gene(df, d):
"""
"""
identity = set(df['ID'].values).pop()
df['Gene'] = d[identity]
return df
# Execution #
## Load gff3 and Fasta files
arg = sys.argv
g = arg[1]
p_df = pr.read_gff3(g, as_df=True) #gff3 to Dataframe
## Clean the DataFrame
p_df = p_df[['Chromosome', 'Start', 'End', 'Strand', 'ID', 'Parent',
'Feature', 'Cluster']][p_df.Feature=='CDS']
# So the Pyragnes object is correctly labelled as stranded:
p_df['Strand'] = p_df['Strand'].astype("string")
print("This is initial p_df:\n",p_df)
## Generate Cluster-ID dictionary:
dci = {} # Cluster-ID Dictionary
# Store cluster in a list, then iterate through p_df through it
clusters = p_df['Cluster'].unique().tolist()
for cluster in clusters:
t_df = p_df.loc[p_df['Cluster'] == cluster] # Temporary DataFrame
t_s = set(t_df['ID'].values) # Temporary Set
dci[cluster] = t_s
## Working with graphs:
# Generate graph and add nodes
g = nx.Graph()
g.add_nodes_from(dci)
# if node1 and node2 intersect in their transcripts, grow an edge between them
print('\nStarting with the edges!')
for combination in it.combinations(dci,2):
print('This is a combiniation:\n',combination)
intersection = dci[combination[0]].intersection(dci[combination[1]])
print('This is an intersection:\n',intersection)
if len(intersection) > 0:
# g.add_edge(combination[0],combination[1], length=100000) # length is for plot
g.add_edge(combination[0],combination[1])
print('Edges are done!')
# Show Componenents
components = nx.connected_components(g)
dig = {} # ID-Gene Dictionary
cm = 1 # Multiple transcript Counter
cs = 1 # Single transcript Counter
print('Starting to build dig.')
for component in components:
t_l = [dci[i] for i in component] # Temporary List
t_s = set(it.chain.from_iterable(t_l)) # Temporary Set
print("This is a component:\n",component)
print("This is t_s:\n",t_s)
if len(t_s) == 1:
gene_name = "s_" + str(cs)
cs += 1 # Increase Gene Number for the next one
else:
gene_name = "m_" + str(cm)
cm += 1 # Increase Gene Number for the next one
for identity in t_s:
dig[identity] = gene_name
print('dig is built.')
#print("This is dig:\n",dig)
# Graph Visualization
#plt.figure(figsize=(16, 10))
#pos = nx.spring_layout(g, k=0.3, iterations=20) # k is for node distance
#nx.draw(g, pos, with_labels=True, node_color='orange', node_size=1500, width=3)
#plt.savefig("graph_test.png")
## Give each row its corresponding Gene name using the ID-Gene Dictionary
print('About to enter lambda.')
p_df['Gene'] = "0"
p_df = p_df.groupby(by='ID').apply(lambda x: id_to_gene(x, dig))
print("\nThis is p_df towards the end:\n",p_df)
## Return output as gff3
l = arg[1].split("/")
path = ''
# Obtain the path without file name:
for element in map(lambda x: x+"/", l[:-1]): # Maybe this is
path += element # unnecessarily complex
ig = l[-1] # Input gff3
og = path + 'graph_genes_' + ig # Output gff3
eo.mod_to_gff3(p_df, path=og)