In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame
import networkx as nx

<h1>String adathalmaz feldolgozása</h1>

A string adathalmazból a legerősebb fehérje fehérje interakcióra van szükségünk. Ezek csak egyszer szerepeljenek a listában.

In [2]:
string = pd.read_csv("./data/string_9606.protein.links.v12.0.txt", sep=' ', header=0)

#remove the human prefix from gene ids
string["protein1"] = string["protein1"].str.slice(5)
string["protein2"] = string["protein2"].str.slice(5)

#normalize the combined score
string["combined_score"] = string["combined_score"] / 1000
string.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13715404 entries, 0 to 13715403
Data columns (total 3 columns):
 #   Column          Dtype  
---  ------          -----  
 0   protein1        object 
 1   protein2        object 
 2   combined_score  float64
dtypes: float64(1), object(2)
memory usage: 313.9+ MB


Azon élek megtartása, melyek erősek.

In [3]:
string = string[string["combined_score"] >= 0.7]
string.shape

(473860, 3)

A string fehérja azonosítókat tartalmaz, ezeket össze kell kötni a gén azonosítókkal.

In [4]:
gen_protein_map = pd.read_csv("./data/gene_protein_map.txt", sep='\t', header=0)
gen_protein_map = gen_protein_map.dropna()
gen_protein_map.info()

<class 'pandas.core.frame.DataFrame'>
Index: 112012 entries, 5 to 170460
Data columns (total 2 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Gene stable ID     112012 non-null  object
 1   Protein stable ID  112012 non-null  object
dtypes: object(2)
memory usage: 2.6+ MB


In [5]:
#join the genes with protein
string_gene_p1 = string.merge(gen_protein_map, left_on="protein1", right_on="Protein stable ID", how='left')
string_gene_p2 = string.merge(gen_protein_map, left_on="protein2", right_on="Protein stable ID", how='left')
string_gene_p1.shape

(473860, 5)

In [6]:
# keep from the joind tables only the gene id columns, and the combined score
string_gene = DataFrame()
string_gene["gene1"] = string_gene_p1["Gene stable ID"]
string_gene[["gene2", "combined_score"]] = string_gene_p2.loc[:,["Gene stable ID", "combined_score"]]
string_gene = string_gene.dropna()
string_gene.shape

(469024, 3)

In [7]:
string_sorted = string_gene.sort_values(by=["gene1", "combined_score"], ascending=[False, False])

#keep from gene-gene connection only the strongest one
string_filtered = string_sorted.drop_duplicates(subset=['gene1', 'gene2'], keep='first')
string_filtered = string_filtered.reset_index(drop=True)
string_filtered.shape

(468792, 3)

A string adathalmaz duplán tartalmazza az éleket, mivel a legtöbb gráfos könyvtár így várja el az élek megadását ( gene1 - gene2, gene2 - gene1 formában).

In [8]:
string_filtered["tuple"] = string_filtered.apply(lambda row: tuple(sorted([row['gene1'], row['gene2']])), axis=1)
string_filtered = string_filtered.sort_values(by="tuple")
string_filtered = string_filtered.drop("tuple", axis=1)
string_filtered.head()

Unnamed: 0,gene1,gene2,combined_score
468791,ENSG00000000003,ENSG00000137575,0.741
250307,ENSG00000137575,ENSG00000000003,0.741
468786,ENSG00000000005,ENSG00000011465,0.79
460897,ENSG00000011465,ENSG00000000005,0.79
468787,ENSG00000000005,ENSG00000108821,0.774


A legnagyobbegybefüggő gráffal érdemes dolgozni, a többi a tanítás során úgy sem fog szerepet játszani, viszont elvihetik az adatokat egy rossz írányba az információikkal.

In [9]:
G = nx.from_pandas_edgelist(df=string_filtered, source='gene1', target='gene2', edge_attr='combined_score')
connected_components = list(nx.connected_components(G))
largest_cc = max(connected_components, key=len)
largest_subgraph = G.subgraph(largest_cc).copy()
# largest_subgraph.nodes()


In [10]:
edges_data = largest_subgraph.edges(data=True)
edges_list = [(u, v, d.get('combined_score', 1)) for u, v, d in edges_data]
string_connected = pd.DataFrame(edges_list, columns=['gene1', 'gene2', 'combined_score'])
string_connected.head()

Unnamed: 0,gene1,gene2,combined_score
0,ENSG00000000003,ENSG00000137575,0.741
1,ENSG00000137575,ENSG00000010278,0.899
2,ENSG00000137575,ENSG00000013725,0.876
3,ENSG00000137575,ENSG00000065427,0.807
4,ENSG00000137575,ENSG00000074319,0.859


In [11]:
string_connected.to_csv("./data/raw/gene_graph.csv", index=False, sep="\t")

In [12]:
string_filtered.shape[0]/2 - string_connected.shape[0]

215.0

In [13]:
# collect the genes from the graph, i wil need it in disgenet gene filter
g1 = string_connected["gene1"]
g2 = string_connected["gene2"]
string_genes = pd.DataFrame(pd.concat([g1, g2], ignore_index=True), columns=["genes"])
string_genes = string_genes.drop_duplicates().sort_values(by="genes").reset_index(drop=True)
string_genes

Unnamed: 0,genes
0,ENSG00000000003
1,ENSG00000000005
2,ENSG00000000419
3,ENSG00000000457
4,ENSG00000000460
...,...
15788,ENSG00000287080
15789,ENSG00000288000
15790,ENSG00000288436
15791,ENSG00000288520


<h1>Disgenet adathalmaz feldolgozása</h1>


Disgenet esetében át kell váltani a kapcsolótábla segítségével a gén névről a gén azonosító használatára.

In [19]:
gen_name_gen_id = pd.read_csv("./data/gene_name_gene_id_map.txt", sep='\t', header=0)

#rename columns
gen_name_gen_id["gene_id"] = gen_name_gen_id["Gene stable ID"]
gen_name_gen_id["gene_name"] = gen_name_gen_id["Gene name"]
gen_name_gen_id = gen_name_gen_id[["gene_id", "gene_name"]]

gen_name_gen_id = gen_name_gen_id.drop_duplicates(subset=["gene_id"])
gen_name_gen_id

Unnamed: 0,gene_id,gene_name
0,ENSG00000261657,SLC25A26
1,ENSG00000223116,AL157931.1
2,ENSG00000233440,HMGA1P6
3,ENSG00000207157,RNY3P4
4,ENSG00000229483,LINC00362
...,...,...
66973,ENSG00000262334,RPH3AL
66974,ENSG00000262737,RP11-1260E13.1
66975,ENSG00000263267,RP11-1260E13.4
66976,ENSG00000262336,RP11-1260E13.2


In [20]:
disgenet = pd.read_csv("./data/disgenet_curated_gene_disease_associations.tsv", sep='\t', header=0)
disgenet

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source
0,1,A1BG,0.700,0.538,C0019209,Hepatomegaly,phenotype,C23;C06,Finding,0.30,1.000,2017.0,2017.0,1,0,CTD_human
1,1,A1BG,0.700,0.538,C0036341,Schizophrenia,disease,F03,Mental or Behavioral Dysfunction,0.30,1.000,2015.0,2015.0,1,0,CTD_human
2,2,A2M,0.529,0.769,C0002395,Alzheimer's Disease,disease,C10;F03,Disease or Syndrome,0.50,0.769,1998.0,2018.0,3,0,CTD_human
3,2,A2M,0.529,0.769,C0007102,Malignant tumor of colon,disease,C06;C04,Neoplastic Process,0.31,1.000,2004.0,2019.0,1,0,CTD_human
4,2,A2M,0.529,0.769,C0009375,Colonic Neoplasms,group,C06;C04,Neoplastic Process,0.30,1.000,2004.0,2004.0,1,0,CTD_human
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84033,109580095,HBB-LCR,0.743,0.115,C0002875,Cooley's anemia,disease,C16;C15,Disease or Syndrome,0.30,,,,0,0,CTD_human
84034,109580095,HBB-LCR,0.743,0.115,C0005283,beta Thalassemia,disease,C16;C15,Disease or Syndrome,0.30,,,,0,0,CTD_human
84035,109580095,HBB-LCR,0.743,0.115,C0019025,Hemoglobin F Disease,disease,C16;C15,Disease or Syndrome,0.30,,,,0,0,CTD_human
84036,109580095,HBB-LCR,0.743,0.115,C0085578,Thalassemia Minor,disease,C16;C15,Disease or Syndrome,0.30,,,,0,0,CTD_human


A disgenetet érdemes, majd tovább szűrni az alapján, hogy hány ismert gén kapcsolata van.

In [21]:
#switch from gene name to gene id
disgenet_join = pd.merge(disgenet, gen_name_gen_id, left_on='geneSymbol', right_on='gene_name', how='inner')
disgenet_join = disgenet_join.drop_duplicates()
disgenet_with_gene_id = disgenet_join.drop(columns=["geneSymbol", "gene_name", "geneId"])

#reorder the colums
disgenet_with_gene_id = disgenet_with_gene_id.loc[:, ["gene_id", "diseaseId"]]
disgenet_with_gene_id.columns = ["geneId", "diseaseId"]
disgenet_with_gene_id

Unnamed: 0,geneId,diseaseId
0,ENSG00000121410,C0019209
1,ENSG00000121410,C0036341
2,ENSG00000175899,C0002395
3,ENSG00000175899,C0007102
4,ENSG00000175899,C0009375
...,...,...
92954,ENSG00000251908,C4722327
92955,ENSG00000222259,C2931456
92956,ENSG00000222259,C4722327
92957,ENSG00000207222,C2931456


In [22]:
# Keep only those genes in disgenet_result which are included in string_genes
disgenet_genes_filtered = disgenet_with_gene_id[disgenet_with_gene_id["geneId"].isin(string_genes["genes"])]
disgenet_genes_filtered

Unnamed: 0,geneId,diseaseId
0,ENSG00000121410,C0019209
1,ENSG00000121410,C0036341
2,ENSG00000175899,C0002395
3,ENSG00000175899,C0007102
4,ENSG00000175899,C0009375
...,...,...
92931,ENSG00000263155,C0087031
92932,ENSG00000263155,C3495559
92933,ENSG00000263155,C3714758
92934,ENSG00000263155,C4552091


In [23]:
disgenet_result = disgenet_genes_filtered.groupby("diseaseId").filter(lambda x: len(x) > 7)
disgenet_result

Unnamed: 0,geneId,diseaseId
0,ENSG00000121410,C0019209
1,ENSG00000121410,C0036341
2,ENSG00000175899,C0002395
3,ENSG00000175899,C0007102
4,ENSG00000175899,C0009375
...,...,...
92931,ENSG00000263155,C0087031
92932,ENSG00000263155,C3495559
92933,ENSG00000263155,C3714758
92934,ENSG00000263155,C4552091


In [24]:
disgenet_result.to_csv("./data/raw/disgenet_with_gene_id.csv", sep="\t", index=False)

<h1>GTEX adathalmaz feldolgozása</h1>

In [84]:
gtex = pd.read_csv("./data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct", sep='\t', header=0)
gtex_sorted = gtex.sort_values(by="Name", ascending=True)

# remove non interessant .x endings from gene ids
gtex_sorted["Name"] = gtex_sorted.apply(lambda row: row["Name"].split(".")[0], axis=1)
gtex_sorted= gtex_sorted.reset_index(drop=True)
gtex_sorted

Unnamed: 0,Name,Description,Adipose - Subcutaneous,Adipose - Visceral (Omentum),Adrenal Gland,Artery - Aorta,Artery - Coronary,Artery - Tibial,Bladder,Brain - Amygdala,...,Skin - Not Sun Exposed (Suprapubic),Skin - Sun Exposed (Lower leg),Small Intestine - Terminal Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole Blood
0,ENSG00000000003,TSPAN6,27.61980,24.00720,14.490600,9.333330,9.208420,7.393960,13.787200,6.189620,...,8.13486,8.84861,14.193900,7.775630,9.929470,65.609500,19.163000,28.292200,26.099300,0.071083
1,ENSG00000000005,TNMD,27.25540,12.27290,0.034486,0.132502,0.499037,0.699068,0.388347,0.110950,...,1.59104,3.41662,0.393012,0.153402,0.137227,0.184820,0.273954,0.379664,0.219229,0.000000
2,ENSG00000000419,DPM1,56.12930,53.30300,56.822000,64.709400,59.266800,63.760300,54.111700,17.159200,...,50.86900,49.23020,45.820000,55.823900,41.499000,78.189500,61.302100,70.791800,60.445300,12.483400
3,ENSG00000000457,SCYL3,7.19633,5.19482,5.183300,5.696390,6.190740,6.468040,8.823700,1.620270,...,7.67282,8.78579,6.405000,6.649510,5.297310,7.557520,9.584620,9.213180,8.623300,3.062680
4,ENSG00000000460,C1orf112,2.74343,1.85840,1.388130,1.971830,2.165610,2.358640,2.721160,0.538449,...,2.19681,2.38017,2.197030,2.565050,1.413090,10.709700,2.851260,4.001080,3.297850,0.657812
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56195,ENSG00000284592,RP11-71G12.2,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
56196,ENSG00000284594,MIR7847,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
56197,ENSG00000284595,MIR6785,0.00000,0.00000,0.585770,0.000000,0.109876,0.000000,0.000000,0.000000,...,0.00000,0.00000,0.426180,0.552425,0.000000,0.719890,0.596907,0.624359,0.000000,0.000000
56198,ENSG00000284596,MIR4467,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


A String-ben elérhető gének szerinti join a GTEx adathalmazon, mivel a gráfban szerebló gén csomópontokhoz szükséges adatok relevánsak számunkra. <br>
Ha egy gén a GTEx-ben nem szerepelt, akkor az eredeti GTEx-ben szereplő többi gén azonos tulajdonságainak az átlagát használva kiegészítem.

In [85]:
# I need the string ones too, because they wil be in the graph.
# And if the genes are not in the graph, i dont need those information from gtex
gtex_mapped = pd.merge(gtex_sorted, string_genes, left_on='Name', right_on='genes', how='right')
gtex_mapped = gtex_mapped.drop(columns=["Name"])

#move genes column to the first column
gtex_mapped = gtex_mapped[["genes"] + [col for col in gtex_mapped.columns if col != "genes"]]

In [86]:
gtex_mapped = gtex_mapped.drop(columns=["Description"])
cols = [col for col in gtex_mapped.columns if col != "genes"]
gtex_mapped.loc[:, cols] = gtex_mapped.loc[:, cols].fillna(gtex_sorted.loc[:, cols].mean())
gtex_mapped.to_csv("./data/raw/gtex_genes.csv", sep="\t", index=False)
gtex_mapped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15811 entries, 0 to 15810
Data columns (total 55 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   genes                                      15811 non-null  object 
 1   Adipose - Subcutaneous                     15811 non-null  float64
 2   Adipose - Visceral (Omentum)               15811 non-null  float64
 3   Adrenal Gland                              15811 non-null  float64
 4   Artery - Aorta                             15811 non-null  float64
 5   Artery - Coronary                          15811 non-null  float64
 6   Artery - Tibial                            15811 non-null  float64
 7   Bladder                                    15811 non-null  float64
 8   Brain - Amygdala                           15811 non-null  float64
 9   Brain - Anterior cingulate cortex (BA24)   15811 non-null  float64
 10  Brain - Caudate (basal