## Preprocessing

### 1. Lowercase disease names and use these names to identify diseases.

In [11]:
import pandas as pd
import networkx as nx
import localization

# Load the tab-separated gene-disease association table.
disease_gene_data = pd.read_csv('datasets/disease_gene.tsv', sep='\t')
print("Before Lowercasing:")
print(disease_gene_data['diseaseName'].head())

# TODO: decide whether duplicates should also be removed here. The teacher's
# code does it (line below), but the exercise statement does not ask for it.
#disease_gene_data = disease_gene_data[disease_gene_data.diseaseType == 'disease'][['geneSymbol', 'diseaseName']].drop_duplicates()

# Disease names are used as identifiers later on, so normalise their case.
disease_gene_data = disease_gene_data.assign(
    diseaseName=disease_gene_data['diseaseName'].str.lower()
)

print("\nAfter Lowercasing:")
print(disease_gene_data['diseaseName'].head())

Before Lowercasing:
0                Hepatomegaly
1               Schizophrenia
2         Alzheimer's Disease
3    Malignant tumor of colon
4           Colonic Neoplasms
Name: diseaseName, dtype: object

After Lowercasing:
0                hepatomegaly
1               schizophrenia
2         alzheimer's disease
3    malignant tumor of colon
4           colonic neoplasms
Name: diseaseName, dtype: object


### 2. Filter the disease–gene associations: keep only diseases whose type is not “group” or “phenotype” and that have at least 10 associated genes.

In [10]:
# Minimum number of distinct genes a disease must have to be kept.
MIN_GENES_PER_DISEASE = 10

# Distinct genes per disease, counted on the table as it is when this cell runs.
# Because `disease_gene_data` is overwritten below, re-running this cell on the
# already-filtered table reports no disease under the threshold.
gene_count_per_disease = disease_gene_data.groupby('diseaseId')['geneId'].nunique()
has_enough_genes = gene_count_per_disease >= MIN_GENES_PER_DISEASE
diseases_to_keep = gene_count_per_disease[has_enough_genes].index
# The original name said "delete" although these are the ids that are kept;
# the alias only exists in case a later cell still refers to the old name.
diseases_to_delete = diseases_to_keep

print("Diseases with at least", MIN_GENES_PER_DISEASE, "genes: ", int(has_enough_genes.sum()))
print("Diseases with fewer than", MIN_GENES_PER_DISEASE, "genes: ", int((~has_enough_genes).sum()))
print("\nBefore Filtering:")
# Each row is one disease-gene association, so report rows and diseases separately.
print("Associations: ", len(disease_gene_data))
print("Diseases: ", disease_gene_data['diseaseId'].nunique())

# Keep associations whose disease type is neither "group" nor "phenotype"
# and whose disease reaches the gene threshold.
disease_gene_data = disease_gene_data[
    (disease_gene_data['diseaseType'] != 'group') &
    (disease_gene_data['diseaseType'] != 'phenotype') &
    (disease_gene_data['diseaseId'].isin(diseases_to_keep))
]

print("\nAfter Filtering:")
print("Associations: ", len(disease_gene_data))
print("Diseases: ", disease_gene_data['diseaseId'].nunique())

Diseases with more than 10 genes:  972
Diseases with less than 10 genes:  0

Before Filtering:
Diseases number 44959

After Filtering:
Diseases number 44959


### 3. Filter drug targets so that they are related to Humans. 

In [3]:
# Load the drug-target table, then keep the rows whose organism is exactly 'Human'.
drug_target_data = pd.read_csv('datasets/drug_target.csv')

is_human_target = drug_target_data['organism'] == 'Human'
filtered_drug_targets = drug_target_data[is_human_target]

### 4. Use drug names to identify drugs

In [4]:
# Load the PPI table from datasets/PPI.csv.
# NOTE(review): the "Network medicine" cell reads 'datasets/ppi.csv' (lower
# case). On a case-sensitive filesystem only one spelling can exist - confirm
# which one is correct.
ppi_data = pd.read_csv('datasets/PPI.csv')

# TODO: step 4 ("use drug names to identify drugs") is not implemented yet;
# this cell only loads the PPI table and does not touch the drug data.

## Network medicine

In [5]:
# Load the interaction table and reduce it to unique, fully specified symbol pairs.
ppi = pd.read_csv('datasets/ppi.csv')
ppi = ppi[['Symbol_A', 'Symbol_B']].drop_duplicates().dropna()

# Every remaining row becomes one edge between its two symbols.
gppi = nx.from_pandas_edgelist(ppi, source='Symbol_A', target='Symbol_B')

# Discard edges that link a node to itself.
gppi.remove_edges_from(nx.selfloop_edges(gppi))

In [6]:
def get_disease_module_info(dis_name, gda, ppi):
    """Collect the genes of one disease and locate them in an interaction graph.

    Parameters
    ----------
    dis_name : str
        Compared for exact equality against ``gda.diseaseName``.
    gda : pandas.DataFrame
        Gene-disease associations; must provide ``diseaseName`` and
        ``geneSymbol`` columns.
    ppi : graph
        Graph whose ``nodes`` are compared against the gene symbols; it is
        also handed to ``localization.get_lcc``.

    Returns
    -------
    tuple
        ``(genes, genes_in_ppi, genes_in_lcc)`` where ``genes`` is the list of
        unique gene symbols of the disease, ``genes_in_ppi`` the subset found
        among ``ppi.nodes`` (in node order), and ``genes_in_lcc`` whatever
        ``localization.get_lcc(ppi, genes_in_ppi)`` returns (presumably the
        genes in the largest connected component - confirm in ``localization``).

    The three sizes are also printed.
    """
    disease_rows = gda[gda.diseaseName == dis_name]
    genes = list(disease_rows.geneSymbol.unique())

    # Look the genes up in the graph that was passed in, not in the notebook
    # global `gppi`, so the function works for any graph the caller supplies.
    # A set keeps the membership test cheap for diseases with many genes.
    gene_set = set(genes)
    genes_in_ppi = [node for node in ppi.nodes if node in gene_set]
    genes_in_lcc = localization.get_lcc(ppi, genes_in_ppi)

    print('Number of disease genes: ',len(genes))
    print('Number of disease genes in the PPI: ',len(genes_in_ppi))
    print('Number of disease genes in the LCC: ',len(genes_in_lcc))

    return genes, genes_in_ppi, genes_in_lcc

In [7]:
# Tuple for dermatitis: (all disease genes, genes in the PPI, genes in the LCC).
dermatisis = get_disease_module_info(dis_name='dermatitis', gda=disease_gene_data, ppi=gppi)

Number of disease genes:  16
Number of disease genes in the PPI:  16
Number of disease genes in the LCC:  7


### Disease separation
Let's compare Dermatitis and Psoriasis

In [8]:
import separation

# Gene lists for psoriasis, then its separation from dermatitis in the PPI
# graph. Both diseases are represented by their LCC gene list (tuple element 2).
psoriasis = get_disease_module_info('psoriasis', disease_gene_data, gppi)
psoriasis_lcc = psoriasis[2]
dermatitis_lcc = dermatisis[2]
separation_dermatitis_psoriasis = separation.get_separation(gppi, psoriasis_lcc, dermatitis_lcc)
print("separation_dermatitis_psoriasis : ",separation_dermatitis_psoriasis)

Number of disease genes:  57
Number of disease genes in the PPI:  57
Number of disease genes in the LCC:  30
separation_dermatitis_psoriasis :  1.6216216216216217


Let's compare Dermatitis and Schizophrenia

In [9]:
# Gene lists for schizophrenia, then its separation from dermatitis in the PPI
# graph. Both diseases are represented by their LCC gene list (tuple element 2).
schizophrenia = get_disease_module_info('schizophrenia', disease_gene_data, gppi)
schizophrenia_lcc = schizophrenia[2]
dermatitis_lcc = dermatisis[2]
separation_dermatitis_schizophrenia = separation.get_separation(gppi, schizophrenia_lcc, dermatitis_lcc)
print("separation_dermatitis_schizophrenia : ",separation_dermatitis_schizophrenia)

Number of disease genes:  883
Number of disease genes in the PPI:  846
Number of disease genes in the LCC:  683
separation_dermatitis_schizophrenia :  1.8753623188405797
