In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.cluster import KMeans
from ontobio.ontol_factory import OntologyFactory
from ontobio.assoc_factory import AssociationSetFactory
import os
import json

In [2]:
FOLDER = "./../data/"

#### Data preparation for clustering

In [13]:
gene_disease_csv = pd.read_csv(FOLDER+ "gene_disease_view.csv")
gene_disease_series = gene_disease_csv.groupby(["geneName"])["diseaseId"].apply(list)
gene_disease_dict = gene_disease_series.to_dict()

gene_to_list_of_diseases = []

#Could be inmproved propably somehow with pandas functionality
for gene_id, disease_ids in gene_disease_dict.items():
    new_dict = {'diseaseId': disease_ids, 'geneName': str(gene_id)}
    gene_to_list_of_diseases.append(new_dict)

####  Mapping

In [61]:
gaf_columns = [
        "DB", "DB_Object_ID", "DB_Object_Symbol", "Qualifier", "GO_ID",
        "DB:Reference", "Evidence_Code", "With_or_From", "Aspect", "DB_Object_Name",
        "DB_Object_Synonym", "DB_Object_Type", "Taxon", "Date", "Assigned_By"
]
# ignoring first column, There were was only one value and it made as double index (atleast I think so).
gaf_data = pd.read_csv(FOLDER + "goa_human.gaf", sep="\t", comment="!", names=gaf_columns, skiprows=12,usecols=range(1,16))
mapping_gene_to_go = pd.Series(gaf_data.Qualifier.values,index=gaf_data.DB_Object_ID).to_dict()

gaf_data
mapping_geneName_to_UniProt = pd.Series(gaf_data.DB.values,index=gaf_data.DB_Object_ID).to_dict()
mapping_geneName_to_UniProt["NUDT4B"]

#### Feature extraction

In [14]:
vec = DictVectorizer()
matrixGeneDisease = vec.fit_transform(gene_to_list_of_diseases)
matrixGeneDisease

<21666x51836 sparse matrix of type '<class 'numpy.float64'>'
	with 3282990 stored elements in Compressed Sparse Row format>

#### Clustering

In [15]:
cluster = KMeans()
result = cluster.fit(matrixGeneDisease)
result.labels_

array([5, 5, 5, ..., 5, 5, 5])

### Creating arrays of geneIdentificators which ontobio takes. (for each cluster)  

In [100]:
notInUniprot = []
clusters = []
for i in range(max(result.labels_) + 1):
    clusters.append([])

for i in range(len(result.labels_)):
    if gene_to_list_of_diseases[i]["geneName"] in mapping_geneName_to_UniProt:
        clusters[result.labels_[i]].append("UniProtKB:" +  mapping_geneName_to_UniProt[gene_to_list_of_diseases[i]["geneName"]])
        
    else:
        notInUniprot.append(gene_to_list_of_diseases[i]["geneName"])

In [20]:
HUMAN = 'NCBITaxon:9606'
PART_OF = 'BFO:0000050'

ofactory = OntologyFactory()
ont = ofactory.create("GO").subontology(relations=['subClassOf', PART_OF])

afactory = AssociationSetFactory()
aset = afactory.create(ontology=ont,
                       subject_category='gene',
                       object_category='function',
                       taxon=HUMAN)



### TODO: 
- Need to add geneNames which are not in UniProt, while googling the names (for example: "A1BG-AS1" or "ABALON") it seems like they are in RNACentral
- Add them to clusters where they belong...

In [119]:
#test to see if it works
for j in notInUniprot:  
    if(aset.inferred_types(i)):
        print(i)
        break

In [125]:
# looking at the origin of subjects
for i in aset.subjects:
    if i[0:14] != "RNAcentral:URS" and i[0:10] != "UniProtKB:" and i[0:14] != "ComplexPortal:":
        print(i)
        

### Metric:
- simple metric, for each pair of genes jaccard_similarity is computed and afterwards divided by the number of these pairs

In [114]:
def metricCluster(cluster):
    counter = 0
    result = 0
    size_of_cluster = len(cluster)
    for i in range(size_of_cluster):
        for j in range(i + 1, size_of_cluster):
            result += aset.jaccard_similarity(cluster[i],cluster[j])
            counter +=1
    return result/counter
            

In [115]:
metricCluster(clusters[0])

0.12333764976105262