### Data download

In [1]:
#Necessary libraries
import os
import pandas as pd
import numpy as np

In [2]:
#Show the current working directory to make sure we are working in the correct one.
#If it is not the expected directory, switch to it with os.chdir('path').
os.getcwd()

'/Users/Jose/Desktop/GitHub_Projects/gene_disease_association/src/notebooks'

In [3]:
#First of all we fix NumPy's global random seed.
#The same `seed` value is reused in later steps (train/test split and model definition).
seed = 42
np.random.seed(seed)

In [4]:
#Library needed to obtain the dataset from the Therapeutics Data Commons (TDC) database
#We only have to install it if we don't already have it

#!pip install PyTDC

In [5]:
#To download the data into the data directory we have to change to that directory.
#The directory can be overridden with the GDA_DATA_DIR environment variable, so the notebook
#can run on other machines; when it is not set, the original local path is used.
DATA_DIR = os.environ.get(
    'GDA_DATA_DIR',
    '/Users/Jose/Desktop/GitHub_Projects/gene_disease_association/data',
)
os.chdir(DATA_DIR)

In [6]:
#Data collection: TDC provides the call that downloads the data.
#Here we load the Gene-Disease Association (GDA) dataset named 'DisGeNET'.
from tdc.multi_pred import GDA

gd_data = GDA(name='DisGeNET')

#TDC can also hand back the data already split into train and test, but in this case
#we are going to do the split ourselves. For reference, this is the call for the pre-split data:
#split = gd_data.get_split()

Downloading...
100%|██████████| 63.9M/63.9M [00:09<00:00, 7.08MiB/s]
Loading...
Done!


In [8]:
#The TDC database supplies the raw data already "processed";
#get_data() hands it back and we keep it as the DataFrame `df`.
df = gd_data.get_data()

In [10]:
#Preview the first 3 rows. As we can see, we have 5 columns: Gene_ID, Gene (a sequence written in
#the one-letter IUPAC code), Disease_ID, Disease (with the disease's name and a brief description),
#and Y (values from 0 to 1).
df.head(3)
            

Unnamed: 0,Gene_ID,Gene,Disease_ID,Disease,Y
0,1,MSMLVVFLLLWGVTWGPVTEAAIFYETQPSLWAESESLLKPLANVT...,C0019209,Hepatomegaly: Abnormal enlargement of the liver.,0.3
1,1,MSMLVVFLLLWGVTWGPVTEAAIFYETQPSLWAESESLLKPLANVT...,C0036341,Schizophrenia: Schizophrenia is highly heritab...,0.3
2,2,MGKNKLLHPSLVLLLLVLLPTDASVSGKPQYMVLVPSLLHTETTEK...,C0002395,Alzheimer's Disease: Alzheimer disease is the ...,0.5


In [11]:
#To avoid altering the original data (df), we work on a copy called gd
gd = df.copy()

### Data processing 

In [12]:
#We change the text of the Disease column to lowercase and preview the result:

gd['Disease'] = gd['Disease'].str.lower()
gd.head()

Unnamed: 0,Gene_ID,Gene,Disease_ID,Disease,Y
0,1,MSMLVVFLLLWGVTWGPVTEAAIFYETQPSLWAESESLLKPLANVT...,C0019209,hepatomegaly: abnormal enlargement of the liver.,0.3
1,1,MSMLVVFLLLWGVTWGPVTEAAIFYETQPSLWAESESLLKPLANVT...,C0036341,schizophrenia: schizophrenia is highly heritab...,0.3
2,2,MGKNKLLHPSLVLLLLVLLPTDASVSGKPQYMVLVPSLLHTETTEK...,C0002395,alzheimer's disease: alzheimer disease is the ...,0.5
3,2,MGKNKLLHPSLVLLLLVLLPTDASVSGKPQYMVLVPSLLHTETTEK...,C0007102,malignant tumor of colon: a primary or metasta...,0.31
4,2,MGKNKLLHPSLVLLLLVLLPTDASVSGKPQYMVLVPSLLHTETTEK...,C0009375,colonic neoplasms: a benign or malignant neopl...,0.3


In [None]:
#Now we split each Disease entry at the ':' to separate the disease's name from its description


In [15]:
#Split every Disease entry on ':'; each row becomes a list whose first element is the disease's name
#and whose remaining element(s) hold the description. Preview the first 3 rows.
dis_list = gd['Disease'].str.split(':')
dis_list.head(3)

0    [hepatomegaly,  abnormal enlargement of the li...
1    [schizophrenia,  schizophrenia is highly herit...
2    [alzheimer's disease,  alzheimer disease is th...
Name: Disease, dtype: object

In [16]:
#The next step is to modify the original Disease column, which has the name and the description,
#replacing it with just the disease's name (the text before the first ':').
#.str[0] takes the first piece of every split list in one vectorised step, so no manual counter is
#needed and the result does not depend on the index labels of dis_list.
name_list = dis_list.str[0].tolist()

#Finally we can add the name into the dataframe
gd['Disease'] = name_list
gd.head(3)

Unnamed: 0,Gene_ID,Gene,Disease_ID,Disease,Y
0,1,MSMLVVFLLLWGVTWGPVTEAAIFYETQPSLWAESESLLKPLANVT...,C0019209,hepatomegaly,0.3
1,1,MSMLVVFLLLWGVTWGPVTEAAIFYETQPSLWAESESLLKPLANVT...,C0036341,schizophrenia,0.3
2,2,MGKNKLLHPSLVLLLLVLLPTDASVSGKPQYMVLVPSLLHTETTEK...,C0002395,alzheimer's disease,0.5


In [20]:
#We apply a Label Encoder from sklearn to transform the non-numerical labels to numerical ones.
#Each column gets its own encoder: refitting a single encoder on Disease would discard the Gene
#mapping, making it impossible to translate the Gene codes back to the original labels.
#NOTE(review): the execution counts show this cell (In [20]) was run after the get_dummies cell
#(In [17]) that follows it in the notebook; with "Run All" it runs first, so get_dummies would
#receive the encoded Gene values instead of the sequences. Confirm the intended order.
from sklearn import preprocessing

gene_encoder = preprocessing.LabelEncoder()
disease_encoder = preprocessing.LabelEncoder()
gd['Gene'] = gene_encoder.fit_transform(gd['Gene'].values)
gd['Disease'] = disease_encoder.fit_transform(gd['Disease'].values)

#Kept for backward compatibility: `le` used to end up fitted on the Disease column.
le = disease_encoder

In [17]:
#For a proper data analysis we one-hot encode (get_dummies) the Gene and Disease_ID columns.
seq = pd.get_dummies(gd['Gene'])
dis_name = pd.get_dummies(gd['Disease_ID'])

#Join both sets of indicator columns to gd and drop the original columns we no longer need
#(Gene_ID and Disease_ID). The Y column is kept.
gd = gd.join(seq).join(dis_name).drop(columns=['Gene_ID', 'Disease_ID'])
gd.head(3)

Unnamed: 0,Gene,Disease,Y,AADHQPILKTVKASDEDCQLRISDRIRETSDLEDSWDESSGAGCSQGTPSYSSSHSLFRGAVAPCQSSPMARLGVSGEPSPCTSTNRSTPGVASTPQTPVSSSRAGFVSGGDRPLTSEPPPRWARRRRRSVARTIAAELAENRRLARELSKREEEKLDRLIAIGEEASAQQDTANELRRDAVIAVRRLATAVEEATGAFQLGLEKLLQRLISNTKS,AARDSGTGGGSEKMRGSGPRGAGRRRPPSGGGDTPITPASLAGCYSAPRRAPLWTCLLLCAALRTLLASPSNEVNLLDSRTVMGDLGWIAFPKNGWEEIGEVDENYAPIHTYQVCKVMEQNQNNWLLTSWISNEGASRIFIELKFTLRDCNSLPGGLGTCKETFNMYYFESDDQNGRNIKENQYIKIDTIAADESFTELDLGDRVMKLNTEVRDVGPLSKKGFYLAFQDVGACIALVSVRVYYKKCPSVVRHLAVFPDTITGADSSQLLEVSGSCVNHSVTDEPPKMHCSAEGEWLVPIGKCMCKAGYEEKNGTCQVCRPGFFKASPHIQSCGKCPPHSYTHEEASTSCVCEKDYFRRESDPPTMACTRPPSAPRNAISNVNETSVFLEWIPPADTGGRKDVSYYIACKKCNSHAGVCEECGGHVRYLPRQSGLKNTSVMMVDLLAHTNYTFEIEAVNGVSDLSPGARQYVSVNVTTNQAAPSPVTNVKKGKIAKNSISLSWQEPDRPNGIILEYEIKYFEKDQETSYTIIKSKETTITAEGLKPASVYVFQIRARTAAGYGVFSRRFEFETTPVSVAASSDQSQIPVIAVSVTVGVILLAVVIGVLLSGSCCECGCGRASSLCAVAHPSLIWRCGYSKAKQDPEEEKMHFHNGHIKLPGVRTYIDPHTYEDPNQAVHEFAKEIEASCITIERVIGAGEFGEVCSGRLKLPGKRELPVAIKTLKVGYTEKQRRDFLGEASIMGQFDHPNIIHLEGVVTKSKPVMIVTEYMENGSLDTFLKKNDGQFTVIQLVGMLRGISAGMKYLSDMGYVHRDLAARNILINSNLVCKVSDFGLSRVLEDDPEAAYTTRGGKIPIRWTAPEAIAFRKFTSASDVWSYGIVMWEVVSYGERPYWEMTNQDVIKAVEEGYRLPSPMDCPAALYQLMLDCWQKERNSRPKFDEIVNMLDKLIRNPSSLKTLVNASCRVSNLLAEHSPLGSGAYRSVGEWLEAIKMGRYTEIFMENGYSSMDAVAQVTLE,AASEPDTAGSVRGLPTAHCPVVQDNRTLGDSSAGEIALSTRNVSETRYVSELTLVRVKVAEAGHYTMRAFHEDAEVQLSFQLQINVPVRVLELSESHPDSGEQTVRCRGRGMPQPNIIWSACRDLKRCPRELPPTLLGNSSEEESQLETNVTYWEEEQEFEVVSTLRLQHVDRPLSVRCTLRNAVGQDTQEVIVVPHSLPFKVVVISAILALVVLTIISLIILIMLWQKKPRYEIRWKVIESVSSDGHEYIYVDPMQLPYDSTWELPRDQLVLGRTLGSGAFGQVVEATAHGLSHSQATMKVAVKMLKSTARSSEKQALMSELKIMSHLGPHLNVVNLLGACTKGGPIYIITEYCRYGDLVDYLHRNKHTFLQHHSDKRRPPSAELYSNALPVGLPLPSHVSLTGESDGGYMDMSKDESVDYVPMLDMKGDVKYADIESSNYMAPYDNYVPSAPERTCRATLINESPVLSYMDLVGFSYQVANGMEFLASKNCVHRDLAARNVLICEGKLVKICDFGLARDIMRDSNYISKGSTFLPLKWMAPESIFNSLYTTLSDVWSFGILLWEIFTLGGTPYPELPMNEQFYNAIKRGYRMAQPAHASDEIYEIMQKCWEEKFEIRPPFSQLVLLLERLLGEGYKKKYQQVDEEFLRSDHPAILRSQARLPGFHGLRSPLDTSSVLYTAVQPNEGDNDYIIPLPDPKPEVADEGPLEGSPSLASSTLNEVNTSSTISCDSPLEPQD
EPEPEPQLELQVEPEPELEQLPDSGCPAPRAEAEDSFL,AAWSPAAAAPLLRGIRGLPLHHRMFATQTEGELRVTQILKEKFPRATAIKVTDISGTKRRNQRDAWIADIYLSPQTLTTPWLHRCCCLRPWMNFTDIILP,AEDEEVQQRLRAAPSLLLLLLWLLAVPGANAAPRSALYSPSDPLTLLQADTVRGAVLGSRSAWAVEFFASWCGHSIAFAPTWKALAEDVKRWRPALYLAALDCAEETNSAVCRDFNIPGFPTVRFFKAFTKNGSGAVFPVAGADVQTLRERLIDALESHHDTWPPACPPLEPAKLEEIDGFFARNNEEYLALIFEKGGSYLGREVALDLSQHKGVAVARVVNTEANVVRKFGVTDFPSCYLLFRNGSVSRVPVLMESRSFYTAYLQRLSGLTREAAQTTVAPTTANKIAPTVWKLADRSKIYMADLESALHYILRIEVGRFPVLEGQRLGGPEKVCGSSGQVFPGRPLVQNFLHSVNEWLKRQKRNKIPYSFFKTALDDRKEGAVLAKKVNWIGCQGSEPHFRGFPCSCGSSSTSRLCRQLGSKCRPLTGSTQGQGGPPSHPRLAALLLRLPRLRKPLRADAAASMHRVGSPNAAVLWLWSSHNRVNARLQVPPARTPSSPRCSGHPVNFVLPATMNAWMCPCGTWKPPSTSSRPTSPQATSSWTSLQLGQLPEGCAECAAAPELAMGALELESRNSTLDPGKPEMMKSPTNTTPHVPAEGPEASRPPKLHPGLRAAPGQEPPEHMADVQRNEQDEPLGQWHLRSETQGLHCWLSPGLRRTASGALWRSGAWAAAPSSWSTSLRPAGGPSWTGRGQWLQVLGGGFSYLDISLCVGLYPCPSWACWHVHLLPGQDKALNRMLATLQPEPPGEEAGEGAAISRHLKPPDPIPSPPTPCSLSGLEVWEIQENELLQ,AGERRSHARRHPRTRRSQSHQRSAAMEPSSKKLTGRLMLAVGGAVLGSLQFGYNTGVINAPQKVIEEFYNQTWVHRYGESILPTTLTTLWSLSVAIFSVGGMIGSFSVGLFVNRFGRRNSMLMMNLLAFVSAVLMGFSKLGKSFEMLILGRFIIGVYCGLTTGFVPMYVGEVSPTALRGALGTLHQLGIVVGILIAQVFGLDSIMGNKDLWPLLLSIIFIPALLQCIVLPFCPESPRFLLINRNEENRAKSVLKKLRGTADVTHDLQEMKEESRQMMREKKVTILELFRSPAYRQPILIAVVLQLSQQLSGINAVFYYSTSIFEKAGVQQPVYATIGSGIVNTAFTVVSLFVVERAGRRTLHLIGLAGMAGCAILMTIALALLEQLPWMSYLSIVAIFGFVAFFEVGPGPIPWFIVAELFSQGPRPAAIAVAGFSNWTSNFIVGMCFQYVEQLCGPYVFIIFTVLLVLFFIFTYFKVPETKGRTFDEIASGFRQGGASQSDKTPEELFHPLGADSQV,AGTSALEVLRRATIKRSRTEAMTRDSSDEHCVDISSVGTPLARASIKSAKVDGVSYFRHKERLLRISIRHMVKSQVFYWIVLSLVALNTACVAIVHHNQPQWLTHLLYYAEFLFLGLFLLEMSLKMYGMGPRLYFHSSFNCFDFGVTVGSIFEVVWAIFRPGTSFGISVLRALRLLRIFKITKYWASLRNLVVSLMSSMKSIISLLFLLFLFIVVFALLGMQLFGGRFNFNDGTPSANFDTFPAAIMTVFQILTGEDWNEVMYNGIRSQGGVSSGMWSAIYFIVLTLFGNYTLLNVFLAIAVDNLANAQELTKDEQEEEEAFNQKHALQKAKEVSPMSAPNMPSIERDRRRRHHMSMWEPRSSHLRERRRRHHMSVWEQRTSQLRKHMQMSSQEALNREEAPTMNPLNPLNPLSSLNPLNAHPSLYRRPRAIEGLALGLALEKFEEERISRGGSLKGDGGDRSSALDNQRTPLSLGQREPPWLARPCHGNCDPTQQEAGGGEAVVTFEDRARHRQSQRRSRHRRVRTEGKESSSASRSRSASQERSLD
EAMPTEGEKDHELRGNHGAKEPTIQEERAQDLRRTNSLMVSRGSGLAGGLDEADTPLVLPHPELEVGKHVVLTEQEPEGSSEQALLGNVQLDMGRVISQSEPDLSCITANTDKATTESTSVTVAIPDVDPLVDSTVVHISNKTDGEASPLKEAEIREDEEEVEKKKQKKEKRETGKAMVPHSSMFIFSTTNPIRRACHYIVNLRYFEMCILLVIAASSIALAAEDPVLTNSERNKVLRYFDYVFTGVFTFEMVIKMIDQGLILQDGSYFRDLWNILDFVVVVGALVAFALANALGTNKGRDIKTIKSLRVLRVLRPLKTIKRLPKLKAVFDCVVTSLKNVFNILIVYKLFMFIFAVIAVQLFKGKFFYCTDSSKDTEKECIGNYVDHEKNKMEVKGREWKRHEFHYDNIIWALLTLFTVSTGEGWPQVLQHSVDVTEEDRGPSRSNRMEMSIFYVVYFVVFPFFFVNIFVALIIITFQEQGDKMMEECSLEKNERACIDFAISAKPLTRYMPQNRHTFQYRVWHFVVSPSFEYTIMAMIALNTVVLMMKYYSAPCTYELALKYLNIAFTMVFSLECVLKVIAFGFLNYFRDTWNIFDFITVIGSITEIILTDSKLVNTSGFNMSFLKLFRAARLIKLLRQGYTIRILLWTFVQSFKALPYVCLLIAMLFFIYAIIGMQVFGNIKLDEESHINRHNNFRSFFGSLMLLFRSATGEAWQEIMLSCLGEKGCEPDTTAPSGQNENERCGTDLAYVYFVSFIFFCSFLMLNLFVAVIMDNFEYLTRDSSILGPHHLDEFVRVWAEYDRAACGRIHYTEMYEMLTLMSPPLGLGKRCPSKVAYKRLVLMNMPVAEDMTVHFTSTLMALIRTALDIKIAKGGADRQQLDSELQKETLAIWPHLSQKMLDLLVPMPKASDLTVGKIYAAMMIMDYYKQSKVKKQRQQLEEQKNAPMFQRMEPSSLPQEIIANAKALPYLQQDPVSGLSGRSGYPSMSPLSPQDIFQLACMDPTDDGQFQERQSLVVTDPSSMRRSFSTIRDKRSNSSWLEEFSMERSSENTYKSRRRSYHSSLRLSAHRLNSDSGHKSDTHRSGGRERGRSKERKHLLSPDVSRCNSEERGTQADWESPERRQSRSPSEGRSQTPNRQGTGSLSESSIPSVSDTSTPRRSRRQLPPVPPKPRPLLSYSSLIRHAGSISPPADGSEEGSPLTSQALESNNACLTESSNSPHPQQSQHASPQRYISEPYLALHEDSHASDCGEEETLTFEAAVATSLGRSNTIGSAPPLRHSWQMPNGHYRRRRRGGPGPGMMCGAVNNLLSDTEEDDKC,...,C4755257,C4755260,C4755264,C4755273,C4755276,C4755278,C4755299,C4755302,C4755309,C4757950
0,MSMLVVFLLLWGVTWGPVTEAAIFYETQPSLWAESESLLKPLANVT...,hepatomegaly,0.3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,MSMLVVFLLLWGVTWGPVTEAAIFYETQPSLWAESESLLKPLANVT...,schizophrenia,0.3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,MGKNKLLHPSLVLLLLVLLPTDASVSGKPQYMVLVPSLLHTETTEK...,alzheimer's disease,0.5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
#Print a summary of the dataframe (number of entries and columns, dtypes, memory usage)
#to make sure that all of the data is numeric
gd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52476 entries, 0 to 52475
Columns: 14497 entries, Gene to C4757950
dtypes: float64(1), int64(2), uint8(14494)
memory usage: 726.6 MB


### Machine Learning model

In [25]:

import xgboost as xg
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error as MAE

In [26]:
#Define the inputs for the train/test split: y is the target column Y,
#X is every other column of gd.
y = gd['Y']
X = gd.drop(columns=['Y'])

In [27]:
#Train/test split: test_size = 0.3 holds out 30% of the rows for testing,
#and random_state = seed reuses the seed defined at the top of the notebook.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.3, random_state = seed)

In [28]:
#Defining model parameters.
#If your PC is strong enough, you should also run a grid search to find the best parameters.
#The scikit-learn wrapper's own parameter names are used here (random_state, reg_alpha,
#learning_rate) instead of the native-booster aliases (seed, alpha, eta), so they are regular
#estimator parameters rather than extra keyword arguments forwarded to the booster.
xgb_r = xg.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=10,
    random_state=seed,
    reg_alpha=1,
    learning_rate=0.2,
    max_depth=5,
)

In [29]:
#Train the model on the training split; the value returned by fit() is kept as `model`
model = xgb_r.fit(train_X, train_y)

In [30]:
#Prediction: estimate Y for the held-out test features
pred = xgb_r.predict(test_X)

In [31]:
#Display the predicted values
pred

array([0.380014  , 0.3851451 , 0.38685638, ..., 0.38317055, 0.3851451 ,
       0.38685638], dtype=float32)

In [32]:
# Error metrics on the test set: each metric is computed once and then reported
mse = MSE(test_y, pred)
mae = MAE(test_y, pred)
rmse = np.sqrt(mse)
print(f'MSE: {mse}')
print(f'MAE: {mae}')
print("RMSE : % f" % rmse)

MSE: 0.017475753802072638
MAE: 0.09652567295995425
RMSE :  0.132196


In [36]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score

In [37]:
#10-fold cross-validated error of the regressor.
#'accuracy' is a classification metric and cannot score a continuous target such as Y, so the
#model is scored with the mean squared error instead. scikit-learn reports it negated
#('neg_mean_squared_error'), hence the sign flip before taking the square root to obtain the
#RMSE of each fold (the same error measure reported for the test set above).
cv_rmse = np.sqrt(-cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=10))
print(cv_rmse)

In [34]:
#Optionally persist the trained model to disk with pickle
import pickle

model_path = '/Users/Jose/Desktop/GitHub_Projects/gene_disease_association/src/models/model_V0.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
#To load the model
#The file is read from the same location the model was saved to. The working directory was changed
#to the data directory earlier in the notebook, so the bare file name 'model_V0.pkl' would not
#point at the saved file.
#NOTE: pickle.load can execute arbitrary code; only load pickle files you created or fully trust.
import pickle

with open('/Users/Jose/Desktop/GitHub_Projects/gene_disease_association/src/models/model_V0.pkl', 'rb') as f:
    model = pickle.load(f)