From 164541d4f021a2325071209fe156ee0df3747b6b Mon Sep 17 00:00:00 2001 From: Saahith Pochiraju Date: Tue, 21 Nov 2017 12:05:37 -0500 Subject: [PATCH] addition of intact complex database --- kinetic_datanator/data_source/intact.py | 100 ++++++++++++++++++++---- tests/data_source/test_intact.py | 12 ++- 2 files changed, 93 insertions(+), 19 deletions(-) diff --git a/kinetic_datanator/data_source/intact.py b/kinetic_datanator/data_source/intact.py index 9ed03e1..5ae4c7f 100644 --- a/kinetic_datanator/data_source/intact.py +++ b/kinetic_datanator/data_source/intact.py @@ -7,6 +7,9 @@ from six.moves.urllib.request import urlretrieve import zipfile from six import BytesIO +from ftplib import FTP +import os + Base = sqlalchemy.ext.declarative.declarative_base() @@ -39,16 +42,81 @@ class ProteinInteractions(Base): stoich_b = Column(String(255)) interaction_type = Column(String(255)) +class ProteinComplex(Base): + """ Represents protein complexes from the IntAct Database + + Attributes: + identifier (:obj:`str`): + name (:obj:`str`): + ncbi (:obj:`str`): + subunits (:obj:`str`): + evidence (:obj:`str`): + go_annot (:obj:`str`): + desc (:obj:`str`): + source (:obj:`str`): + """ + __tablename__ = 'Protein_Complex' + + identifier = Column(String(255), primary_key = True) + name = Column(String(255)) + ncbi = Column(String(255)) + subunits = Column(String(255)) + evidence = Column(String(255)) + go_annot = Column(String(255)) + desc = Column(String(255)) + source = Column(String(255)) class IntAct(data_source.HttpDataSource): """ A local SQLite copy of the IntAct Database""" base_model = Base - ENDPOINT_DOMAINS = {'intact' : 'ftp://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.zip'} + ENDPOINT_DOMAINS = {'intact' : 'ftp://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.zip', + 'complex' : 'ftp://ftp.ebi.ac.uk/pub/databases/intact/complex/current/complextab/'} + def load_content(self): #Downloads Content from FTP Server + self.add_complex() + self.add_interactions() + + def add_complex(self): + if not os.path.exists(self.cache_dirname+'/intact_complex'): + os.makedirs(self.cache_dirname+'/intact_complex') + + ftp = FTP('ftp.ebi.ac.uk') + ftp.login() + ftp.cwd('/pub/databases/intact/complex/current/complextab/') + filenames = ftp.nlst() + if not os.path.exists(self.cache_dirname+'/intact_complex/'+filenames[0]): + for filename in filenames: + local_filename = self.cache_dirname+'/intact_complex/'+filename + file = open(local_filename, 'wb') + ftp.retrbinary('RETR '+filename, file.write) + file.close() + ftp.quit() + + columns = ['#Complex ac', 'Recommended name', 'Taxonomy identifier', + 'Identifiers (and stoichiometry) of molecules in complex', 'Experimental evidence' , + 'Go Annotations', 'Description', 'Source'] + + new_columns = ['identifier', 'name', 'ncbi', 'subunits', 'evidence', 'go_annot', 'desc', 'source'] + + files = os.listdir(self.cache_dirname+'/intact_complex') + for tsv in files: + if 'README' in tsv: + continue + else: + dt = pd.read_csv(self.cache_dirname+'/intact_complex/'+tsv, delimiter = '\t', encoding='utf-8') + pand = dt.loc[:, columns] + pand.columns = new_columns + pand = pand.set_index('identifier') + pand.to_sql(name = 'Protein_Complex', con=self.engine, if_exists = 'append') + self.session.commit() + + + def add_interactions(self): + if not self.cache_dirname + '/intact.txt': path = urlretrieve(self.ENDPOINT_DOMAINS['intact']) zipped = zipfile.ZipFile(BytesIO(path[0])) @@ -70,18 +138,18 @@ def load_content(self): pand.to_sql(name = 'Protein_Interactions', con=self.engine, if_exists = 'replace', chunksize = 1000) self.session.commit() - # column_list = ["#ID(s) interactor A") , "ID(s) interactor B"), "Alt. ID(s) interactor A"),\ - # "Alt. ID(s) interactor B"), "Alias(es) interactor A"), "Alias(es) interactor B"),\ - # "Interaction detection method(s)"), "Publication 1st author(s)"), "Publication Identifier(s)"),\ - # "Taxid interactor A"), "Taxid interactor B"), "Interaction type(s)"), "Source database(s)"),\ - # "Interaction identifier(s)"), "Confidence value(s)"), "Expansion method(s)"), \ - # "Biological role(s) interactor A"), "Biological role(s) interactor B"), \ - # "Experimental role(s) interactor A"), "Experimental role(s) interactor B"), \ - # "Type(s) interactor A"), "Type(s) interactor B"), "Xref(s) interactor A"),\ - # "Xref(s) interactor B"), "Interaction Xref(s)"), "Annotation(s) interactor A"),\ - # "Annotation(s) interactor B"), "Interaction annotation(s)"), "Host organism(s)"), \ - # "Interaction parameter(s)"), "Creation date"), "Update date"), \ - # "Checksum(s) interactor A"), "Checksum(s) interactor B"), "Interaction Checksum(s)"),\ - # "Negative"), "Feature(s) interactor A"), "Feature(s) interactor B"),\ - # "Stoichiometry(s) interactor A"), "Stoichiometry(s) interactor B"), "Identification method participant A"), \ - # "Identification method participant B")] + # column_list = ["#ID(s) interactor A") , "ID(s) interactor B"), "Alt. ID(s) interactor A"),\ + # "Alt. ID(s) interactor B"), "Alias(es) interactor A"), "Alias(es) interactor B"),\ + # "Interaction detection method(s)"), "Publication 1st author(s)"), "Publication Identifier(s)"),\ + # "Taxid interactor A"), "Taxid interactor B"), "Interaction type(s)"), "Source database(s)"),\ + # "Interaction identifier(s)"), "Confidence value(s)"), "Expansion method(s)"), \ + # "Biological role(s) interactor A"), "Biological role(s) interactor B"), \ + # "Experimental role(s) interactor A"), "Experimental role(s) interactor B"), \ + # "Type(s) interactor A"), "Type(s) interactor B"), "Xref(s) interactor A"),\ + # "Xref(s) interactor B"), "Interaction Xref(s)"), "Annotation(s) interactor A"),\ + # "Annotation(s) interactor B"), "Interaction annotation(s)"), "Host organism(s)"), \ + # "Interaction parameter(s)"), "Creation date"), "Update date"), \ + # "Checksum(s) interactor A"), "Checksum(s) interactor B"), "Interaction Checksum(s)"),\ + # "Negative"), "Feature(s) interactor A"), "Feature(s) interactor B"),\ + # "Stoichiometry(s) interactor A"), "Stoichiometry(s) interactor B"), "Identification method participant A"), \ + # "Identification method participant B")] diff --git a/tests/data_source/test_intact.py b/tests/data_source/test_intact.py index 4cd3a2c..bae2d3c 100644 --- a/tests/data_source/test_intact.py +++ b/tests/data_source/test_intact.py @@ -21,13 +21,19 @@ def setUpClass(self): def tearDownClass(self): shutil.rmtree(self.cache_dirname) - def test_loading(self): + def test_add_complex(self): + q = self.intact.session.query(intact.ProteinComplex).get('EBI-1256672') + + self.assertEqual(q.name, 'INO80 chromatin remodeling complex') + self.assertEqual(q.ncbi, '559292') + self.assertEqual(q.evidence, 'intact:EBI-1212520') + + + def test_add_interactions(self): q = self.intact.session.query(intact.ProteinInteractions).filter_by(interactor_a = 'uniprotkb:P27986').count() self.assertEqual(q, 274) q = self.intact.session.query(intact.ProteinInteractions).filter_by(interactor_a = 'uniprotkb:Q61824').first() self.assertEqual(q.interactor_b, 'uniprotkb:Q60631') self.assertEqual(q.publications, 'pubmed:11127814|mint:MINT-5213342') - - #TODO: Add more Tests