In [1]:
class GenomeProperty(object):
    def __init__(self, accession_id, name, property_type, threshold=0, 
                 parent=None, references=[], databases=[], steps=[], 
                 public=False, description=None, private_notes=None):
        self.id = accession_id
        self.name = name
        self.type = property_type
        self.threshold = threshold
        self.references = references
        self.databases = databases
        self.parent = parent
        self.steps = steps
        self.public = public
        self.description = description
        self.private_notes = private_notes
        
    def __repr__(self):
        has_references = False
        has_steps = False
        has_databases = False
        
        if self.references:
            has_references = True
        
        if self.steps:
            has_steps = True
            
        if self.databases:
            has_databases = True
            
        repr_data = [str(self.id),
                     'Type: ' + str(self.type),
                     'Name: ' + str(self.name),
                     'Thresh: ' + str(self.threshold),
                     'References: ' + str(has_references),
                     'Databases: ' + str(self.type),
                     'Steps: ' + str(has_steps),
                     'Parent: ' + str(has_databases),
                     'Public: ' + str(self.public)]
        
        return ', '.join(repr_data)

In [2]:
class Reference(object):
    def __init__(self, number, pubmed_id, title, authors, citation):
        self.number = int(number)
        self.pubmed_id = pubmed_id
        self.title = title
        self.authors = authors
        self.citation = citation
    
    def __repr__(self):
        repr_data = ['Ref ' + str(self.number),
                     'Pubmed ID: ' + str(self.pubmed_id),
                     'Title: ' + str(self.title),
                     'Authors: ' + str(self.authors),
                     'Citation: ' + str(self.citation)]
        return ', '.join(repr_data)

In [3]:
class Database(object):
    def __init__(self, title, database_name, records):
        self.title = title
        self.database_name = database_name
        self.records = records
        
    def __repr__(self):
        repr_data = ['Title: ' + str(self.title), 
                     'DB_Name: ' + str(self.database_name), 
                     'DB_Records: ' + str(self.records)]
        return ', '.join(repr_data)

In [4]:
class Step(object):
    def __init__(self, number, indentifier, name, evidence, gene_ontology_id, required=False, sufficient=False):
        self.number = int(number)
        self.id = indentifier
        self.name = name
        self.evidence = evidence
        self.gene_ontology_id = gene_ontology_id
        self.required = required
        self.sufficient = sufficient
    
    def __repr__(self):
        repr_data = ['Step ' + str(self.number),
                     'ID: ' + str(self.id),
                     'Name: ' + str(self.name),
                     'Evidences: ' + str(self.evidence),
                     'Gene Ontology IDs: ' + str(self.gene_ontology_id),
                     'Required: ' + str(self.required),
                     'Sufficient: ' + str(self.sufficient)]
        return ', '.join(repr_data)

In [5]:
def parse_gen_prop_row(row):
    columns = row.split('  ')
    marker = columns[0].strip()
    content = ''.join(columns[1:]).rstrip()
    return marker, content

In [6]:
def collapse_gen_prop_rows(gen_prop_rows):
    collapsed_markers = []

    trailing_marker_content = []
    previous_marker = gen_prop_rows[0][0]
    for marker, content in gen_prop_rows:
        if marker == previous_marker:
            trailing_marker_content.append(content)
        else:
            collapsed_marker_content = ' '.join(trailing_marker_content)
            new_collapsed_marker = (previous_marker, collapsed_marker_content)
            
            collapsed_markers.append(new_collapsed_marker)
            
            previous_marker = marker
            trailing_marker_content = [content]
            
    final_collapsed_markers = collapse_step_evidence_and_gene_ontologys(collapsed_markers)
    return final_collapsed_markers

In [7]:
def collapse_step_evidence_and_gene_ontologys(gen_prop_rows):
    current_evidence = []
    current_gos = []
    final_gen_props = []
    for marker, content in gen_prop_rows:
        if marker == '--':
            if current_evidence:
                final_gen_props.append(('EV', ' '.join(current_evidence)))
                current_evidence = []
            if current_gos:
                final_gen_props.append(('TG', ' '.join(current_gos)))
                current_gos = []
            final_gen_props.append(('--',''))
        else:
            if marker == 'EV':
                current_evidence.append(content)
            elif marker == 'TG':
                current_gos.append(content)
            else:
                final_gen_props.append((marker,content))
                
    return final_gen_props

In [8]:
def collect_references(gen_prop_rows):
    reference_markers = ['RN', 'RM', 'RT', 'RA', 'RL']
    
    references = []
    current_reference = {}
    has_references=False
    for marker, content in gen_prop_rows:
        if marker in reference_markers:
            if marker in current_reference:
                references.append(Reference(number=current_reference.get('RN'), pubmed_id=current_reference.get('RM'), 
                                            title=current_reference.get('RT'), authors=current_reference.get('RA'), 
                                            citation=current_reference.get('RL')))
                current_reference = {}
                current_reference[marker] = content
            else:
                if marker == 'RN':
                    content = int(content.strip('[]'))

                current_reference[marker] = content

    references.append(Reference(number=current_reference.get('RN'), pubmed_id=current_reference.get('RM'), 
                                title=current_reference.get('RT'), authors=current_reference.get('RA'), 
                                citation=current_reference.get('RL')))
    return references

In [9]:
def parse_genome_property_rows(gen_prop_rows):
    core_markers = ['AC', 'DE', 'TP', 'TH', 'PN', 'CC', '**']
    gathered_core_markers = {}
    unique_markers = {}
    
    has_references = False
    has_databases = False
    
    for marker, content in gen_prop_rows:
        if marker == 'RN':
            has_references = True
        elif marker == 'DC':
            has_databases = True
        elif marker in core_markers:
            if marker == 'TH':
                content = int(content)
            gathered_core_markers[marker] = content
    
    if has_references:
        references = collect_references(gen_prop_rows)
    else:
        references=[]
    if has_databases:
        databases = collect_databases(gen_prop_rows)
    else:
        databases = []
        
    steps = collect_steps(gen_prop_rows)
    
    new_genome_property = GenomeProperty(accession_id=gathered_core_markers.get('AC'),
                                         name=gathered_core_markers.get('DE'),
                                         property_type=gathered_core_markers.get('TP'),
                                         threshold=gathered_core_markers.get('TH'),
                                         parent=gathered_core_markers.get('PN'),
                                         description=gathered_core_markers.get('CC'),
                                         private_notes=gathered_core_markers.get('**'),
                                         references=references,
                                         databases=databases,
                                         steps=steps)
    return new_genome_property

In [10]:
def collect_steps(gen_prop_rows):
    step_markers = ['SN','ID', 'DN', 'RQ','EV','TG']

    steps = []
    current_step = {}
    for marker, content in gen_prop_rows:
        if marker in step_markers:
            if marker in current_step:
                steps.append(Step(number=current_step.get('SN'), indentifier=current_step.get('ID'), 
                                  name=current_step.get('DN'), evidence=current_step.get('EV'), 
                                  gene_ontology_id=current_step.get('TG'), required=current_step.get('RQ'), 
                                  sufficient=current_step.get('SF')))
                current_step = {} 
                current_step[marker] = content
            else:
                if marker == 'SN':
                    content = int(content)
                elif marker == 'EV':
                    split_content = filter(None, content.split(';'))
                    cleaned_content = list(map(lambda evidence: evidence.strip(), split_content))
                    if 'sufficient' in cleaned_content:
                        current_step['SF'] = True
                    else:
                        current_step['SF'] = False   
                    content = [evidence for evidence in cleaned_content if evidence != 'sufficient']
                elif marker == 'RQ':
                    if int(content) == 1:
                        content = True
                    else:
                        content = False

                current_step[marker] = content

    steps.append(Step(number=current_step.get('SN'), indentifier=current_step.get('ID'), 
                      name=current_step.get('DN'), evidence=current_step.get('EV'), 
                      gene_ontology_id=current_step.get('TG'), required=current_step.get('RQ'), 
                      sufficient=current_step.get('SF')))
    
    return steps

In [11]:
def collect_databases(gen_prop_rows):
    database_markers = ['DC', 'DR']

    databases = []
    current_database = {}
    for marker, content in gen_prop_rows:
        if marker in database_markers:
            if marker in current_database:
                databases.append(Database(title=current_database.get('DC'), 
                                          database_name=current_database.get('DN'),
                                          records=current_database.get('DI')))

                current_database = {}
                current_database[marker] = content
            else:
                if marker == 'DR':
                    split_content = filter(None, content.split(';'))
                    cleaned_content = list(map(lambda evidence: evidence.strip(), split_content))
                    database_name = cleaned_content[0]
                    database_records = cleaned_content[1:]
                    current_database['DN'] = database_name
                    current_database['DI'] = database_records

                current_database[marker] = content

    databases.append(Database(title=current_database.get('DC'), 
                              database_name=current_database.get('DN'),
                              records=current_database.get('DI')))
    return databases

In [12]:
def parse_gen_prop_file(gen_prop_flat_path):
    gen_props = []
    current_gen_prop_rows = []
    with open(gen_prop_flat_path, 'r') as gen_prop_file:
        for line in gen_prop_file:
            if not line.strip() == '//':
                current_gen_prop_rows.append(parse_gen_prop_row(line))
            else:
                gen_props.append(parse_genome_property_rows(collapse_gen_prop_rows(current_gen_prop_rows)))
                current_gen_prop_rows = []
    
    return gen_props

In [13]:
import os

In [14]:
global_gen_prop_dir = './data'
individual_gen_prop_dirs = [directory for directory in os.listdir(global_gen_prop_dir) if 'GenProp' in directory]

In [18]:
gen_props = {}
for directory_name in individual_gen_prop_dirs:
    gen_prop_id = directory_name.strip()
    description_file_path = os.path.join(global_gen_prop_dir, gen_prop_id, 'DESC')
    status_file_path = os.path.join(global_gen_prop_dir, gen_prop_id, 'status')
    gen_props[gen_prop_id] = parse_gen_prop_file(description_file_path)[0]

ValueError: invalid literal for int() with base 10: '[2]'