In [88]:
import csv
import pandas as pd
from pygenprop.flat_file_parser import parse_genome_property_file
from pygenprop.results import assign_result_from_child_assignment_results, assign_property_result_from_required_steps, GenomePropertiesResults
from pygenprop.assignment_file_parser import parse_genome_property_longform_file

In [89]:
# Collect the value found in the fifth column (index 4) of every row of the
# InterProScan TSV output, keeping duplicates in file order.
# NOTE(review): presumably that column is the member-database signature
# accession — confirm against the InterProScan TSV column layout.
interpro_ids = []
# newline='' hands line-ending handling to the csv module, as its documentation recommends.
with open('/Users/lee/Google_Drive/Neufeld_Lab/Projects/ELA_GSB_test_data_for_Lee/Jackson_InterProScan_GenProp_Results/Updated_Fall_2018/interproscan_results/C_chlorochromatii_CaD3.tsv', newline='') as inteproscan_file:
    tsv_reader = csv.reader(inteproscan_file, delimiter='\t')

    for row in tsv_reader:
        # csv.reader yields [] for blank lines, and a truncated row may have
        # fewer than five fields; either would raise IndexError on row[4].
        if len(row) <= 4:
            continue

        hit_interpro_member_id = row[4]

        interpro_ids.append(hit_interpro_member_id)

In [90]:
# De-duplicate the collected hit identifiers; later cells only need set membership.
unique_interpro_ids = {*interpro_ids}

In [91]:
# Parse the Genome Properties flat file into the tree object used by the later cells.
genome_properties_flat_file_path = '/Users/lee/Dropbox/RandD/Repositories/genome-properties/flatfiles/genomeProperties.txt'
with open(genome_properties_flat_file_path) as flat_file_handle:
    genprop_tree = parse_genome_property_file(flat_file_handle)

In [92]:
# Assign a result to every step of every leaf property, then to the property
# itself, using the InterProScan identifiers in `unique_interpro_ids` as evidence.
#   step_table:       {property id: {step number: step result}}
#   property_results: {property id: property result}
step_table = {}
property_results = {}
for leaf in genprop_tree.leafs:
    for step in leaf.steps:
        step_functional_element_assignments = []
        for functional_element in step.functional_elements:
            functional_element_evidence_assignments = []
            if functional_element.evidence:
                for evidence in functional_element.evidence:
                    # An evidence counts as found when at least one of its
                    # identifiers appears among the InterProScan hits.
                    if unique_interpro_ids.isdisjoint(set(evidence.evidence_identifiers)):
                        evidence_result = 'NO'
                    else:
                        evidence_result = 'YES'

                    if evidence.sufficient and evidence_result == 'YES':
                        # A found sufficient evidence settles the functional
                        # element on its own: discard anything collected so far
                        # and stop examining further evidences.
                        functional_element_evidence_assignments = ['YES']
                        break

                    functional_element_evidence_assignments.append(evidence_result)
            else:
                # A functional element without any evidence cannot be supported.
                functional_element_evidence_assignments.append('NO')

            step_functional_element_assignments.append(
                assign_result_from_child_assignment_results(functional_element_evidence_assignments))
        step_result = assign_result_from_child_assignment_results(step_functional_element_assignments)

        # Create the per-property step mapping on first use, then record the step.
        step_table.setdefault(leaf.id, {})[step.number] = step_result

    required_step_numbers = [step.number for step in leaf.steps if step.required]

    if len(required_step_numbers) > 0:
        required_step_results = [step_table[leaf.id][step_number] for step_number in required_step_numbers]
        current_property_result = assign_property_result_from_required_steps(required_step_results, leaf.threshold)
    else:
        # list(...) expands the dict view into the individual step results.
        # Wrapping the view in [...] instead would pass a one-element list whose
        # only item is the view object, hiding the real step results.
        all_step_results = list(step_table[leaf.id].values())
        current_property_result = assign_result_from_child_assignment_results(all_step_results)

    property_results[leaf.id] = current_property_result

In [93]:
# For each property, pair its result with the numbers of the steps whose result
# is 'YES' or 'PARTIAL'.
property_result_dict = {
    genprop_id: {
        'result': genprop_result,
        'supported_steps': [
            number
            for number, outcome in step_table[genprop_id].items()
            if outcome in ('YES', 'PARTIAL')
        ],
    }
    for genprop_id, genprop_result in property_results.items()
}
# The sample name is stored under its own key next to the per-property entries.
property_result_dict['sample_name'] = 'C_chlorochromatii_CaD3_InterProScanTSV'

In [94]:
# Wrap the InterProScan-derived assignments in a results object bound to the parsed tree.
interpro_file_results = GenomePropertiesResults(
    property_result_dict,
    genome_properties_tree=genprop_tree,
)

In [95]:
# Parse the long-form assignment file for the same sample and wrap it in a
# results object bound to the same tree, so both sources can be compared.
longform_assignment_path = '/Users/lee/Google_Drive/Neufeld_Lab/Projects/ELA_GSB_test_data_for_Lee/Jackson_InterProScan_GenProp_Results/Updated_Fall_2018/genome_properties/C_chlorochromatii_CaD3'
with open(longform_assignment_path) as longform_handle:
    file_assignment_results = parse_genome_property_longform_file(longform_handle)
    genome_properties_file_result = GenomePropertiesResults(
        file_assignment_results,
        genome_properties_tree=genprop_tree,
    )

In [96]:
# Put the property-level results of both sources side by side.
property_comparison_result = pd.concat(
    [
        interpro_file_results.property_results,
        genome_properties_file_result.property_results,
    ],
    axis='columns',
    sort=False,
)

In [97]:
# Put the step-level results of both sources side by side.
step_comparison_result = pd.concat(
    [
        interpro_file_results.step_results,
        genome_properties_file_result.step_results,
    ],
    axis='columns',
    sort=False,
)

In [98]:
# Flip the comparison table so that each property becomes a column.
property_transposed = property_comparison_result.T

In [99]:
# Keep only the properties whose column does not hold exactly one distinct value,
# then flip back so that properties are rows again.
# NOTE(review): nunique() ignores NaN by default, so a property missing from one
# source is not listed here — confirm that is intended.
property_transposed.loc[:, property_transposed.nunique() != 1].transpose()

Unnamed: 0,C_chlorochromatii_CaD3_InterProScanTSV,C_chlorochromatii_CaD3
GenProp0630,NO,PARTIAL
GenProp0218,NO,PARTIAL
GenProp0617,NO,PARTIAL
GenProp0046,NO,YES
GenProp0971,NO,YES
GenProp0970,PARTIAL,YES
GenProp0125,NO,YES
GenProp0199,NO,YES
GenProp0305,NO,YES
GenProp0729,NO,PARTIAL


In [100]:
# Flip the step comparison table so that each (property, step) pair becomes a column.
step_transposed = step_comparison_result.T

In [101]:
# Keep only the steps whose column does not hold exactly one distinct value,
# then flip back so that steps are rows again.
# NOTE(review): nunique() ignores NaN by default, so a step missing from one
# source is not kept — confirm that is intended.
filtered_steps = step_transposed.loc[:, step_transposed.nunique() != 1].transpose()

In [102]:
# Distinct first elements of the (identifier, value) pairs in the filtered step index.
bad_step_ids = {ident for ident, _ in filtered_steps.index.values}

In [103]:
# Display the step-level rows kept by the filter above (steps whose results differ).
filtered_steps

Unnamed: 0_level_0,Unnamed: 1_level_0,C_chlorochromatii_CaD3_InterProScanTSV,C_chlorochromatii_CaD3
Genome_Property_ID,Step_Number,Unnamed: 2_level_1,Unnamed: 3_level_1
GenProp0029,2,PARTIAL,YES
GenProp0029,3,NO,YES
GenProp0029,7,NO,YES
GenProp0029,8,NO,YES
GenProp0029,14,NO,YES
GenProp0033,1,NO,YES
GenProp0033,2,NO,YES
GenProp0033,3,NO,YES
GenProp0033,4,NO,YES
GenProp0033,5,NO,YES
