In [1]:
from pygenprop.database_file_parser import parse_genome_property_file
from pygenprop.results import GenomePropertiesResults
from pygenprop.assignment_file_parser import parse_genome_property_longform_file, parse_interproscan_file
import pandas as pd

In [2]:
def generate_differential_assignments(*results):
    """
    Find the assignments which differ between two or more sets of results.

    The results are placed side by side, aligned on their indexes, and only the rows whose values
    are not identical across every column are kept.

    :param results: Two or more pandas DataFrames (or Series) of assignments to compare.
    :return: A DataFrame holding only the rows whose assignments differ, in their original order.
    """
    comparison = pd.concat(results, axis=1, sort=False)

    # Count the distinct assignments in each row directly instead of transposing the frame twice and
    # looping over every column in Python. nunique() ignores NaN, so a row that is missing from one
    # input but otherwise identical still counts as a single value and is not reported.
    distinct_assignments_per_row = comparison.nunique(axis=1)

    # Keep every row that does not collapse to exactly one distinct assignment.
    return comparison[(distinct_assignments_per_row != 1).values]

In [3]:
# Location of the genome properties database flat file.
genome_properties_database_path = '/Users/lee/Dropbox/RandD/Repositories/genome-properties/flatfiles/genomeProperties.txt'

# Parse the flat file into the genome properties tree used by the cells below.
with open(genome_properties_database_path) as genome_properties_file:
    genprop_tree = parse_genome_property_file(genome_properties_file)

In [4]:
# Longform genome properties assignment file for C_chlorochromatii_CaD3.
longform_assignment_path = '/Users/lee/Google_Drive/Neufeld_Lab/Projects/ELA_GSB_test_data_for_Lee/Jackson_InterProScan_GenProp_Results/Updated_Fall_2018/genome_properties/C_chlorochromatii_CaD3'

with open(longform_assignment_path) as assignment_file:
    file_assignment_results = parse_genome_property_longform_file(assignment_file)

    # Build the property and step results for this sample against the parsed tree.
    genome_properties_file_result = GenomePropertiesResults(
        file_assignment_results,
        genome_properties_tree=genprop_tree,
    )

In [5]:
# InterProScan TSV output for the same genome.
interproscan_tsv_path = '/Users/lee/Google_Drive/Neufeld_Lab/Projects/ELA_GSB_test_data_for_Lee/Jackson_InterProScan_GenProp_Results/Updated_Fall_2018/interproscan_results/C_chlorochromatii_CaD3.tsv'

with open(interproscan_tsv_path) as assignment_file:
    file_assignment_results = parse_interproscan_file(assignment_file)

    # Give this sample its own name so its column is distinguishable when results are compared.
    file_assignment_results.sample_name = "C_chlorochromatii_CaD3_InterProScan"

    inteproscan_file_result = GenomePropertiesResults(
        file_assignment_results,
        genome_properties_tree=genprop_tree,
    )

In [6]:
len(genprop_tree) # Number of genome properties in the tree parsed from the database flat file.

1286

In [7]:
len(genome_properties_file_result.property_results) # Number of property assignments obtained from the longform genome properties assignment file.

1286

In [8]:
len(inteproscan_file_result.property_results) # Number of property assignments obtained from the InterProScan TSV file.

1286

In [9]:
# Property-level assignments that disagree between the longform file and the InterProScan-derived results.
differential_property_assignments = generate_differential_assignments(
    genome_properties_file_result.property_results,
    inteproscan_file_result.property_results,
)

In [10]:
len(differential_property_assignments) # Number of properties whose assignment differs between the InterProScan-derived results and the genome properties assignment file.

37

In [11]:
# Step-level assignments that disagree between the longform file and the InterProScan-derived results.
differential_step_assignments = generate_differential_assignments(
    genome_properties_file_result.step_results,
    inteproscan_file_result.step_results,
)

In [12]:
len(differential_step_assignments) # Number of steps whose assignment differs between the InterProScan-derived results and the genome properties assignment file.

87

In [13]:
len(inteproscan_file_result.step_results) # Total number of step assignments in the InterProScan-derived results.

6525

In [14]:
# Identifiers of every leaf property in the genome properties tree.
global_leaf_genome_property_ids = set(leaf.id for leaf in genprop_tree.leafs)

# Leaf properties whose property-level assignment differs between the two sources.
differential_genome_property_leaf_ids = global_leaf_genome_property_ids.intersection(
    differential_property_assignments.index.tolist()
)

In [15]:
# Row mask selecting the differing property assignments that belong to leaf properties.
is_differential_leaf_property = differential_property_assignments.index.get_level_values(0).isin(
    differential_genome_property_leaf_ids
)
differential_leaf_genome_property_assignments = differential_property_assignments[is_differential_leaf_property]

In [16]:
len(differential_leaf_genome_property_assignments) # Number of differing leaf property assignments between InterProScan file and genome properties assignment file.

22

In [17]:
# Row mask selecting the differing steps whose parent property (index level 0) is a differing leaf property.
# NOTE(review): the filter uses the leaf IDs whose *property* assignment differs, not every leaf ID, so
# differing steps of leaf properties whose overall assignment agrees are excluded - confirm this is intended.
step_belongs_to_differential_leaf = differential_step_assignments.index.get_level_values(0).isin(
    differential_genome_property_leaf_ids
)
differential_leaf_step_assignments = differential_step_assignments[step_belongs_to_differential_leaf]

In [18]:
len(differential_leaf_step_assignments) # Number of differing step assignments belonging to the differing leaf properties.

20

### Differential steps are due to the Genome Properties Perl code improperly assigning YES to steps that have multiple insufficient evidences, some of which are missing.
See: https://github.com/ebi-pf-team/genome-properties/issues/30

In [19]:
# Display the differing step assignments of the differing leaf properties, one column per source.
differential_leaf_step_assignments

Unnamed: 0_level_0,Unnamed: 1_level_0,C_chlorochromatii_CaD3,C_chlorochromatii_CaD3_InterProScan
Genome_Property_ID,Step_Number,Unnamed: 2_level_1,Unnamed: 3_level_1
GenProp0457,6,YES,NO
GenProp0457,7,YES,NO
GenProp0458,2,YES,NO
GenProp0685,1,YES,NO
GenProp0701,1,YES,NO
GenProp0715,2,YES,NO
GenProp0724,8,YES,NO
GenProp0750,1,YES,NO
GenProp0750,3,YES,NO
GenProp0754,1,YES,NO


In [20]:
# Display the differing leaf property assignments, one column per source.
differential_leaf_genome_property_assignments

Unnamed: 0,C_chlorochromatii_CaD3,C_chlorochromatii_CaD3_InterProScan
GenProp0457,PARTIAL,NO
GenProp0458,PARTIAL,NO
GenProp0617,PARTIAL,NO
GenProp0685,PARTIAL,NO
GenProp0701,PARTIAL,NO
GenProp0715,PARTIAL,NO
GenProp0724,PARTIAL,NO
GenProp0750,YES,PARTIAL
GenProp0754,YES,NO
GenProp0756,PARTIAL,NO


In [21]:
# Leaf properties whose property-level assignment differs.
leaf_property_ids_with_differences = set(differential_leaf_genome_property_assignments.index.tolist())

# Parent property of every differing leaf step (first element of each index entry).
property_ids_with_step_differences = {
    property_id for property_id, step_number in differential_leaf_step_assignments.index.tolist()
}

# NOTE(review): isdisjoint() is True only when *no* differing leaf property has a differing step.
# A False result means at least one overlaps; it does not show that every differing property does.
differential_property_assignments_not_caused_by_leaves = leaf_property_ids_with_differences.isdisjoint(
    property_ids_with_step_differences
)

In [22]:
# Display the boolean computed by the isdisjoint() check on the differing leaf property IDs and differing step parent IDs.
differential_property_assignments_not_caused_by_leaves

False

In [26]:
# Summarise each mismatch count as a percentage of the matching InterProScan-derived result count.
# A fixed-point format is used because '{:1.2}' (no type code) formats to two significant digits and
# switches to exponent notation once a value reaches 10 (e.g. 12.3 is rendered as '1.2e+01').
total_property_count = len(inteproscan_file_result.property_results)
total_step_count = len(inteproscan_file_result.step_results)

print('Missmatched Properties: {:.2f}%'.format(len(differential_property_assignments) / total_property_count * 100))
print('Missmatched Steps: {:.2f}%'.format(len(differential_step_assignments) / total_step_count * 100))
print('Missmatched Leaf Properties: {:.2f}%'.format(len(differential_leaf_genome_property_assignments) / total_property_count * 100))
print('Missmatched Leaf Steps: {:.2f}%'.format(len(differential_leaf_step_assignments) / total_step_count * 100))
print('Missmatched Leaf Properties Not Caused By Missmatched Steps: {}'.format(differential_property_assignments_not_caused_by_leaves))

Missmatched Properties: 2.9%
Missmatched Steps: 1.3%
Missmatched Leaf Properties: 1.7%
Missmatched Leaf Steps: 0.31%
Missmatched Leaf Properties Not Caused By Missmatched Steps: False
