In [1]:
from collections import Counter
import glob
import pandas as pd
import re
import subprocess
import matplotlib as mpl
mpl.use('Agg') 
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Analyze one csv:

In [2]:
tsv_files = []
for filename in glob.iglob('../data_mining_Neo4j_v2_3_2/databases/*.tsv'):
    print(filename)
    tsv_files.append(filename)

../data_mining_Neo4j_v2_3_2/databases/db_binary_0.010000.tsv
../data_mining_Neo4j_v2_3_2/databases/db_binary_0.020000.tsv
../data_mining_Neo4j_v2_3_2/databases/db_binary_0.030000.tsv
../data_mining_Neo4j_v2_3_2/databases/db_binary_0.040000.tsv
../data_mining_Neo4j_v2_3_2/databases/db_binary_0.050000.tsv
../data_mining_Neo4j_v2_3_2/databases/db_binary_0.060000.tsv
../data_mining_Neo4j_v2_3_2/databases/db_binary_0.070000.tsv


In [3]:
cc = pd.read_csv(tsv_files[0], 
                 usecols=[1, 2, 3, 4], 
                 sep='\t')

In [4]:
cc.head()

Unnamed: 0,organism,gene,gene_product,ConnectedComponents
0,Methylobacter-123 (UID203),113410,hypothetical protein,0
1,Methylobacter-123 (UID203),10311,hypothetical protein,0
2,Methylotenera mobilis-49 (UID203),11135,hypothetical protein,0
3,Methylotenera mobilis-49 (UID203),10266,type IV pilus assembly protein PilA,0
4,Methylotenera mobilis-49 (UID203),12085,hypothetical protein,0


In [5]:
def assess_connected_components_tsv(tsv):
    results = dict()
    
    # general characteristics:
    results['# nodes in cc'] = tsv.shape[0]  # number of nodes in connected components
    results['# organisms in cc'] = len(tsv['organism'].unique().tolist())
    
    num_components = len(tsv['ConnectedComponents'].unique().tolist())
    print("num unique connected components: {}".format(num_components))
    results['# of components'] = num_components
    
    results['organism counts'] = dict(Counter(tsv['organism'])) #dict(Counter(tsv['organism']))
    
    return results


In [6]:
def assess_connected_components_tsvs(cc_files):
    summary = pd.DataFrame()
    for cc in cc_files:
        tsv = pd.read_csv(cc, usecols=[1, 2, 3, 4], sep='\t')
        print(tsv.shape)
        info_dict = assess_connected_components_tsv(tsv)
        
        # get file name
        m = re.search('/(db_binary_[.0-9]+.tsv)', cc) #.groups(1)
        info_dict['file'] = m.group(1)
        
        # get ready for Pandas
        for k, v in info_dict.items():
            info_dict[k] = [v]
        print(info_dict)
        info_df_row = pd.DataFrame(info_dict)
        summary = pd.concat([summary, info_df_row], axis=0)
        print('summary shape: {}'.format(summary.shape))
    print(summary)
    print(summary.shape[0])
    return summary

In [7]:
assess_connected_components_tsvs(tsv_files)

(228, 4)
num unique connected components: 3
{'# of components': [3], 'file': ['db_binary_0.010000.tsv'], 'organism counts': [{'Methylotenera mobilis-49 (UID203)': 24, 'Methylobacter-123 (UID203)': 204}], '# nodes in cc': [228], '# organisms in cc': [2]}
summary shape: (1, 5)
(215, 4)
num unique connected components: 3
{'# of components': [3], 'file': ['db_binary_0.020000.tsv'], 'organism counts': [{'Methylotenera mobilis-49 (UID203)': 24, 'Methylobacter-123 (UID203)': 191}], '# nodes in cc': [215], '# organisms in cc': [2]}
summary shape: (2, 5)
(79, 4)
num unique connected components: 6
{'# of components': [6], 'file': ['db_binary_0.030000.tsv'], 'organism counts': [{'Methylotenera mobilis-49 (UID203)': 23, 'Methylobacter-123 (UID203)': 56}], '# nodes in cc': [79], '# organisms in cc': [2]}
summary shape: (3, 5)
(36, 4)
num unique connected components: 6
{'# of components': [6], 'file': ['db_binary_0.040000.tsv'], 'organism counts': [{'Methylotenera mobilis-49 (UID203)': 18, 'Methylob

Unnamed: 0,# nodes in cc,# of components,# organisms in cc,file,organism counts
0,228,3,2,db_binary_0.010000.tsv,"{'Methylotenera mobilis-49 (UID203)': 24, 'Met..."
0,215,3,2,db_binary_0.020000.tsv,"{'Methylotenera mobilis-49 (UID203)': 24, 'Met..."
0,79,6,2,db_binary_0.030000.tsv,"{'Methylotenera mobilis-49 (UID203)': 23, 'Met..."
0,36,6,2,db_binary_0.040000.tsv,"{'Methylotenera mobilis-49 (UID203)': 18, 'Met..."
0,12,4,2,db_binary_0.050000.tsv,"{'Methylotenera mobilis-49 (UID203)': 7, 'Meth..."
0,10,4,2,db_binary_0.060000.tsv,"{'Methylotenera mobilis-49 (UID203)': 6, 'Meth..."
0,4,2,2,db_binary_0.070000.tsv,"{'Methylotenera mobilis-49 (UID203)': 2, 'Meth..."


In [8]:
def assess_sub_graphs(cc_files):
    summary = pd.DataFrame()
    for cc_file in cc_files: 
        
        tsv = pd.read_csv(cc_file, usecols=[1, 2, 3, 4], sep='\t')
        components = dict(Counter(tsv['ConnectedComponents'])) 
        for c in components.keys():
            print(c)
            c_info = dict()
            c_info['ConnectedComponent'] = c
            c_info['Cutoff'] = None  # TODO: parse from file name. 
            
            nodes = tsv[tsv['ConnectedComponents'] == c]
            species_counts = dict(Counter(tsv['organism']))
            c_info['nodes'] = nodes.shape[0]
            c_info['species counts'] = species_counts
            
            c_info['cross-species'] = len(species_counts.keys()) > 1
            
            # get file name
            m = re.search('/(db_binary_[.0-9]+.tsv)', cc_file) #.groups(1)
            c_info['file'] = m.group(1)
            
            for k, v in c_info.items():
                c_info[k] = [v]
            print(c_info)
        
            c_info = pd.DataFrame(c_info)
            summary = pd.concat([summary, c_info], axis=0)
        print('summary shape: {}'.format(summary.shape))
    print(summary)
    print(summary.shape[0])
    return summary

In [9]:
connected_components = assess_sub_graphs(tsv_files)

0
{'ConnectedComponent': [0], 'nodes': [224], 'file': ['db_binary_0.010000.tsv'], 'species counts': [{'Methylotenera mobilis-49 (UID203)': 24, 'Methylobacter-123 (UID203)': 204}], 'Cutoff': [None], 'cross-species': [True]}
181
{'ConnectedComponent': [181], 'nodes': [2], 'file': ['db_binary_0.010000.tsv'], 'species counts': [{'Methylotenera mobilis-49 (UID203)': 24, 'Methylobacter-123 (UID203)': 204}], 'Cutoff': [None], 'cross-species': [True]}
174
{'ConnectedComponent': [174], 'nodes': [2], 'file': ['db_binary_0.010000.tsv'], 'species counts': [{'Methylotenera mobilis-49 (UID203)': 24, 'Methylobacter-123 (UID203)': 204}], 'Cutoff': [None], 'cross-species': [True]}
summary shape: (3, 6)
0
{'ConnectedComponent': [0], 'nodes': [211], 'file': ['db_binary_0.020000.tsv'], 'species counts': [{'Methylotenera mobilis-49 (UID203)': 24, 'Methylobacter-123 (UID203)': 191}], 'Cutoff': [None], 'cross-species': [True]}
181
{'ConnectedComponent': [181], 'nodes': [2], 'file': ['db_binary_0.020000.tsv']

In [10]:
connected_components.head()

Unnamed: 0,ConnectedComponent,Cutoff,cross-species,file,nodes,species counts
0,0,,True,db_binary_0.010000.tsv,224,"{'Methylotenera mobilis-49 (UID203)': 24, 'Met..."
0,181,,True,db_binary_0.010000.tsv,2,"{'Methylotenera mobilis-49 (UID203)': 24, 'Met..."
0,174,,True,db_binary_0.010000.tsv,2,"{'Methylotenera mobilis-49 (UID203)': 24, 'Met..."
0,0,,True,db_binary_0.020000.tsv,211,"{'Methylotenera mobilis-49 (UID203)': 24, 'Met..."
0,181,,True,db_binary_0.020000.tsv,2,"{'Methylotenera mobilis-49 (UID203)': 24, 'Met..."


In [11]:
connected_components.plot.scatter(x=)

SyntaxError: invalid syntax (<ipython-input-11-c5ffa0ca1977>, line 1)

In [None]:
assess_connected_components_tsv(cc)

In [None]:
pd.DataFrame.from_dict(assess_connected_components_tsv(cc))

In [None]:
def assess_component(cc):
    # Metrics for a signle connected component.
    # return counts of each organism,
    # entropy (?)
    pass