# Reformat dataset for GitHub

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the tab-separated table, using its first column as the row index,
# then display its (rows, columns) dimensions.
df = pd.read_csv('place/fine_all.tsv', sep='\t', index_col=0)
df.shape

(5380, 28)

Metrics

In [3]:
# Derived metrics, all computed from the *_gmean columns of df.
df['vsratio'] = df['volume_gmean'].div(df['surface_gmean'])      # volume / surface
df['svratio'] = df['surface_gmean'].div(df['volume_gmean'])      # surface / volume
df['aspect_ratio'] = df['length_gmean'].div(df['width_gmean'])   # length / width
# Diameter d of the sphere whose volume equals V: V = pi * d**3 / 6  =>  d = cbrt(6 * V / pi).
df['spherical_equivalent_diameter'] = np.cbrt(df['volume_gmean'].mul(6).div(np.pi))

In [4]:
# Base-10 logarithm of the volume-to-surface ratio.
df['log_vsratio'] = df['vsratio'].pipe(np.log10)

Get species with genomic information

In [5]:
# Keep the rows whose 'node' value contains the letter 'G'. The copy lets
# columns be added to df1 later without touching df.
df1 = df.loc[df['node'].str.contains('G')].copy()

Because of our methodology (phylogenetic placement), some species have been assigned to the same genome. We therefore need to remove these duplicates and keep only those species whose taxonomic name matches the genome name.

In [6]:
# Load the tab-separated ranks table, using its first column as the row index.
dft = pd.read_csv('../phylogeny/tax2tree/filled_ranks.tsv', sep='\t', index_col=0)

In [7]:
# For every node, fetch the 'species' entry of the matching row in dft;
# nodes that are not present in dft's index get NaN instead.
df1['is_species'] = df1['node'].apply(
    lambda node: np.nan if node not in dft.index else dft.loc[node, 'species']
)

In [8]:
# Keep only the rows whose own 'species' label equals the species looked up
# for their node ('is_species'); rows with NaN in 'is_species' never match.
df1 = df1.loc[df1['species'].eq(df1['is_species'])]

In [9]:
# Display the (rows, columns) dimensions of df1.
df1.shape

(1363, 34)

In [10]:
# Index labels of the df1 rows whose 'species' equals 'is_species'.
sps_genomic_info = df1.loc[df1['species'].eq(df1['is_species'])].index

Create a new dataframe

In [11]:
# Columns of df carried over to the exported table, in output order.
columns = [
    # source labels and measurement ranges
    'sources', 'length_ranges', 'width_ranges',
    # *_gmean measurements
    'length_gmean', 'width_gmean', 'volume_gmean', 'surface_gmean',
    # derived metrics
    'spherical_equivalent_diameter', 'svratio', 'vsratio', 'log_vsratio',
    # taxonomic ranks
    'species', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom',
]

In [12]:
# Independent copy of df restricted to the selected columns.
df2 = df.loc[:, columns].copy()

Add metadata only if genome is available in WoL

In [13]:
# Genome-derived metadata is copied from df only for the rows listed in
# sps_genomic_info; every other row gets NaN in these columns.
has_genome = df.index.isin(sps_genomic_info)

# Output column in df2 -> source column in df.
# NOTE(review): 'ENCprime' was previously filled from df['coding'], which
# duplicated the 'coding' column under the wrong name. It now reads
# df['ENCprime'] -- assumes df has a column with that name; confirm.
genome_columns = {
    'genome': 'node',
    'genome_size': 'genome',
    'gc': 'gc',
    'proteins': 'proteins',
    'coding': 'coding',
    'rrnas': 'rrnas',
    'ENCprime': 'ENCprime',
}

for target, source in genome_columns.items():
    # One vectorised mask per column instead of a per-row df.loc lookup;
    # the result is aligned to df2 through the shared index.
    df2[target] = df[source].where(has_genome)

In [14]:
# Drop the '_gmean' suffix from the four measurement columns.
df2 = df2.rename(columns={
    'length_gmean': 'length',
    'width_gmean': 'width',
    'volume_gmean': 'volume',
    'surface_gmean': 'surface',
})

In [15]:
# Display the column labels of df2.
df2.columns

Index(['sources', 'length_ranges', 'width_ranges', 'length', 'width', 'volume',
       'surface', 'spherical_equivalent_diameter', 'svratio', 'vsratio',
       'log_vsratio', 'species', 'genus', 'family', 'order', 'class', 'phylum',
       'kingdom', 'genome', 'genome_size', 'gc', 'proteins', 'coding', 'rrnas',
       'ENCprime'],
      dtype='object')

In [16]:
# Count how many rows of df carry each distinct 'sources' value.
df['sources'].value_counts()

sources
bacdive         4546
bm4              316
bm5              252
pcc               94
bm1               91
pubmed            43
bacdive, bm5      26
pubmed, bm4        6
bacdive, bm4       4
pubmed, bm5        1
bacdive, bm1       1
Name: count, dtype: int64

Save dataframe

In [17]:
# Write df2 as a tab-separated file; the row index is written as the first column.
df2.to_csv('../dataset/dataset.tsv', sep='\t', index=True)

In [19]:
# Display the (rows, columns) dimensions of df2.
df2.shape

(5380, 25)