## Loading libraries of interest

In [1]:
# Plotting libraries
import matplotlib.pyplot as plt

# The famous numpy library
import numpy as np

# os library
import os

# Data parsing library, it's a happy little panda! :D
import pandas
import openpyxl
import re

What we are going to _do_ below is:

1. list the files we're interested in
1. load the data from the files into pandas data frames
1. look at the data, group it by subsystem (and later by subcategory, subsystem, genome and category), and count the entries in each group

In [2]:
# Name of folder containing csv files
csv_directory = "csv"

# Finding file names.
# os.listdir returns every entry of the folder, including hidden files and
# sub-directories (e.g. .DS_Store, .ipynb_checkpoints) that cannot be parsed
# as csv, so only names ending in ".csv" are kept. Sorting makes the order
# deterministic across platforms.
fileNames = sorted(
    name
    for name in os.listdir(csv_directory)
    if name.lower().endswith(".csv")
)

num_files = len(fileNames)

fileNames

['Mycoplasma_arginini_table.csv',
 'Mycoplasma_arthritidis_table.csv',
 'Mycoplasma_bovis_PG45_table.csv',
 'Mycoplasma_bovoculi_table.csv',
 'Mycoplasma_canis_table.csv',
 'Mycoplasma_capricolum_table.csv',
 'Mycoplasma_cloacale_table.csv',
 'Mycoplasma_columbinum_table.csv',
 'Mycoplasma_conjunctivae_table.csv',
 'Mycoplasma_crocodyli_table.csv',
 'Mycoplasma_cynos_table.csv',
 'Mycoplasma_dispar_table.csv',
 'Mycoplasma_fermentans_table.csv',
 'Mycoplasma_flocculare_table.csv',
 'Mycoplasma_gallisepticum_table.csv',
 'Mycoplasma_genitalium_G37_table.csv',
 'Mycoplasma_glycophilum_table.csv',
 'Mycoplasma_haemofelis_Langford1_table.csv',
 'Mycoplasma_haemofelis_Ohio2_table.csv',
 'Mycoplasma_hyopneumoniae_table.csv',
 'Mycoplasma_hyorhinis_table.csv',
 'Mycoplasma_iowae_695_table.csv',
 'Mycoplasma_lavaretus_table.csv',
 'Mycoplasma_leachii_table.csv',
 'Mycoplasma_mobile_163K_table.csv',
 'Mycoplasma_mycoides_table.csv',
 'Mycoplasma_mykiss_table.csv',
 'Mycoplasma_ovipneumoniae_tab

In [3]:
# Relative path (folder + "/" + file name) of every csv file found above
csv_files = [csv_directory + "/" + file_name for file_name in fileNames]

csv_files

['csv/Mycoplasma_arginini_table.csv',
 'csv/Mycoplasma_arthritidis_table.csv',
 'csv/Mycoplasma_bovis_PG45_table.csv',
 'csv/Mycoplasma_bovoculi_table.csv',
 'csv/Mycoplasma_canis_table.csv',
 'csv/Mycoplasma_capricolum_table.csv',
 'csv/Mycoplasma_cloacale_table.csv',
 'csv/Mycoplasma_columbinum_table.csv',
 'csv/Mycoplasma_conjunctivae_table.csv',
 'csv/Mycoplasma_crocodyli_table.csv',
 'csv/Mycoplasma_cynos_table.csv',
 'csv/Mycoplasma_dispar_table.csv',
 'csv/Mycoplasma_fermentans_table.csv',
 'csv/Mycoplasma_flocculare_table.csv',
 'csv/Mycoplasma_gallisepticum_table.csv',
 'csv/Mycoplasma_genitalium_G37_table.csv',
 'csv/Mycoplasma_glycophilum_table.csv',
 'csv/Mycoplasma_haemofelis_Langford1_table.csv',
 'csv/Mycoplasma_haemofelis_Ohio2_table.csv',
 'csv/Mycoplasma_hyopneumoniae_table.csv',
 'csv/Mycoplasma_hyorhinis_table.csv',
 'csv/Mycoplasma_iowae_695_table.csv',
 'csv/Mycoplasma_lavaretus_table.csv',
 'csv/Mycoplasma_leachii_table.csv',
 'csv/Mycoplasma_mobile_163K_table.cs

In [4]:
# Read each csv file into its own data frame; the list keeps the order of csv_files
cow_data_frames = [
    pandas.read_csv(csv_path, delimiter=",")
    for csv_path in csv_files
]

cow_data_frames

[                                             Category  \
 0    Cofactors, Vitamins, Prosthetic Groups, Pigments   
 1    Cofactors, Vitamins, Prosthetic Groups, Pigments   
 2    Cofactors, Vitamins, Prosthetic Groups, Pigments   
 3    Cofactors, Vitamins, Prosthetic Groups, Pigments   
 4    Cofactors, Vitamins, Prosthetic Groups, Pigments   
 ..                                                ...   
 309                                     Carbohydrates   
 310                                     Carbohydrates   
 311                                     Carbohydrates   
 312                                     Carbohydrates   
 313                                     Carbohydrates   
 
                                            Subcategory  \
 0    Cofactors, Vitamins, Prosthetic Groups, Pigmen...   
 1                                 Riboflavin, FMN, FAD   
 2                                 Riboflavin, FMN, FAD   
 3                                 Riboflavin, FMN, FAD   
 4     

### Grouping by subsystem, counting the entries in each group, and sorting by subsystem name

In [5]:
# One frame per input table: number of non-null entries in every column for
# each "Subsystem", with "Subsystem" restored as a regular column.
grouped_cat_dfs = [
    frame
    .groupby("Subsystem")
    .count()
    .reset_index()
    for frame in cow_data_frames
]

In [6]:
# Show the list of per-file "Subsystem" count tables built in the previous cell
grouped_cat_dfs

[                                           Subsystem  Category  Subcategory  \
 0    16S rRNA modification within P site of ribosome         3            3   
 1    ABC transporter alkylphosphonate (TC 3.A.1.9.1)         2            2   
 2        ABC transporter oligopeptide (TC 3.A.1.5.1)         2            2   
 3                             Adenosyl nucleosidases         3            3   
 4                               Alanine biosynthesis         1            1   
 ..                                               ...       ...          ...   
 99                          tRNA aminoacylation, Trp         1            1   
 100                         tRNA aminoacylation, Tyr         1            1   
 101                         tRNA aminoacylation, Val         1            1   
 102                                  tRNA processing         3            3   
 103                                            tRNAs         7            7   
 
      Role  Features  
 0       3     

In [7]:
# Read every csv file and stack the tables into one frame, tagging each row
# with the file it came from in the column "New" (file name without extension).
# os.path.splitext removes only the final extension, so a file name that itself
# contains dots keeps its full stem instead of being cut at the first dot.
data = pandas.concat(
    [
        pandas.read_csv(fname).assign(
            New=os.path.splitext(os.path.basename(fname))[0]
        )
        for fname in csv_files
    ]
)
data

Unnamed: 0,Category,Subcategory,Subsystem,Role,Features,New
0,"Cofactors, Vitamins, Prosthetic Groups, Pigments","Cofactors, Vitamins, Prosthetic Groups, Pigmen...",Thiamin biosynthesis,tRNA S(4)U 4-thiouridine synthase (former ThiI),fig|2094.47.peg.217,Mycoplasma_arginini_table
1,"Cofactors, Vitamins, Prosthetic Groups, Pigments","Riboflavin, FMN, FAD","Riboflavin, FMN and FAD metabolism in plants",FMN adenylyltransferase (EC 2.7.7.2),fig|2094.47.peg.123,Mycoplasma_arginini_table
2,"Cofactors, Vitamins, Prosthetic Groups, Pigments","Riboflavin, FMN, FAD","Riboflavin, FMN and FAD metabolism in plants",FIG000859: hypothetical protein YebC,fig|2094.47.peg.84,Mycoplasma_arginini_table
3,"Cofactors, Vitamins, Prosthetic Groups, Pigments","Riboflavin, FMN, FAD","Riboflavin, FMN and FAD metabolism in plants",tRNA pseudouridine synthase B (EC 4.2.1.70),fig|2094.47.peg.121,Mycoplasma_arginini_table
4,"Cofactors, Vitamins, Prosthetic Groups, Pigments","Riboflavin, FMN, FAD","Riboflavin, FMN and FAD metabolism in plants",Riboflavin kinase (EC 2.7.1.26),fig|2094.47.peg.123,Mycoplasma_arginini_table
...,...,...,...,...,...,...
453,Carbohydrates,Sugar alcohols,Mannitol Utilization,HPr kinase/phosphorylase (EC 2.7.1.-) (EC 2.7....,fig|743967.4.peg.482,Mycoplasma_yeatsii_table
454,Carbohydrates,Monosaccharides,Mannose Metabolism,Mannose-6-phosphate isomerase (EC 5.3.1.8),fig|743967.4.peg.520,Mycoplasma_yeatsii_table
455,Carbohydrates,Monosaccharides,Deoxyribose and Deoxynucleoside Catabolism,Purine nucleoside phosphorylase (EC 2.4.2.1),fig|743967.4.peg.694,Mycoplasma_yeatsii_table
456,Carbohydrates,Monosaccharides,Deoxyribose and Deoxynucleoside Catabolism,Thymidine phosphorylase (EC 2.4.2.4),"fig|743967.4.peg.633, fig|743967.4.peg.691",Mycoplasma_yeatsii_table


In [8]:
# Save the combined table to an Excel workbook in the working directory
data.to_excel(excel_writer="data_test.xlsx")

In [9]:
# Count the non-null "Role" and "Features" entries for every combination of
# Subcategory, Subsystem, source-file label ("New") and Category, then turn
# the group keys back into ordinary columns.
grouped_cat_dfs = (
    data
    .groupby(['Subcategory', 'Subsystem', 'New', 'Category'])
    .count()
    .reset_index()
)
grouped_cat_dfs

Unnamed: 0,Subcategory,Subsystem,New,Category,Role,Features
0,ABC transporters,ABC transporter alkylphosphonate (TC 3.A.1.9.1),Mycoplasma_arginini_table,Membrane Transport,2,2
1,ABC transporters,ABC transporter alkylphosphonate (TC 3.A.1.9.1),Mycoplasma_bovis_PG45_table,Membrane Transport,3,3
2,ABC transporters,ABC transporter alkylphosphonate (TC 3.A.1.9.1),Mycoplasma_capricolum_table,Membrane Transport,4,4
3,ABC transporters,ABC transporter alkylphosphonate (TC 3.A.1.9.1),Mycoplasma_crocodyli_table,Membrane Transport,3,3
4,ABC transporters,ABC transporter alkylphosphonate (TC 3.A.1.9.1),Mycoplasma_flocculare_table,Membrane Transport,1,1
...,...,...,...,...,...,...
4188,Triacylglycerols,Triacylglycerol metabolism,Mycoplasma_mycoides_table,"Fatty Acids, Lipids, and Isoprenoids",3,3
4189,Triacylglycerols,Triacylglycerol metabolism,Mycoplasma_yeatsii_table,"Fatty Acids, Lipids, and Isoprenoids",3,3
4190,Uni- Sym- and Antiporters,"NhaA, NhaD and Sodium-dependent phosphate tran...",Mycoplasma_cynos_table,Membrane Transport,1,1
4191,Uni- Sym- and Antiporters,"NhaA, NhaD and Sodium-dependent phosphate tran...",Mycoplasma_hyorhinis_table,Membrane Transport,1,1


In [10]:
# Write the grouped count table to an Excel workbook in the working directory
grouped_cat_dfs.to_excel(excel_writer="Rast_Subcategory_Comparison.xlsx")