In [1]:
import pandas as pd
import re

In [2]:
# Output column name -> single-letter key used inside a BUSCO summary string.
_BUSCO_PERCENT_KEYS = {
  'Complete': 'C',
  'Single': 'S',
  'Double': 'D',
  'Fragmented': 'F',
  'Missing': 'M',
}
_BUSCO_COLUMNS = list(_BUSCO_PERCENT_KEYS) + ['Count']

# Labels given to the lineage levels, in order of appearance in the string.
# NOTE(review): labels are assigned purely by position. Lineages that contain
# more or fewer intermediate clades than this template will end up under the
# wrong rank label -- verify against your data before relying on the names.
_LINEAGE_COLUMN_NAMES = [
  'Taxonomy_Root',
  'Taxonomy_Domain',
  'Taxonomy_Clade1',
  'Taxonomy_Phylum',
  'Taxonomy_Clade2',
  'Taxonomy_Class',
  'Taxonomy_Order',
  'Taxonomy_Family',
  'Taxonomy_Genus',
  'Taxonomy_Species',
]


def _parse_busco(busco_str):
  """Parse one BUSCO summary such as
  ``C:99.8%[S:99.6%,D:0.2%],F:0.0%,M:0.2%,n:904`` into a dict.

  Percent fields are returned as ``float`` and ``Count`` as ``int``. Any field
  that cannot be found is ``None``; a missing or non-string input yields a
  record whose values are all ``None``.
  """
  record = dict.fromkeys(_BUSCO_COLUMNS)

  # Covers NaN/None as well as stray numeric cells coming from the spreadsheet.
  if not isinstance(busco_str, str):
    return record

  for column, key in _BUSCO_PERCENT_KEYS.items():
    match = re.search(rf'{key}:([\d\.]+)%', busco_str)
    if match:
      record[column] = float(match.group(1))

  n_match = re.search(r'n:(\d+)', busco_str)
  if n_match:
    record['Count'] = int(n_match.group(1))

  return record


def _lineage_column_names(n_levels):
  """Return ``n_levels`` column labels: the named ranks first, then generic
  ``Taxonomy_Level_<k>`` labels for any level beyond the named list."""
  extra = [
    f'Taxonomy_Level_{i+1}'
    for i in range(len(_LINEAGE_COLUMN_NAMES), n_levels)
  ]
  return _LINEAGE_COLUMN_NAMES[:n_levels] + extra


def process_proteome_data(input_file_path, output_file_path):
  """
  Read the raw proteome table from an Excel workbook, expand the 'BUSCO' and
  'Taxonomic lineage' columns into one column per value, and save the result
  as CSV.

  Parameters
  ----------
  input_file_path : path of the Excel workbook, passed to ``pd.read_excel``.
  output_file_path : path of the CSV file to write (without the index).

  Returns
  -------
  The first five rows of the processed dataframe, or -- if the workbook cannot
  be read -- a string of the form ``"Error reading file: ..."``.

  Raises
  ------
  KeyError : if the 'BUSCO' or 'Taxonomic lineage' column is absent.
  """
  # Load the dataset; any read failure is reported to the caller as a string.
  try:
    df = pd.read_excel(input_file_path)
  except Exception as e:
    return f"Error reading file: {e}"

  missing_cols = [c for c in ('BUSCO', 'Taxonomic lineage') if c not in df.columns]
  if missing_cols:
    raise KeyError(f"Input file is missing required column(s): {missing_cols}")

  # --- 1. Process BUSCO Column ---

  # Build the frame once from a list of records instead of creating a Series
  # per row.
  records = [_parse_busco(value) for value in df['BUSCO']]
  busco_data = pd.DataFrame(records, columns=_BUSCO_COLUMNS, index=df.index)
  busco_data = busco_data.astype({col: float for col in _BUSCO_PERCENT_KEYS})
  # Nullable integer dtype keeps the gene count integral even when some rows
  # have no BUSCO value.
  busco_data['Count'] = pd.array([rec['Count'] for rec in records], dtype='Int64')

  df = pd.concat([df, busco_data], axis=1)

  # --- 2. Process Taxonomic Lineage Column ---

  # `expand=True` turns the split lists into separate columns; shorter
  # lineages are padded with missing values on the right.
  lineage_split = df['Taxonomic lineage'].str.split(', ', expand=True)
  lineage_split.columns = _lineage_column_names(lineage_split.shape[1])

  df = pd.concat([df, lineage_split], axis=1)

  # --- 3. Cleanup ---

  # The raw columns are fully represented by the expanded ones.
  df = df.drop(columns=['BUSCO', 'Taxonomic lineage'])

  # Print the length of the dataset
  print(f"Dataset length: {len(df)}")

  # Save to new file
  df.to_csv(output_file_path, index=False)

  return df.head()

# Example usage (replace the file names with your actual file paths).
# Note: `input_csv` points at an Excel workbook; the variable names are kept
# as they are because `output_csv` is reused by the filtering cell.
input_csv = 'proteomes_thermoproteota.xlsx'
output_csv = 'processed_proteomes_thermoproteota_data.csv'

# Run the function
df_head = process_proteome_data(input_csv, output_csv)

# The function reports a read failure by returning a string, so only announce
# success when a dataframe actually came back.
if not isinstance(df_head, str):
  print("Processing complete. First 5 rows of processed data:")
df_head

  warn("Workbook contains no default style, apply openpyxl's default")


Dataset length: 1607
Processing complete. First 5 rows of processed data:


Unnamed: 0,Proteome Id,Organism,Organism Id,Protein count,CPD,Complete,Single,Double,Fragmented,Missing,Count,Taxonomy_Root,Taxonomy_Domain,Taxonomy_Clade1,Taxonomy_Phylum,Taxonomy_Clade2,Taxonomy_Class,Taxonomy_Order,Taxonomy_Family,Taxonomy_Genus
0,UP000005867,Pyrobaculum ferrireducens,1104324,2825,Unknown,96.5,96.0,0.5,1.0,2.5,404.0,cellular organisms,Archaea,Thermoproteati,Thermoproteota,Thermoprotei,Thermoproteales,Thermoproteaceae,Pyrobaculum,
1,UP000015543,Thermofilum adornatum,1365176,1896,Unknown,94.3,94.3,0.0,0.7,5.1,296.0,cellular organisms,Archaea,Thermoproteati,Thermoproteota,Thermoprotei,Thermofilales,Thermofilaceae,Thermofilum,
2,UP000053352,Pyrodictium occultum,2309,1602,Unknown,96.1,95.9,0.2,0.2,3.7,491.0,cellular organisms,Archaea,Thermoproteati,Thermoproteota,Thermoprotei,Desulfurococcales,Pyrodictiaceae,Pyrodictium,
3,UP000067434,Infirmifilum uzonense,1550241,1454,Unknown,82.1,82.1,0.0,0.0,17.9,296.0,cellular organisms,Archaea,Thermoproteati,Thermoproteota,Thermoprotei,Thermofilales,Thermofilaceae,Infirmifilum,
4,UP000193404,Acidianus manzaensis,282676,2641,Unknown,98.0,97.2,0.8,0.2,1.8,1244.0,cellular organisms,Archaea,Thermoproteati,Thermoproteota,Thermoprotei,Sulfolobales,Sulfolobaceae,Acidianus,


In [3]:
def complete_and_contamination_filter(file, min_complete=90.0, max_missing=5.0, output_file=None):
  """
  Keep only the rows of a processed proteome CSV whose BUSCO scores pass the
  given thresholds, and write the filtered table back to disk.

  A row is kept when ``Complete >= min_complete`` and ``Missing <= max_missing``.
  NOTE(review): despite the function name, no contamination/duplication column
  is consulted -- only 'Complete' and 'Missing'.

  Parameters
  ----------
  file : path of the CSV to read; it must contain 'Complete' and 'Missing'.
  min_complete : lowest accepted 'Complete' value (default 90.0).
  max_missing : highest accepted 'Missing' value (default 5.0).
  output_file : where to write the filtered CSV. When ``None`` (default) the
    input file is overwritten, so re-running on the same path starts from the
    already-filtered data.

  Returns
  -------
  The first five filtered rows (original row labels preserved), or a string
  of the form ``'Error ...'`` if the file cannot be read.
  """
  try:
    df = pd.read_csv(file)
  except Exception as e:
    print(f"Unable to read file: {file}")
    return f'Error {e}'

  print(f'Before filter: {len(df)}')
  keep = (df['Complete'] >= min_complete) & (df['Missing'] <= max_missing)
  filtered = df[keep]
  print(f'After filter: {len(filtered)}')

  destination = file if output_file is None else output_file
  filtered.to_csv(destination, index=False)

  return filtered.head()


# Filter the processed CSV written by the previous cell. The function writes
# its result back to the path it is given, so `output_csv` is overwritten here.
df_head = complete_and_contamination_filter(output_csv)
df_head

Before filter: 1607
After filter: 250


Unnamed: 0,Proteome Id,Organism,Organism Id,Protein count,CPD,Complete,Single,Double,Fragmented,Missing,Count,Taxonomy_Root,Taxonomy_Domain,Taxonomy_Clade1,Taxonomy_Phylum,Taxonomy_Clade2,Taxonomy_Class,Taxonomy_Order,Taxonomy_Family,Taxonomy_Genus
0,UP000005867,Pyrobaculum ferrireducens,1104324,2825,Unknown,96.5,96.0,0.5,1.0,2.5,404.0,cellular organisms,Archaea,Thermoproteati,Thermoproteota,Thermoprotei,Thermoproteales,Thermoproteaceae,Pyrobaculum,
2,UP000053352,Pyrodictium occultum,2309,1602,Unknown,96.1,95.9,0.2,0.2,3.7,491.0,cellular organisms,Archaea,Thermoproteati,Thermoproteota,Thermoprotei,Desulfurococcales,Pyrodictiaceae,Pyrodictium,
4,UP000193404,Acidianus manzaensis,282676,2641,Unknown,98.0,97.2,0.8,0.2,1.8,1244.0,cellular organisms,Archaea,Thermoproteati,Thermoproteota,Thermoprotei,Sulfolobales,Sulfolobaceae,Acidianus,
5,UP000196694,Pyrodictium delaneyi,1273541,1990,Unknown,99.2,98.8,0.4,0.0,0.8,491.0,cellular organisms,Archaea,Thermoproteati,Thermoproteota,Thermoprotei,Desulfurococcales,Pyrodictiaceae,Pyrodictium,
6,UP000248044,Acidianus brierleyi,41673,2858,Unknown,95.7,95.0,0.7,0.1,4.2,1244.0,cellular organisms,Archaea,Thermoproteati,Thermoproteota,Thermoprotei,Sulfolobales,Sulfolobaceae,Acidianus,
