In [None]:
import pandas as pd
import os
import glob

In [None]:
# Colab-only: mount Google Drive under /content/drive
# (force_remount=True presumably re-mounts even when a mount already exists — confirm in Colab docs)
from google.colab import drive
drive.mount("/content/drive", force_remount=True)
# Copy the 'All Organisms' data folder from Drive into /content; the loading
# code below reads its CSV files from '/content/All Organisms'
!cp -r '/content/drive/MyDrive/פרויקט גמר/DATA/All Organisms' '/content'

Mounted at /content/drive


In [None]:
def add_human_organism(folder_path='All Organisms/human_hg38', file_type='tata'):
    """
    Read the human (GRCh38) genome or promoter CSV and label every row as 'human'.

    The file is looked up inside ``folder_path`` under one of two names:

    * ``GRCh38_human_genome.csv`` when ``file_type == 'genome'``
    * ``GRCh38_human_promoter_<file_type>.csv`` for any other ``file_type``
      (e.g. ``'tata'`` or ``'non_tata'``)

    The first line of the file is skipped (it is assumed to be a header row)
    and the three columns are named ``ID``, ``lengths`` and ``seq``.

    Parameters:
    folder_path (str): Folder containing the human CSV files. A relative path
        is resolved against the current working directory
        (default is 'All Organisms/human_hg38').
    file_type (str): The type of file to read ('genome' or a promoter type like 'tata').

    Returns:
    pd.DataFrame: The file's rows with columns ['ID', 'lengths', 'seq', 'organism'],
        where 'organism' is 'human' on every row.

    Raises:
    FileNotFoundError: If the expected CSV file does not exist in ``folder_path``.
    """
    # The genome file has no '_promoter' infix; every other file type is a promoter set
    if file_type == 'genome':
        file_name = f'GRCh38_human_{file_type}.csv'
    else:
        file_name = f'GRCh38_human_promoter_{file_type}.csv'
    file_path = os.path.join(folder_path, file_name)

    # skiprows=1 drops the file's own first line so the fixed column names apply
    df = pd.read_csv(file_path, skiprows=1, names=["ID", "lengths", "seq"])

    # Use a constant label: deriving it from the file name produced
    # 'human_genome.csv' for the genome file instead of 'human'
    df['organism'] = 'human'

    return df

In [None]:
def combine_organisms(folder_path='All Organisms', file_type='tata'):
    """
    Combine per-organism CSV files plus the human file into a single DataFrame.

    Every sub-folder of ``folder_path`` is searched for a file named
    ``<file_type>_*.csv``; the first match (in sorted order) is read, and the
    part of the file name between the ``<file_type>_`` prefix and the ``.csv``
    extension becomes that file's 'organism' label. The human data, which
    follows a different naming scheme, is loaded through
    ``add_human_organism`` from the ``human_hg38`` sub-folder and appended last.

    Parameters:
    folder_path (str): Folder whose sub-folders hold the per-organism files
        (default is 'All Organisms').
    file_type (str): The type of file to read ('genome' or a promoter type like 'tata').

    Returns:
    pd.DataFrame: All rows stacked with a fresh 0..n-1 index and columns
        ['ID', 'lengths', 'seq', 'organism'].

    Raises:
    FileNotFoundError: If ``folder_path`` or the human CSV file does not exist.
    """
    prefix = f'{file_type}_'

    # Collect one matching CSV per organism sub-folder. Sorting both the
    # folders and the matches makes the row order and the chosen file
    # reproducible (os.listdir / glob return entries in arbitrary order).
    file_paths = []
    for folder in sorted(os.listdir(folder_path)):
        sub_folder = os.path.join(folder_path, folder)
        if not os.path.isdir(sub_folder):
            continue
        csv_files = sorted(glob.glob(os.path.join(sub_folder, f'{prefix}*.csv')))
        if csv_files:
            file_paths.append(csv_files[0])

    # Read each file and tag its rows with the organism taken from the file name
    dfs = []
    for file_path in file_paths:
        df = pd.read_csv(file_path, skiprows=1, names=["ID", "lengths", "seq"])
        base_name = os.path.splitext(os.path.basename(file_path))[0]
        # Strip only the leading '<file_type>_' (the glob guarantees it is there)
        # rather than every occurrence of that text inside the name
        df['organism'] = base_name[len(prefix):]
        dfs.append(df)

    # Locate the human folder relative to folder_path instead of the working directory
    human_df = add_human_organism(
        folder_path=os.path.join(folder_path, 'human_hg38'),
        file_type=file_type,
    )
    dfs.append(human_df)

    # A single concat also works when no other organism file matched
    return pd.concat(dfs, ignore_index=True)

TATA data combined

In [None]:
# Combine the 'tata_*.csv' file of every organism folder with the human TATA promoter file
tata_combined_df = combine_organisms(folder_path='/content/All Organisms',file_type='tata')

non-TATA data combined

In [None]:
# Same combination for the 'non_tata_*.csv' files.
# NOTE(review): folder_path is relative here, so this relies on the working directory being /content
non_tata_combined_df = combine_organisms(folder_path='All Organisms',file_type='non_tata')

Save to drive

In [None]:
# Destination on the mounted Drive for the combined TATA table
output_file_path = '/content/drive/MyDrive/פרויקט גמר/DATA/combined positive data/TATA_combined_df.csv'

# index=False keeps the DataFrame's row index out of the CSV
tata_combined_df.to_csv(output_file_path, index=False)

In [None]:
# Destination on the mounted Drive for the combined non-TATA table
output_file_path = '/content/drive/MyDrive/פרויקט גמר/DATA/combined positive data/non_TATA_combined_df.csv'

# index=False keeps the DataFrame's row index out of the CSV
non_tata_combined_df.to_csv(output_file_path, index=False)

All data - TATA and non-TATA combined

In [None]:
# Stack the TATA rows followed by the non-TATA rows; ignore_index=True renumbers the rows from 0.
# NOTE(review): the only column added upstream is 'organism', so nothing in the
# result records which promoter class a row came from — confirm this is intended
all_data_combined = pd.concat([tata_combined_df, non_tata_combined_df], ignore_index=True)

In [None]:
# Destination on the mounted Drive for the TATA + non-TATA table
output_file_path = '/content/drive/MyDrive/פרויקט גמר/DATA/combined positive data/all_data_combined_df.csv'

# index=False keeps the DataFrame's row index out of the CSV
all_data_combined.to_csv(output_file_path, index=False)

Genome data combination

In [None]:
# Combine the 'genome_*.csv' file of every organism folder with the human genome file.
# NOTE(review): folder_path is relative here, so this relies on the working directory being /content
genome_combined_df = combine_organisms(folder_path='All Organisms',file_type='genome')

In [None]:
# Destination on the mounted Drive for the combined genome table
output_file_path = '/content/drive/MyDrive/פרויקט גמר/DATA/combined positive data/genome_combined_df.csv'

# index=False keeps the DataFrame's row index out of the CSV
genome_combined_df.to_csv(output_file_path, index=False)