In [None]:
import pandas as pd
from google.colab import drive
import os

1. Load data files from drive
2. Combine them into one df

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the cells
# below read from and write to paths under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def combine_organisms_to_df(folder_path):
    """
    Combine every CSV file found directly inside a folder into one DataFrame.

    Only entries whose names end with '.csv' are read (no recursion into
    sub-folders). Files are read in sorted filename order so the row order of
    the result is reproducible between runs; os.listdir itself returns entries
    in arbitrary order. After concatenation the columns 'id' and 'name', when
    present, are renamed to 'ID' and 'organism' respectively.

    Parameters:
    folder_path (str): The path to the folder containing the CSV files.

    Returns:
    DataFrame: The concatenated rows of all CSV files, with a fresh
        0..n-1 index.

    Raises:
    ValueError: If the folder contains no '.csv' files.
    """
    # Sort for a deterministic read (and therefore row) order.
    csv_names = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.csv')
    )

    # Fail with an explicit message instead of pd.concat's generic
    # "No objects to concatenate" error.
    if not csv_names:
        raise ValueError(f"No CSV files found in folder: {folder_path}")

    frames = [pd.read_csv(os.path.join(folder_path, name)) for name in csv_names]

    combined = pd.concat(frames, ignore_index=True)
    # rename silently skips keys that are not present as columns.
    return combined.rename(columns={'id': 'ID', 'name': 'organism'})

TATA

In [None]:
# Positive TATA samples: one combined frame per split (train / validation / test).
train_tata_df = combine_organisms_to_df(
    '/content/drive/MyDrive/פרויקט גמר/DATA/train and test/train/TATA/with validation/train'
)
validation_tata_df = combine_organisms_to_df(
    '/content/drive/MyDrive/פרויקט גמר/DATA/train and test/train/TATA/with validation/validation'
)
test_tata_df = combine_organisms_to_df(
    '/content/drive/MyDrive/פרויקט גמר/DATA/train and test/test/TATA'
)

NON TATA

In [None]:
# Positive non-TATA samples: one combined frame per split (train / validation / test).
train_non_tata_df = combine_organisms_to_df(
    '/content/drive/MyDrive/פרויקט גמר/DATA/train and test/train/non-TATA/with validation/train'
)
validation_non_tata_df = combine_organisms_to_df(
    '/content/drive/MyDrive/פרויקט גמר/DATA/train and test/train/non-TATA/with validation/validation'
)
test_non_tata_df = combine_organisms_to_df(
    '/content/drive/MyDrive/פרויקט גמר/DATA/train and test/test/non-TATA'
)

Negative - Method 2

In [None]:
# Method-2 negative samples, stored as one CSV per promoter class.
neg_tata_met_2 = pd.read_csv(
    '/content/drive/MyDrive/פרויקט גמר/DATA/negative data method 2/neg_tata_method2.csv'
)
neg_non_tata_met_2 = pd.read_csv(
    '/content/drive/MyDrive/פרויקט גמר/DATA/negative data method 2/neg_non_tata_method2.csv'
)

### Match each negative record to its positive record

TATA train

In [None]:
# Left-join each positive TATA training record to the method-2 negatives on
# (ID, organism); positives without a match keep NaN in the right-hand columns.
combined_tata_train_df = train_tata_df.merge(
    neg_tata_met_2,
    how='left',
    on=['ID', 'organism'],
)

In [None]:
# Keep the join keys plus 'seq_y' (the 'seq' column contributed by the
# right-hand, negative frame of the merge) and expose it as 'seq'.
# The rename is chained rather than applied with inplace=True on a column
# slice, which would trigger pandas' SettingWithCopyWarning.
neg_tata_train_df = (
    combined_tata_train_df[['organism', 'ID', 'seq_y']]
    .rename(columns={'seq_y': 'seq'})
)

TATA validation

In [None]:
# Left-join each positive TATA validation record to the method-2 negatives on
# (ID, organism); positives without a match keep NaN in the right-hand columns.
combined_tata_val_df = validation_tata_df.merge(
    neg_tata_met_2,
    how='left',
    on=['ID', 'organism'],
)

In [None]:
# Keep the join keys plus 'seq_y' (the 'seq' column contributed by the
# right-hand, negative frame of the merge) and expose it as 'seq'.
# The rename is chained rather than applied with inplace=True on a column
# slice, which would trigger pandas' SettingWithCopyWarning.
neg_tata_val_df = (
    combined_tata_val_df[['organism', 'ID', 'seq_y']]
    .rename(columns={'seq_y': 'seq'})
)

TATA test

In [None]:
# Left-join each positive TATA test record to the method-2 negatives on
# (ID, organism); positives without a match keep NaN in the right-hand columns.
combined_tata_test_df = test_tata_df.merge(
    neg_tata_met_2,
    how='left',
    on=['ID', 'organism'],
)

In [None]:
# Keep the join keys plus 'seq_y' (the 'seq' column contributed by the
# right-hand, negative frame of the merge) and expose it as 'seq'.
# The rename is chained rather than applied with inplace=True on a column
# slice, which would trigger pandas' SettingWithCopyWarning.
neg_tata_test_df = (
    combined_tata_test_df[['organism', 'ID', 'seq_y']]
    .rename(columns={'seq_y': 'seq'})
)

NON TATA train

In [None]:
# Left-join each positive non-TATA training record to the method-2 negatives on
# (ID, organism); positives without a match keep NaN in the right-hand columns.
combined_non_tata_train_df = train_non_tata_df.merge(
    neg_non_tata_met_2,
    how='left',
    on=['ID', 'organism'],
)

In [None]:
# Keep the join keys plus 'seq_y' (the 'seq' column contributed by the
# right-hand, negative frame of the merge) and expose it as 'seq'.
# The rename is chained rather than applied with inplace=True on a column
# slice, which would trigger pandas' SettingWithCopyWarning.
neg_non_tata_train_df = (
    combined_non_tata_train_df[['organism', 'ID', 'seq_y']]
    .rename(columns={'seq_y': 'seq'})
)

NON TATA validation

In [None]:
# Left-join each positive non-TATA validation record to the method-2 negatives
# on (ID, organism); positives without a match keep NaN in the right-hand columns.
combined_non_tata_val_df = validation_non_tata_df.merge(
    neg_non_tata_met_2,
    how='left',
    on=['ID', 'organism'],
)

In [None]:
# Keep the join keys plus 'seq_y' (the 'seq' column contributed by the
# right-hand, negative frame of the merge) and expose it as 'seq'.
# The rename is chained rather than applied with inplace=True on a column
# slice, which would trigger pandas' SettingWithCopyWarning.
neg_non_tata_val_df = (
    combined_non_tata_val_df[['organism', 'ID', 'seq_y']]
    .rename(columns={'seq_y': 'seq'})
)

NON TATA test

In [None]:
# Left-join each positive non-TATA test record to the method-2 negatives on
# (ID, organism); positives without a match keep NaN in the right-hand columns.
combined_non_tata_test_df = test_non_tata_df.merge(
    neg_non_tata_met_2,
    how='left',
    on=['ID', 'organism'],
)

In [None]:
# Keep the join keys plus 'seq_y' (the 'seq' column contributed by the
# right-hand, negative frame of the merge) and expose it as 'seq'.
# The rename is chained rather than applied with inplace=True on a column
# slice, which would trigger pandas' SettingWithCopyWarning.
neg_non_tata_test_df = (
    combined_non_tata_test_df[['organism', 'ID', 'seq_y']]
    .rename(columns={'seq_y': 'seq'})
)

Save to drive

In [None]:
def combine_df(
    pos_df,
    neg_df,
    file_name,
    output_dir='/content/drive/MyDrive/פרויקט גמר/DATA/train and test/train, val, test combined method 2',
):
    """
    Label, stack, and save positive and negative samples as one DataFrame.

    Positive rows get True_label = 1 and negative rows True_label = 0. The two
    labeled frames are concatenated (positives first) with a fresh index, the
    'Unnamed: 0' column is dropped if present, and the result is written as a
    CSV file named `file_name` inside `output_dir`.

    The input DataFrames are not modified; labels are added to copies.

    Parameters:
    pos_df (DataFrame): DataFrame containing positive samples.
    neg_df (DataFrame): DataFrame containing negative samples.
    file_name (str): The name of the CSV file to write inside `output_dir`.
    output_dir (str): Folder the CSV is written to. Defaults to the project's
        "train, val, test combined method 2" folder on the mounted Drive.

    Returns:
    DataFrame: The combined DataFrame with a 'True_label' column.
    """
    # assign() returns new frames, so the caller's DataFrames are left
    # untouched and no SettingWithCopyWarning is raised for sliced inputs.
    labeled_pos = pos_df.assign(True_label=1)
    labeled_neg = neg_df.assign(True_label=0)

    # Concatenate the DataFrames (positives first, index renumbered).
    combined_df = pd.concat([labeled_pos, labeled_neg], ignore_index=True)

    # 'Unnamed: 0' is presumably a leftover index column from an earlier CSV
    # export; errors='ignore' keeps this from failing when it is absent.
    combined_df = combined_df.drop(columns='Unnamed: 0', errors='ignore')

    # The row index is still written to the file (pandas default), matching
    # the file layout this function has always produced.
    combined_df.to_csv(os.path.join(output_dir, file_name))

    return combined_df

In [None]:
# TATA training split: label, stack, and save positives with their negatives.
combine_df(
    pos_df=train_tata_df,
    neg_df=neg_tata_train_df,
    file_name="train_tata_met_2.csv",
)

In [None]:
# Non-TATA training split: label, stack, and save positives with their negatives.
combine_df(
    pos_df=train_non_tata_df,
    neg_df=neg_non_tata_train_df,
    file_name="train_non_tata_met_2.csv",
)

In [None]:
# TATA validation split: label, stack, and save positives with their negatives.
combine_df(
    pos_df=validation_tata_df,
    neg_df=neg_tata_val_df,
    file_name="val_tata_met_2.csv",
)

In [None]:
# Non-TATA validation split: label, stack, and save positives with their negatives.
combine_df(
    pos_df=validation_non_tata_df,
    neg_df=neg_non_tata_val_df,
    file_name="val_non_tata_met_2.csv",
)

In [None]:
# TATA test split: label, stack, and save positives with their negatives.
combine_df(
    pos_df=test_tata_df,
    neg_df=neg_tata_test_df,
    file_name="test_tata_met_2.csv",
)

In [None]:
# Non-TATA test split: label, stack, and save positives with their negatives.
combine_df(
    pos_df=test_non_tata_df,
    neg_df=neg_non_tata_test_df,
    file_name="test_non_tata_met_2.csv",
)