In [None]:
# Mount Google Drive at /content/drive so the project data stored under MyDrive is reachable from this runtime.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os

In [None]:
!cp -r '/content/drive/MyDrive/פרויקט גמר/DATA/train and test/genome' '/content'

In [None]:
!cp -r '/content/drive/MyDrive/פרויקט גמר/DATA/train and test/train' '/content'

In [None]:
!cp -r '/content/drive/MyDrive/פרויקט גמר/DATA/train and test/test' '/content'

In [None]:
# Load the three genome CSVs from /content/genome.
# The targets and the paths are listed in the same order.
test_df_genome, train_df_genome, val_df_genome = map(
    pd.read_csv,
    [
        '/content/genome/test_df_genome.csv',
        '/content/genome/train_df_genome.csv',
        '/content/genome/val_df_genome.csv',
    ],
)

In [None]:
# Pool the three genome frames into a single frame; a later cell re-splits it per organism.
# ignore_index is not passed, so each frame keeps its own row labels in the result.
genome_df = pd.concat([test_df_genome,train_df_genome,val_df_genome])

In [None]:
# Work on an independent deep copy so that changes made to genome_df_copy do not modify genome_df.
import copy
genome_df_copy = copy.deepcopy(genome_df)

In [None]:
# Function to clean sequences
def clean_sequence(seq):
    """
    Normalizes a DNA sequence to uppercase and rejects it if it contains 'N'.

    The case is normalized before the check, so a lowercase 'n' is rejected
    exactly like an uppercase 'N'. Values that are not strings (for example a
    missing cell read as NaN) are rejected as well instead of raising.

    Parameters:
    seq (str): The input DNA sequence to be cleaned.

    Returns:
    str or None: The sequence in uppercase, or None if the input is not a
    string or contains 'N' / 'n'.
    """
    if not isinstance(seq, str):
        return None  # Missing or non-text value: mark for deletion

    upper_seq = seq.upper()
    if 'N' in upper_seq:
        return None  # Mark for deletion
    return upper_seq

# Clean every subsequence, then keep only the rows whose sequence survived
# (clean_sequence returns None for the rows that must be removed).
genome_df_copy = genome_df_copy.assign(
    Subsequence=genome_df_copy['Subsequence'].apply(clean_sequence)
).dropna(subset=['Subsequence'])

In [None]:
# Write the cleaned frame to the current working directory.
# index=False is not passed, so the row labels are written as the first CSV column.
genome_df_copy.to_csv('genome_df_copy.csv')

In [None]:
# Three independent, empty accumulators for the train / validation / test splits.
train_combined, val_combined, test_combined = (pd.DataFrame() for _ in range(3))

In [None]:
# Function to split the DataFrame
def split_and_aggregate(df, organism_name, test_size=0.2, val_size=0.2, random_state=42):
    """
    Splits the rows of one organism into training, validation, and test sets.

    The split is done in two steps: the test set is taken from all rows of the
    organism, and the validation set is then taken from the remaining rows.
    `val_size` is therefore a fraction of the remainder, not of the whole.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing genomic data with an 'organism_name' column.
    organism_name (str): The name of the organism for which to filter the DataFrame.
    test_size (float): Fraction of the organism's rows held out as the test set (default 0.2).
    val_size (float): Fraction of the non-test rows held out as the validation set (default 0.2).
    random_state (int): Seed passed to both train_test_split calls (default 42).

    Returns:
    tuple: A tuple containing three DataFrames. With the default sizes:
        - train_df: The training set (about 64% of the filtered data).
        - val_df: The validation set (about 16% of the filtered data).
        - test_df: The test set (about 20% of the filtered data).
    """
    organism_df = df[df['organism_name'] == organism_name]

    # Step 1: hold out the test rows. Step 2: split what is left into train / validation.
    temp_df, test_df = train_test_split(organism_df, test_size=test_size, random_state=random_state)
    train_df, val_df = train_test_split(temp_df, test_size=val_size, random_state=random_state)

    return train_df, val_df, test_df

In [None]:
# Get unique organism names
# NOTE(review): read from genome_df, not from the 'N'-filtered genome_df_copy — confirm this is intended.
organisms = genome_df['organism_name'].unique()

In [None]:
# Split and aggregate for each organism
# The per-organism splits are collected first and concatenated once per set.
# The result does not depend on the previous contents of the *_combined
# frames, so re-running this cell no longer appends the same rows again, and
# the frames are not re-copied on every iteration.
# NOTE(review): the splits are taken from genome_df, not from the 'N'-filtered
# genome_df_copy — confirm this is intended.
organism_splits = [split_and_aggregate(genome_df, organism) for organism in organisms]
train_combined = pd.concat([train_part for train_part, _, _ in organism_splits])
val_combined = pd.concat([val_part for _, val_part, _ in organism_splits])
test_combined = pd.concat([test_part for _, _, test_part in organism_splits])

In [None]:
# Every row of the genome-derived splits gets label 0.
train_combined = train_combined.assign(True_Label=0)
val_combined = val_combined.assign(True_Label=0)
test_combined = test_combined.assign(True_Label=0)

In [None]:
# Keep the three needed columns and rename them to the 'name' / 'seq' schema.
test_combined = test_combined[['organism_name', 'Subsequence', 'True_Label']].rename(
    columns={'organism_name': 'name', 'Subsequence': 'seq'}
)

# Combine the genome rows (label 0) with the positive samples (label 1)

# train

In [None]:
# Load the per-organism TATA training CSVs.
# The targets and the paths are listed in the same order.
(
    celegans_tata_new_train,
    gallus_tata_new_train,
    human_tata_new_train,
    melanogaster_tata_new_train,
    mulatta_tata_new_train,
    musculus_tata_new_train,
    norvegicus_tata_new_train,
    rerio_tata_new_train,
) = map(
    pd.read_csv,
    [
        '/content/train/TATA/with validation/train/celegans_tata_new_train.csv',
        '/content/train/TATA/with validation/train/gallus_tata_new_train.csv',
        '/content/train/TATA/with validation/train/human_tata_new_train.csv',
        '/content/train/TATA/with validation/train/melanogaster_tata_new_train.csv',
        '/content/train/TATA/with validation/train/mulatta_tata_new_train.csv',
        '/content/train/TATA/with validation/train/musculus_tata_new_train.csv',
        '/content/train/TATA/with validation/train/norvegicus_tata_new_train.csv',
        '/content/train/TATA/with validation/train/rerio_tata_new_train.csv',
    ],
)

In [None]:
# Stack the eight per-organism TATA training frames in a fixed organism order.
tata_train_frames = [
    celegans_tata_new_train,
    gallus_tata_new_train,
    human_tata_new_train,
    melanogaster_tata_new_train,
    mulatta_tata_new_train,
    musculus_tata_new_train,
    norvegicus_tata_new_train,
    rerio_tata_new_train,
]
train_positive_combined_tata = pd.concat(tata_train_frames)

In [None]:
# Load the per-organism non-TATA training CSVs.
# The targets and the paths are listed in the same order.
(
    celegans_non_tata_new_train,
    gallus_non_tata_new_train,
    human_non_tata_new_train,
    melanogaster_non_tata_new_train,
    mulatta_non_tata_new_train,
    musculus_non_tata_new_train,
    norvegicus_non_tata_new_train,
    rerio_non_tata_new_train,
) = map(
    pd.read_csv,
    [
        '/content/train/non-TATA/with validation/train/celegans_non_tata_new_train.csv',
        '/content/train/non-TATA/with validation/train/gallus_non_tata_new_train.csv',
        '/content/train/non-TATA/with validation/train/human_non_tata_new_train.csv',
        '/content/train/non-TATA/with validation/train/melanogaster_non_tata_new_train.csv',
        '/content/train/non-TATA/with validation/train/mulatta_non_tata_new_train.csv',
        '/content/train/non-TATA/with validation/train/musculus_non_tata_new_train.csv',
        '/content/train/non-TATA/with validation/train/norvegicus_non_tata_new_train.csv',
        '/content/train/non-TATA/with validation/train/rerio_non_tata_new_train.csv',
    ],
)


In [None]:
# Stack the eight per-organism non-TATA training frames in a fixed organism order.
non_tata_train_frames = [
    celegans_non_tata_new_train,
    gallus_non_tata_new_train,
    human_non_tata_new_train,
    melanogaster_non_tata_new_train,
    mulatta_non_tata_new_train,
    musculus_non_tata_new_train,
    norvegicus_non_tata_new_train,
    rerio_non_tata_new_train,
]
train_positive_combined_non_tata = pd.concat(non_tata_train_frames)

In [None]:
# Both positive training frames (TATA and non-TATA) get label 1.
train_positive_combined_tata = train_positive_combined_tata.assign(True_Label=1)
train_positive_combined_non_tata = train_positive_combined_non_tata.assign(True_Label=1)

In [None]:
# Reduce both positive training frames to the same three columns, in this order.
positive_columns = ['name', 'seq', 'True_Label']
train_positive_combined_tata = train_positive_combined_tata.loc[:, positive_columns]
train_positive_combined_non_tata = train_positive_combined_non_tata.loc[:, positive_columns]

In [None]:
# Keep the three needed columns and rename them to the 'name' / 'seq' schema of the positive frames.
train_combined = train_combined[['organism_name', 'Subsequence', 'True_Label']].rename(
    columns={'organism_name': 'name', 'Subsequence': 'seq'}
)

In [None]:
# Final training frame: TATA positives, then non-TATA positives, then the genome rows.
train_frames = [
    train_positive_combined_tata,
    train_positive_combined_non_tata,
    train_combined,
]
train_df = pd.concat(train_frames)

# val

In [None]:
# Load the per-organism TATA validation CSVs.
# The targets and the paths are listed in the same order.
(
    celegans_tata_new_val,
    gallus_tata_new_val,
    human_tata_new_val,
    melanogaster_tata_new_val,
    mulatta_tata_new_val,
    musculus_tata_new_val,
    norvegicus_tata_new_val,
    rerio_tata_new_val,
) = map(
    pd.read_csv,
    [
        '/content/train/TATA/with validation/validation/celegans_tata_new_val.csv',
        '/content/train/TATA/with validation/validation/gallus_tata_new_val.csv',
        '/content/train/TATA/with validation/validation/human_tata_new_val.csv',
        '/content/train/TATA/with validation/validation/melanogaster_tata_new_val.csv',
        '/content/train/TATA/with validation/validation/mulatta_tata_new_val.csv',
        '/content/train/TATA/with validation/validation/musculus_tata_new_val.csv',
        '/content/train/TATA/with validation/validation/norvegicus_tata_new_val.csv',
        '/content/train/TATA/with validation/validation/rerio_tata_new_val.csv',
    ],
)

In [None]:
# Stack the eight per-organism TATA validation frames in a fixed organism order.
tata_val_frames = [
    celegans_tata_new_val,
    gallus_tata_new_val,
    human_tata_new_val,
    melanogaster_tata_new_val,
    mulatta_tata_new_val,
    musculus_tata_new_val,
    norvegicus_tata_new_val,
    rerio_tata_new_val,
]
val_positive_combined_tata = pd.concat(tata_val_frames)

In [None]:
# Load the per-organism non-TATA validation CSVs.
# The targets and the paths are listed in the same order.
(
    celegans_non_tata_new_val,
    gallus_non_tata_new_val,
    human_non_tata_new_val,
    melanogaster_non_tata_new_val,
    mulatta_non_tata_new_val,
    musculus_non_tata_new_val,
    norvegicus_non_tata_new_val,
    rerio_non_tata_new_val,
) = map(
    pd.read_csv,
    [
        '/content/train/non-TATA/with validation/validation/celegans_non_tata_new_val.csv',
        '/content/train/non-TATA/with validation/validation/gallus_non_tata_new_val.csv',
        '/content/train/non-TATA/with validation/validation/human_non_tata_new_val.csv',
        '/content/train/non-TATA/with validation/validation/melanogaster_non_tata_new_val.csv',
        '/content/train/non-TATA/with validation/validation/mulatta_non_tata_new_val.csv',
        '/content/train/non-TATA/with validation/validation/musculus_non_tata_new_val.csv',
        '/content/train/non-TATA/with validation/validation/norvegicus_non_tata_new_val.csv',
        '/content/train/non-TATA/with validation/validation/rerio_non_tata_new_val.csv',
    ],
)


In [None]:
# Stack the eight per-organism non-TATA validation frames in a fixed organism order.
non_tata_val_frames = [
    celegans_non_tata_new_val,
    gallus_non_tata_new_val,
    human_non_tata_new_val,
    melanogaster_non_tata_new_val,
    mulatta_non_tata_new_val,
    musculus_non_tata_new_val,
    norvegicus_non_tata_new_val,
    rerio_non_tata_new_val,
]
val_positive_combined_non_tata = pd.concat(non_tata_val_frames)

In [None]:
# Both positive validation frames (TATA and non-TATA) get label 1.
val_positive_combined_tata = val_positive_combined_tata.assign(True_Label=1)
val_positive_combined_non_tata = val_positive_combined_non_tata.assign(True_Label=1)

In [None]:
# Reduce both positive validation frames to the same three columns, in this order.
positive_columns = ['name', 'seq', 'True_Label']
val_positive_combined_tata = val_positive_combined_tata.loc[:, positive_columns]
val_positive_combined_non_tata = val_positive_combined_non_tata.loc[:, positive_columns]

In [None]:
# Keep the three needed columns and rename them to the 'name' / 'seq' schema of the positive frames.
val_combined = val_combined[['organism_name', 'Subsequence', 'True_Label']].rename(
    columns={'organism_name': 'name', 'Subsequence': 'seq'}
)

In [None]:
# Final validation frame: TATA positives, then non-TATA positives, then the genome rows.
val_frames = [
    val_positive_combined_tata,
    val_positive_combined_non_tata,
    val_combined,
]
val_df = pd.concat(val_frames)

# test

TATA and non-TATA

In [None]:
def split_organism_data(all_organisms_df, spesific_organism_tata, spesific_organism_non_tata, organism_name, max_excess=10):
    """
    Builds the TATA and non-TATA parts for one organism.

    The rows of `all_organisms_df` whose 'name' equals `organism_name` are cut,
    in their existing order, into two consecutive slices. The size of the first
    slice follows the share of TATA rows among the organism-specific rows
    (len(tata) / (len(tata) + len(non_tata))). Each slice is then concatenated
    with the matching organism-specific frame.

    Parameters:
    all_organisms_df (pd.DataFrame): The DataFrame containing data for all organisms, which must include a 'name' column.
    spesific_organism_tata (pd.DataFrame): DataFrame containing TATA sequences for the specific organism.
        Must include the 'name', 'seq' and 'True_Label' columns; other columns are dropped.
    spesific_organism_non_tata (pd.DataFrame): DataFrame containing non-TATA sequences for the specific organism.
        Must include the 'name', 'seq' and 'True_Label' columns; other columns are dropped.
    organism_name (str): The name of the organism whose data is to be filtered and split.
    max_excess (int): How many more organism-specific rows than filtered rows are tolerated (default 10).

    Returns:
    tuple: A tuple containing two DataFrames:
        - tata_part: First slice of the filtered rows followed by the TATA sequences.
        - non_tata_part: Remaining filtered rows followed by the non-TATA sequences.

    Raises:
    ValueError: If both organism-specific frames are empty (the ratio is undefined), or if
        their combined length exceeds the filtered DataFrame's length by more than `max_excess`.
    """
    # Filter the all_organisms_df by the specified organism name
    filtered_df = all_organisms_df[all_organisms_df['name'] == organism_name]

    wanted_columns = ['name', 'seq', 'True_Label']
    spesific_organism_tata = spesific_organism_tata[wanted_columns]
    spesific_organism_non_tata = spesific_organism_non_tata[wanted_columns]

    # Calculate the lengths of the specific organism data
    len_tata = len(spesific_organism_tata)
    len_non_tata = len(spesific_organism_non_tata)
    total_len = len_tata + len_non_tata
    len_filtered = len(filtered_df)

    # Fail loudly instead of dividing by zero or returning a sentinel string:
    # callers unpack the result into two names, so a returned string only
    # produced a misleading "too many values to unpack" error.
    if total_len == 0:
        raise ValueError(
            f"No TATA or non-TATA rows were given for organism {organism_name!r}; "
            "the split ratio is undefined."
        )
    if total_len > len_filtered + max_excess:
        raise ValueError(
            f"Organism {organism_name!r}: {total_len} TATA/non-TATA rows exceed the "
            f"{len_filtered} filtered rows by more than {max_excess}."
        )

    # Calculate the split ratio and the number of filtered rows given to the TATA part
    ratio_tata = len_tata / total_len
    split_point_tata = int(len_filtered * ratio_tata)

    # Split the filtered dataframe and append the organism-specific rows to each slice
    tata_part = pd.concat([filtered_df.iloc[:split_point_tata], spesific_organism_tata])
    non_tata_part = pd.concat([filtered_df.iloc[split_point_tata:], spesific_organism_non_tata])

    return tata_part, non_tata_part

In [None]:
# celegans: load both test CSVs with label 1, then combine them with this organism's rows of test_combined.
celegans_tata_test = pd.read_csv('/content/test/TATA/celegans_tata_test.csv').assign(True_Label=1)
celegans_non_tata_test = pd.read_csv('/content/test/non-TATA/celegans_non_tata_test.csv').assign(True_Label=1)
tata_part_celegans, non_tata_part_celegans = split_organism_data(
    all_organisms_df=test_combined,
    spesific_organism_tata=celegans_tata_test,
    spesific_organism_non_tata=celegans_non_tata_test,
    organism_name="celegans",
)

In [None]:
# gallus: load both test CSVs with label 1, then combine them with this organism's rows of test_combined.
gallus_tata_test = pd.read_csv('/content/test/TATA/gallus_tata_test.csv').assign(True_Label=1)
gallus_non_tata_test = pd.read_csv('/content/test/non-TATA/gallus_non_tata_test.csv').assign(True_Label=1)
tata_part_gallus, non_tata_part_gallus = split_organism_data(
    all_organisms_df=test_combined,
    spesific_organism_tata=gallus_tata_test,
    spesific_organism_non_tata=gallus_non_tata_test,
    organism_name="gallus",
)

In [None]:
# human: load both test CSVs with label 1, then combine them with this organism's rows of test_combined.
human_tata_test = pd.read_csv('/content/test/TATA/human_tata_test.csv').assign(True_Label=1)
human_non_tata_test = pd.read_csv('/content/test/non-TATA/human_non_tata_test.csv').assign(True_Label=1)
tata_part_human, non_tata_part_human = split_organism_data(
    all_organisms_df=test_combined,
    spesific_organism_tata=human_tata_test,
    spesific_organism_non_tata=human_non_tata_test,
    organism_name="human",
)

In [None]:
# melanogaster: load both test CSVs with label 1, then combine them with this organism's rows of test_combined.
melanogaster_tata_test = pd.read_csv('/content/test/TATA/melanogaster_tata_test.csv').assign(True_Label=1)
melanogaster_non_tata_test = pd.read_csv('/content/test/non-TATA/melanogaster_non_tata_test.csv').assign(True_Label=1)
tata_part_melanogaster, non_tata_part_melanogaster = split_organism_data(
    all_organisms_df=test_combined,
    spesific_organism_tata=melanogaster_tata_test,
    spesific_organism_non_tata=melanogaster_non_tata_test,
    organism_name="melanogaster",
)

In [None]:
# mulatta: load both test CSVs with label 1, then combine them with this organism's rows of test_combined.
mulatta_tata_test = pd.read_csv('/content/test/TATA/mulatta_tata_test.csv').assign(True_Label=1)
mulatta_non_tata_test = pd.read_csv('/content/test/non-TATA/mulatta_non_tata_test.csv').assign(True_Label=1)
tata_part_mulatta, non_tata_part_mulatta = split_organism_data(
    all_organisms_df=test_combined,
    spesific_organism_tata=mulatta_tata_test,
    spesific_organism_non_tata=mulatta_non_tata_test,
    organism_name="mulatta",
)

In [None]:
# musculus: load both test CSVs with label 1, then combine them with this organism's rows of test_combined.
musculus_tata_test = pd.read_csv('/content/test/TATA/musculus_tata_test.csv').assign(True_Label=1)
musculus_non_tata_test = pd.read_csv('/content/test/non-TATA/musculus_non_tata_test.csv').assign(True_Label=1)
tata_part_musculus, non_tata_part_musculus = split_organism_data(
    all_organisms_df=test_combined,
    spesific_organism_tata=musculus_tata_test,
    spesific_organism_non_tata=musculus_non_tata_test,
    organism_name="musculus",
)

In [None]:
# norvegicus: load both test CSVs with label 1, then combine them with this organism's rows of test_combined.
norvegicus_tata_test = pd.read_csv('/content/test/TATA/norvegicus_tata_test.csv').assign(True_Label=1)
norvegicus_non_tata_test = pd.read_csv('/content/test/non-TATA/norvegicus_non_tata_test.csv').assign(True_Label=1)
tata_part_norvegicus, non_tata_part_norvegicus = split_organism_data(
    all_organisms_df=test_combined,
    spesific_organism_tata=norvegicus_tata_test,
    spesific_organism_non_tata=norvegicus_non_tata_test,
    organism_name="norvegicus",
)

In [None]:
# rerio: load both test CSVs with label 1, then combine them with this organism's rows of test_combined.
rerio_tata_test = pd.read_csv('/content/test/TATA/rerio_tata_test.csv').assign(True_Label=1)
rerio_non_tata_test = pd.read_csv('/content/test/non-TATA/rerio_non_tata_test.csv').assign(True_Label=1)
tata_part_rerio, non_tata_part_rerio = split_organism_data(
    all_organisms_df=test_combined,
    spesific_organism_tata=rerio_tata_test,
    spesific_organism_non_tata=rerio_non_tata_test,
    organism_name="rerio",
)

In [None]:
# Stack the per-organism TATA parts in a fixed organism order.
tata_test_parts = [
    tata_part_celegans,
    tata_part_gallus,
    tata_part_human,
    tata_part_melanogaster,
    tata_part_mulatta,
    tata_part_musculus,
    tata_part_norvegicus,
    tata_part_rerio,
]
test_tata = pd.concat(tata_test_parts)

In [None]:
# Stack the per-organism non-TATA parts in a fixed organism order.
non_tata_test_parts = [
    non_tata_part_celegans,
    non_tata_part_gallus,
    non_tata_part_human,
    non_tata_part_melanogaster,
    non_tata_part_mulatta,
    non_tata_part_musculus,
    non_tata_part_norvegicus,
    non_tata_part_rerio,
]
test_non_tata = pd.concat(non_tata_test_parts)

# upload to shared drive

In [None]:
# Folder on the mounted Drive that receives the exported CSVs.
# NOTE(review): the folder name is spelled 'genone' — confirm it matches the real Drive folder before changing it.
base_path = '/content/drive/MyDrive/פרויקט גמר/DATA/train and test/exp2 - genone'

In [None]:
# Save the combined DataFrames to CSV
# Each file name is mapped to its frame; files are written in this order, without the row index.
exp2_outputs = {
    'train_exp2.csv': train_df,
    'val_exp2.csv': val_df,
    'test_tata_exp2.csv': test_tata,
    'test_non_tata_exp2.csv': test_non_tata,
}
for csv_name, frame in exp2_outputs.items():
    frame.to_csv(os.path.join(base_path, csv_name), index=False)