In [None]:
import re
import pandas as pd
from google.colab import drive
import copy
from collections import Counter

In [None]:
# Mount Google Drive at /content/drive; every input file below is read from under that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# distance.csv is read without a header row, so its columns are addressed by position later on.
distances = pd.read_csv('/content/drive/MyDrive/פרויקט גמר/DATA/train and test/distance.csv', header=None)
# Only the path is stored here; the cluster file itself is parsed by parse_cluster_file().
clusters_info = '/content/drive/MyDrive/פרויקט גמר/DATA/train and test/est_all_data.clstr'

#### extracting the clusters

In [None]:
def parse_cluster_file(file_path):
    """
    Read a cluster file and group its member records by cluster header.

    A line starting with ">Cluster" opens a new cluster; its key is the line
    without the leading ">". Any other line that contains ", >" is treated as a
    member whose payload looks like "species|chromosome/identifier[/...]".
    Member lines that appear before the first cluster header are ignored.

    Parameters:
    file_path (str): The path to the cluster file.

    Returns:
    dict: Maps each cluster key to a list of [species, chromosome, identifier]
          lists, in file order. A cluster without members maps to an empty list.

    Raises:
    IndexError: If a member payload lacks the "|" or "/" separator.
    """
    member_pattern = re.compile(r', >(.*)')
    clusters = {}
    active_key = None

    with open(file_path, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()

            if text.startswith(">Cluster"):
                active_key = text[1:]
                clusters[active_key] = []
                continue

            found = member_pattern.search(text)
            if not found or active_key is None:
                continue

            # "species|chromosome/identifier": split on "|" first, then on "/".
            fields = found.group(1).split('|')
            location = fields[1].split('/')
            clusters[active_key].append([fields[0], location[0], location[1]])

    return clusters

In [None]:
# Maps each cluster key to its list of [species, chromosome, identifier] members.
clusters = parse_cluster_file(clusters_info)

#### Putting the clusters with more than one member in train

In [None]:
# Clusters with more than one member must stay together, so they go to train;
# every other cluster is kept aside (still wrapped in its own list) in all_other.
train_initial = [members for members in clusters.values() if len(members) > 1]
all_other = [members for members in clusters.values() if len(members) <= 1]

# Flatten the multi-member clusters so 'train' is a flat list of
# [species, chromosome, identifier] records.
train = [record for members in train_initial for record in members]

#### Extracting from distances every row that contains a record already in train, then adding that row's records to train as well

In [None]:
# Convert the distances DataFrame into a plain list of row lists.
rows_list = distances.values.tolist()

In [None]:
def extract_ids_from_rows(rows_list):
    """
    Reduce each raw row to its first two fields plus the FP identifiers it holds.

    The first two elements of a row are copied as they are. Every later element
    that is a string is searched for the pattern '>FP<digits>' (including the
    surrounding single quotes); the first match in that element contributes the
    "FP<digits>" part. Non-string elements and strings without a match are dropped.

    Parameters:
    rows_list (list of lists): Rows with at least two elements each.

    Returns:
    list of lists: One list per input row: [row[0], row[1], id_1, id_2, ...].
    """
    id_pattern = re.compile(r"'>(FP\d+)'")

    def _ids_in(cells):
        # Yield the captured identifier of every string cell that matches.
        for cell in cells:
            if not isinstance(cell, str):
                continue
            hit = id_pattern.search(cell)
            if hit:
                yield hit.group(1)

    return [[row[0], row[1], *_ids_in(row[2:])] for row in rows_list]

# Each cleaned row is [first field, second field, FP id, FP id, ...].
cleaned_rows = extract_ids_from_rows(rows_list)

Building a train dictionary (organism → list of IDs) to make lookups easier

In [None]:
# Index the train records by organism: {organism: [identifier, ...]}.
# Each record is [organism, chromosome, identifier]; duplicates are kept here.
train_dict = {}
for record in train:
  train_dict.setdefault(record[0], []).append(record[2])

Now checking each row of the distances table to decide whether its records must go to train

In [None]:
# Snapshot of the IDs that are in train before this cell runs, as sets so each
# membership test is O(1). The snapshot is deliberately not updated while
# looping, so a row is only pulled into train by IDs that came from the clusters.
train_ids_by_org = {organism: set(id_list) for organism, id_list in train_dict.items()}
added_to_train = set()  # (organism, id) pairs appended by this cell, to avoid duplicates

for row in cleaned_rows:
  org = row[0]
  chromo = row[1]
  ids = row[2:]
  # An organism with no multi-member cluster has no entry in train_dict;
  # treat it as "nothing in train yet" instead of raising KeyError.
  known = train_ids_by_org.get(org, set())

  if any(i in known for i in ids):
    # At least one ID of this row is already in train, so the whole row must
    # go to train. Each missing ID is appended once, and none of them is put
    # in all_other (that list is meant for records that are still free to be
    # split between train and test).
    for i in ids:
      if i not in known and (org, i) not in added_to_train:
        added_to_train.add((org, i))
        train.append([org, chromo, i])
  else:
    # No ID of this row is tied to train: all of them stay unassigned.
    for i in ids:
      all_other.append([org, chromo, i])

make the dict again

In [None]:
# Rebuild the organism -> [identifier, ...] index from scratch, now that
# 'train' also holds the records pulled in from the distances rows.
train_dict = {}
for entry in train:
  organism_name, identifier = entry[0], entry[2]
  train_dict.setdefault(organism_name, []).append(identifier)

In [None]:
# all_other holds two shapes of entry: single-member clusters, stored as a list
# wrapping one record ([[species, chromosome, id]]), and records added from the
# distances rows, stored flat ([org, chromosome, id]). Unwrap only the wrapped
# ones so that every entry ends up as a flat [species, chromosome, id] record.
# (A blanket i[0] would turn the flat records into bare organism strings, and
# would corrupt the list further every time the cell is re-run.)
# Empty entries (clusters without members) are dropped.
all_other = [
    entry[0] if isinstance(entry[0], list) else entry
    for entry in all_other
    if entry
]

now we have:

* train : a list of all records needs to be on train
* all_other : a list of all records to be split to either train or test

## Creating TATA and non-TATA train and test sets

In [None]:
# Positive data tables. The cells below read their 'organism', 'ID' and 'seq' columns.
tata_df = pd.read_csv('/content/drive/MyDrive/פרויקט גמר/DATA/combined positive data/TATA_combined_df.csv')
non_tata_df = pd.read_csv('/content/drive/MyDrive/פרויקט גמר/DATA/combined positive data/non_TATA_combined_df.csv')

we need to remove 'mellifera'

In [None]:
# Drop the 'mellifera' organism from the train index. pop() with a default
# keeps the cell safe to re-run and does not fail if the key is absent.
train_dict.pop('mellifera', None)

creating 2 lists - to see which items in train are tata and which are non-tata

In [None]:
# Remove duplicate IDs per organism while keeping first-seen order.
# dict.fromkeys preserves insertion order, whereas list(set(...)) orders
# strings differently in every kernel session; that order later decides which
# records land in train vs validation, so it has to be reproducible.
cleaned_dict = {key: list(dict.fromkeys(value)) for key, value in train_dict.items()}
train_dict = cleaned_dict

this cell takes 15 min

In [None]:
# Build the (organism, ID) keys of each table once, so every train record is
# classified with a set lookup instead of a full DataFrame scan per ID.
tata_keys = set(zip(tata_df['organism'], tata_df['ID']))
non_tata_keys = set(zip(non_tata_df['organism'], non_tata_df['ID']))

tata_train = []
non_tata_train = []

for organism, id_list in train_dict.items():
    for record_id in id_list:
        key = (organism, record_id)
        # TATA takes precedence: a record present in both tables counts as TATA.
        if key in tata_keys:
            tata_train.append(key)
        elif key in non_tata_keys:
            non_tata_train.append(key)
        # Records found in neither table are silently left out.

In [None]:
# De-duplicate the (organism, ID) pairs while keeping first-seen order;
# list(set(...)) would reorder them differently in every kernel session.
tata_train = list(dict.fromkeys(tata_train))
non_tata_train = list(dict.fromkeys(non_tata_train))

TATA

In [None]:
# Group the TATA train pairs by organism: {organism: [ID, ...]}.
grouped_dict_tata = {}
for organism_name, record_id in tata_train:
    grouped_dict_tata.setdefault(organism_name, []).append(record_id)

In [None]:
# Group the non-TATA train pairs by organism: {organism: [ID, ...]}.
grouped_dict_non_tata = {}
for organism_name, record_id in non_tata_train:
    grouped_dict_non_tata.setdefault(organism_name, []).append(record_id)

In [None]:
# Will map organism -> list of TATA IDs assigned to the test set; filled by the split loop below.
# NOTE(review): because this lives in its own cell, re-running only the split loop appends to the old contents.
test_dict_tata = {}

In [None]:
# Report how many IDs are already forced into train, per organism and per class.
print("CURRENT TRAINING DATA:")

sections = (
    ("----------------TATA----------------", grouped_dict_tata),
    ("----------------NON-TATA----------------", grouped_dict_non_tata),
)
for header, grouped in sections:
  print()
  print(header)
  for organism_name in grouped:
    print("in", organism_name, "we have", len(grouped[organism_name]), "items")

CURRENT TRAINING DATA:

----------------TATA----------------
in human we have 2045 items
in norvegicus we have 1544 items
in melanogaster we have 286 items
in musculus we have 2408 items
in mulatta we have 614 items
in celegans we have 182 items
in rerio we have 136 items
in gallus we have 42 items

----------------NON-TATA----------------
in celegans we have 522 items
in musculus we have 7974 items
in mulatta we have 5486 items
in human we have 9869 items
in melanogaster we have 887 items
in norvegicus we have 5943 items
in gallus we have 242 items
in rerio we have 139 items


In [None]:
# For every organism, report the total number of rows in the source table and
# what 80% of that total is (rounded down).
print("INFO ABOUT REAL AMOUNT IN EACH ORGANISM")

sections = (
    ("----------------TATA----------------", grouped_dict_tata, tata_df),
    ("----------------NON-TATA----------------", grouped_dict_non_tata, non_tata_df),
)
for position, (header, grouped, source_df) in enumerate(sections):
  if position:
    print()  # blank line between the two sections
  print(header)
  print()
  for name in grouped:
    total = len(source_df[source_df['organism'] == name])
    print("in", name, "we have total of", total, "items. and 80% will be", int(0.8*total))

INFO ABOUT REAL AMOUNT IN EACH ORGANISM
----------------TATA----------------

in human we have total of 3065 items. and 80% will be 2452
in norvegicus we have total of 1707 items. and 80% will be 1365
in melanogaster we have total of 2598 items. and 80% will be 2078
in musculus we have total of 3305 items. and 80% will be 2644
in mulatta we have total of 631 items. and 80% will be 504
in celegans we have total of 1013 items. and 80% will be 810
in rerio we have total of 2131 items. and 80% will be 1704
in gallus we have total of 674 items. and 80% will be 539

----------------NON-TATA----------------

in celegans we have total of 6107 items. and 80% will be 4885
in musculus we have total of 21805 items. and 80% will be 17444
in mulatta we have total of 7701 items. and 80% will be 6160
in human we have total of 26533 items. and 80% will be 21226
in melanogaster we have total of 14372 items. and 80% will be 11497
in norvegicus we have total of 10894 items. and 80% will be 8715
in gallus 

In [None]:
# Work on a deep copy so that topping up train below leaves grouped_dict_tata untouched.
grouped_dict_tata_copy = copy.deepcopy(grouped_dict_tata)

In [None]:
# Top up each organism's TATA train list to roughly 80% of its rows in tata_df;
# every row that does not fit goes to the test dictionary.
# The test dictionary is rebuilt here so that re-running this cell does not
# append the same IDs to it a second time.
test_dict_tata = {}

for name, train_ids in grouped_dict_tata_copy.items():
    # IDs of all rows of this organism, in table order.
    organism_ids = tata_df[tata_df['organism'] == name]['ID']
    threshold = 0.8 * len(organism_ids)  # Number for the training amount
    curr = len(train_ids)
    in_train = set(train_ids)  # O(1) membership instead of scanning the list per row

    for record_id in organism_ids:
        if record_id in in_train:
            continue
        # NOTE(review): because of the "- 1", train stops one short of 80% when
        # 0.8 * len is a whole number (3065 rows -> 2451, not 2452); confirm intended.
        if curr < threshold - 1:  # Then we can add it to the train
            train_ids.append(record_id)
            in_train.add(record_id)
            curr += 1
        else:  # We already have 80% in the train - then add to the test set
            test_dict_tata.setdefault(name, []).append(record_id)

In [None]:
# Write one CSV per organism with the TATA test records (name, id, seq).
for k, v in test_dict_tata.items():
  # Build an ID -> sequence lookup for this organism once, instead of filtering
  # the whole DataFrame for every ID. setdefault keeps the first sequence seen
  # for an ID, which is the row the former `.iloc[0]` selected.
  organism_rows = tata_df[tata_df['organism'] == k]
  seq_by_id = {}
  for record_id, seq in zip(organism_rows['ID'], organism_rows['seq']):
    seq_by_id.setdefault(record_id, seq)

  # An ID missing from tata_df raises KeyError here (it was IndexError before).
  data = [{'name': k, 'id': record_id, 'seq': seq_by_id[record_id]} for record_id in v]
  df = pd.DataFrame(data)
  df.to_csv(f'{k}_tata_test.csv')

In [None]:
# Write one CSV per organism with the TATA train records (name, id, seq).
for k, v in grouped_dict_tata_copy.items():
  # Build an ID -> sequence lookup for this organism once, instead of filtering
  # the whole DataFrame for every ID. setdefault keeps the first sequence seen
  # for an ID, which is the row the former `.iloc[0]` selected.
  organism_rows = tata_df[tata_df['organism'] == k]
  seq_by_id = {}
  for record_id, seq in zip(organism_rows['ID'], organism_rows['seq']):
    seq_by_id.setdefault(record_id, seq)

  # An ID missing from tata_df raises KeyError here (it was IndexError before).
  data = [{'name': k, 'id': record_id, 'seq': seq_by_id[record_id]} for record_id in v]
  df = pd.DataFrame(data)
  df.to_csv(f'{k}_tata_train.csv')

In [None]:
# Size of the final TATA train list of every organism.
print("CURRENT TRAINING INFO")
print("----------------TATA----------------")
print()
for organism_name, id_list in grouped_dict_tata_copy.items():
  print("in", organism_name, "we have total of", len(id_list))

CURRENT TRAINING INFO
----------------TATA----------------

in human we have total of 2451
in norvegicus we have total of 1544
in melanogaster we have total of 2078
in musculus we have total of 2643
in mulatta we have total of 614
in celegans we have total of 810
in rerio we have total of 1704
in gallus we have total of 539


In [None]:
# Size of the TATA test list of every organism.
print("INFO ABOUT REAL AMOUNT IN EACH ORGANISM")
print("----------------TATA TEST----------------")
print()
for organism_name, id_list in test_dict_tata.items():
  print("in", organism_name, "we have total of", len(id_list))

INFO ABOUT REAL AMOUNT IN EACH ORGANISM
----------------TATA TEST----------------

in human we have total of 614
in norvegicus we have total of 163
in melanogaster we have total of 520
in musculus we have total of 662
in mulatta we have total of 17
in celegans we have total of 203
in rerio we have total of 427
in gallus we have total of 135


non-TATA

In [None]:
# Regroup the non-TATA train pairs by organism: {organism: [ID, ...]}.
grouped_dict_non_tata = {}
for organism_name, record_id in non_tata_train:
    grouped_dict_non_tata.setdefault(organism_name, []).append(record_id)

In [None]:
# Will map organism -> list of non-TATA IDs assigned to the test set; filled by the split loop below.
# NOTE(review): because this lives in its own cell, re-running only the split loop appends to the old contents.
test_dict_non_tata = {}

In [None]:
# Work on a deep copy so that topping up train below leaves grouped_dict_non_tata untouched.
grouped_dict_non_tata_copy = copy.deepcopy(grouped_dict_non_tata)

In [None]:
# Top up each organism's non-TATA train list to roughly 80% of its rows in
# non_tata_df; every row that does not fit goes to the test dictionary.
# The test dictionary is rebuilt here so that re-running this cell does not
# append the same IDs to it a second time.
test_dict_non_tata = {}

for name, train_ids in grouped_dict_non_tata_copy.items():
    # IDs of all rows of this organism, in table order.
    organism_ids = non_tata_df[non_tata_df['organism'] == name]['ID']
    threshold = 0.8 * len(organism_ids)  # Number for the training amount
    curr = len(train_ids)
    in_train = set(train_ids)  # O(1) membership instead of scanning the list per row

    for record_id in organism_ids:
        if record_id in in_train:
            continue
        # NOTE(review): because of the "- 1", train stops one short of 80% when
        # 0.8 * len is a whole number (21805 rows -> 17443, not 17444); confirm intended.
        if curr < threshold - 1:  # Then we can add it to the train
            train_ids.append(record_id)
            in_train.add(record_id)
            curr += 1
        else:  # We already have 80% in the train - then add to the test set
            test_dict_non_tata.setdefault(name, []).append(record_id)

this cell takes 34 min

In [None]:
# Write one CSV per organism with the non-TATA train records (name, id, seq).
for k, v in grouped_dict_non_tata_copy.items():
  # Build an ID -> sequence lookup for this organism once, instead of filtering
  # the whole DataFrame for every ID. setdefault keeps the first sequence seen
  # for an ID, which is the row the former `.iloc[0]` selected.
  organism_rows = non_tata_df[non_tata_df['organism'] == k]
  seq_by_id = {}
  for record_id, seq in zip(organism_rows['ID'], organism_rows['seq']):
    seq_by_id.setdefault(record_id, seq)

  # An ID missing from non_tata_df raises KeyError here (it was IndexError before).
  data = [{'name': k, 'id': record_id, 'seq': seq_by_id[record_id]} for record_id in v]
  df = pd.DataFrame(data)
  df.to_csv(f'{k}_non_tata_train.csv')

this cell takes 7 min

In [None]:
# Write one CSV per organism with the non-TATA test records (name, id, seq).
for k, v in test_dict_non_tata.items():
  # Build an ID -> sequence lookup for this organism once, instead of filtering
  # the whole DataFrame for every ID. setdefault keeps the first sequence seen
  # for an ID, which is the row the former `.iloc[0]` selected.
  organism_rows = non_tata_df[non_tata_df['organism'] == k]
  seq_by_id = {}
  for record_id, seq in zip(organism_rows['ID'], organism_rows['seq']):
    seq_by_id.setdefault(record_id, seq)

  # An ID missing from non_tata_df raises KeyError here (it was IndexError before).
  data = [{'name': k, 'id': record_id, 'seq': seq_by_id[record_id]} for record_id in v]
  df = pd.DataFrame(data)
  df.to_csv(f'{k}_non_tata_test.csv')

In [None]:
# Size of the final non-TATA train list of every organism.
print("CURRENT TRAINING INFO")
print("----------------non-TATA----------------")
print()
for organism_name, id_list in grouped_dict_non_tata_copy.items():
  print("in", organism_name, "we have total of", len(id_list))

CURRENT TRAINING INFO
----------------non-TATA----------------

in celegans we have total of 4885
in musculus we have total of 17443
in mulatta we have total of 6160
in human we have total of 21226
in melanogaster we have total of 11497
in norvegicus we have total of 8715
in gallus we have total of 4361
in rerio we have total of 6877


In [None]:
# Size of the non-TATA test list of every organism.
print("INFO ABOUT REAL AMOUNT IN EACH ORGANISM")
print("----------------nn-TATA TEST----------------")
print()
for organism_name, id_list in test_dict_non_tata.items():
  print("in", organism_name, "we have total of", len(id_list))

INFO ABOUT REAL AMOUNT IN EACH ORGANISM
----------------nn-TATA TEST----------------

in celegans we have total of 1222
in musculus we have total of 4362
in mulatta we have total of 1541
in human we have total of 5307
in melanogaster we have total of 2875
in norvegicus we have total of 2179
in gallus we have total of 1091
in rerio we have total of 1720


#### Validation

In [None]:
# Reload the per-organism train CSVs written earlier, to carve a validation set out of them.
# NOTE(review): the TATA files are read from /content/train_tata/ while the cells
# above write them to the working directory - presumably they were moved by
# hand; confirm, or point these paths at the files as written.
celegans_train_tata = pd.read_csv('/content/train_tata/celegans_tata_train.csv')
gullus_train_tata = pd.read_csv('/content/train_tata/gallus_tata_train.csv')
human_train_tata = pd.read_csv('/content/train_tata/human_tata_train.csv')
melanogaster_train_tata = pd.read_csv('/content/train_tata/melanogaster_tata_train.csv')
mulata_train_tata = pd.read_csv('/content/train_tata/mulatta_tata_train.csv')
musculus_train_tata = pd.read_csv('/content/train_tata/musculus_tata_train.csv')
norvegicus_train_tata = pd.read_csv('/content/train_tata/norvegicus_tata_train.csv')
rerio_train_tata = pd.read_csv('/content/train_tata/rerio_tata_train.csv')


celegans_train_non_tata = pd.read_csv('/content/celegans_non_tata_train.csv')
gullus_train_non_tata = pd.read_csv('/content/gallus_non_tata_train.csv')
human_train_non_tata = pd.read_csv('/content/human_non_tata_train.csv')
melanogaster_train_non_tata = pd.read_csv('/content/melanogaster_non_tata_train.csv')
mulata_train_non_tata = pd.read_csv('/content/mulatta_non_tata_train.csv')
musculus_train_non_tata = pd.read_csv('/content/musculus_non_tata_train.csv')
# Fixed: this used to load the musculus file, which left norvegicus out of the
# combined non-TATA frame and counted every musculus row twice.
norvegicus_train_non_tata = pd.read_csv('/content/norvegicus_non_tata_train.csv')
rerio_train_non_tata = pd.read_csv('/content/rerio_non_tata_train.csv')

In [None]:
# Stack the per-organism TATA train frames into one frame (original row indices are kept).
combined_train_tata = pd.concat([
    celegans_train_tata,
    gullus_train_tata,
    human_train_tata,
    melanogaster_train_tata,
    mulata_train_tata,
    musculus_train_tata,
    norvegicus_train_tata,
    rerio_train_tata,
])

In [None]:
# Stack the per-organism non-TATA train frames into one frame (original row indices are kept).
combined_train_non_tata = pd.concat([
    celegans_train_non_tata,
    gullus_train_non_tata,
    human_train_non_tata,
    melanogaster_train_non_tata,
    mulata_train_non_tata,
    musculus_train_non_tata,
    norvegicus_train_non_tata,
    rerio_train_non_tata,
])

In [None]:
# make new dict from distances
new_dict = {}
for row in cleaned_rows:
  org = row[0]
  chromo = row[1]
  ids = row[2:]
  if org not in new_dict.keys():
    new_dict[org] = [ids]
  else:
    new_dict[org].append(ids)

TATA

In [None]:
# Assign whole distance groups to the new TATA train or validation set, so that
# IDs sharing a distance row are never split between the two.
new_train_tata, new_val_tata = {}, {}

# The loop variable is not called `distances`, so the distances DataFrame
# loaded at the top of the notebook is not overwritten.
for name, id_groups in new_dict.items():
    new_train_tata[name] = []
    new_val_tata[name] = []

    # Computed once per organism instead of once per ID.
    organism_ids = combined_train_tata[combined_train_tata['name'] == name]['id']
    known_ids = set(organism_ids)
    train_quota = int(0.8 * len(organism_ids))

    for group in id_groups:
        # Only groups whose IDs are ALL in the current TATA train set are handled
        # here. Any other group is skipped (its train members are assigned one by
        # one in the next cell). This used to `break`, which silently dropped
        # every remaining group of the organism after the first failing group.
        if not group or any(i not in known_ids for i in group):
            continue

        # The whole group goes to one side: train until the 80% quota is reached
        # (checked before adding, so the last group may overshoot), then validation.
        # NOTE(review): an ID that appears in several groups is appended once per
        # group and could end up on both sides - confirm groups are disjoint.
        if len(new_train_tata[name]) < train_quota:
            new_train_tata[name].extend(group)
        else:
            new_val_tata[name].extend(group)

In [None]:
# Assign every TATA train record that was not placed as part of a distance
# group: fill train up to 80% of the organism's records, the rest goes to validation.
for name in set(combined_train_tata['name']):
  organism_ids = combined_train_tata[combined_train_tata['name'] == name]['id']
  train_quota = 0.8 * len(organism_ids)  # computed once, not once per row

  # setdefault: an organism with no distance rows has no entry yet, which used
  # to raise KeyError.
  train_ids = new_train_tata.setdefault(name, [])
  val_ids = new_val_tata.setdefault(name, [])
  assigned = set(train_ids) | set(val_ids)  # O(1) "already placed?" checks

  for record_id in organism_ids:
    if record_id in assigned:
      continue
    assigned.add(record_id)
    if len(train_ids) < train_quota:
      train_ids.append(record_id)
    else:
      val_ids.append(record_id)

non - TATA

In [None]:
# Assign whole distance groups to the new non-TATA train or validation set, so
# that IDs sharing a distance row are never split between the two.
new_train_non_tata, new_val_non_tata = {}, {}

# The loop variable is not called `distances`, so the distances DataFrame
# loaded at the top of the notebook is not overwritten.
for name, id_groups in new_dict.items():
    new_train_non_tata[name] = []
    new_val_non_tata[name] = []

    # Computed once per organism instead of once per ID.
    organism_ids = combined_train_non_tata[combined_train_non_tata['name'] == name]['id']
    known_ids = set(organism_ids)
    train_quota = int(0.8 * len(organism_ids))

    for group in id_groups:
        # Only groups whose IDs are ALL in the current non-TATA train set are
        # handled here. Any other group is skipped (its train members are assigned
        # one by one in the next cell). This used to `break`, which silently dropped
        # every remaining group of the organism after the first failing group.
        if not group or any(i not in known_ids for i in group):
            continue

        # The whole group goes to one side: train until the 80% quota is reached
        # (checked before adding, so the last group may overshoot), then validation.
        # NOTE(review): an ID that appears in several groups is appended once per
        # group and could end up on both sides - confirm groups are disjoint.
        if len(new_train_non_tata[name]) < train_quota:
            new_train_non_tata[name].extend(group)
        else:
            new_val_non_tata[name].extend(group)

In [None]:
# Assign every non-TATA train record that was not placed as part of a distance
# group: fill train up to 80% of the organism's records, the rest goes to validation.
for name in set(combined_train_non_tata['name']):
  organism_ids = combined_train_non_tata[combined_train_non_tata['name'] == name]['id']
  train_quota = 0.8 * len(organism_ids)  # computed once, not once per row

  # setdefault: an organism with no distance rows has no entry yet, which used
  # to raise KeyError.
  train_ids = new_train_non_tata.setdefault(name, [])
  val_ids = new_val_non_tata.setdefault(name, [])
  assigned = set(train_ids) | set(val_ids)  # O(1) "already placed?" checks

  for record_id in organism_ids:
    if record_id in assigned:
      continue
    assigned.add(record_id)
    if len(train_ids) < train_quota:
      train_ids.append(record_id)
    else:
      val_ids.append(record_id)

save to csv

In [None]:
# Write one CSV per organism with the new TATA train records (name, id, seq).
for k, v in new_train_tata.items():
  # Build an ID -> sequence lookup for this organism once, instead of filtering
  # the whole DataFrame for every ID. setdefault keeps the first sequence seen
  # for an ID, which is the row the former `.iloc[0]` selected.
  organism_rows = tata_df[tata_df['organism'] == k]
  seq_by_id = {}
  for record_id, seq in zip(organism_rows['ID'], organism_rows['seq']):
    seq_by_id.setdefault(record_id, seq)

  # An ID missing from tata_df raises KeyError here (it was IndexError before).
  data = [{'name': k, 'id': record_id, 'seq': seq_by_id[record_id]} for record_id in v]
  df = pd.DataFrame(data)
  df.to_csv(f'{k}_tata_new_train.csv')

In [None]:
# Write one CSV per organism with the new TATA validation records (name, id, seq).
for k, v in new_val_tata.items():
  # Build an ID -> sequence lookup for this organism once, instead of filtering
  # the whole DataFrame for every ID. setdefault keeps the first sequence seen
  # for an ID, which is the row the former `.iloc[0]` selected.
  organism_rows = tata_df[tata_df['organism'] == k]
  seq_by_id = {}
  for record_id, seq in zip(organism_rows['ID'], organism_rows['seq']):
    seq_by_id.setdefault(record_id, seq)

  # An ID missing from tata_df raises KeyError here (it was IndexError before).
  data = [{'name': k, 'id': record_id, 'seq': seq_by_id[record_id]} for record_id in v]
  df = pd.DataFrame(data)
  df.to_csv(f'{k}_tata_new_val.csv')

this cell takes 23 min

In [None]:
# Write one CSV per organism with the new non-TATA train records (name, id, seq).
for k, v in new_train_non_tata.items():
  # Build an ID -> sequence lookup for this organism once, instead of filtering
  # the whole DataFrame for every ID. setdefault keeps the first sequence seen
  # for an ID, which is the row the former `.iloc[0]` selected.
  organism_rows = non_tata_df[non_tata_df['organism'] == k]
  seq_by_id = {}
  for record_id, seq in zip(organism_rows['ID'], organism_rows['seq']):
    seq_by_id.setdefault(record_id, seq)

  # An ID missing from non_tata_df raises KeyError here (it was IndexError before).
  data = [{'name': k, 'id': record_id, 'seq': seq_by_id[record_id]} for record_id in v]
  df = pd.DataFrame(data)
  df.to_csv(f'{k}_non_tata_new_train.csv')

In [None]:
# Write one CSV per organism with the new non-TATA validation records (name, id, seq).
for k, v in new_val_non_tata.items():
  # Build an ID -> sequence lookup for this organism once, instead of filtering
  # the whole DataFrame for every ID. setdefault keeps the first sequence seen
  # for an ID, which is the row the former `.iloc[0]` selected.
  organism_rows = non_tata_df[non_tata_df['organism'] == k]
  seq_by_id = {}
  for record_id, seq in zip(organism_rows['ID'], organism_rows['seq']):
    seq_by_id.setdefault(record_id, seq)

  # An ID missing from non_tata_df raises KeyError here (it was IndexError before).
  data = [{'name': k, 'id': record_id, 'seq': seq_by_id[record_id]} for record_id in v]
  df = pd.DataFrame(data)
  df.to_csv(f'{k}_non_tata_new_val.csv')

#### Genome

In [None]:
from sklearn.model_selection import train_test_split

genome_neg_merged = pd.read_csv('/content/drive/MyDrive/פרויקט גמר/DATA/negative genome files/merged_negative_genome.csv')

# First hold out 20% as the test set, then hold out 20% of the remainder as the
# validation set (64% / 16% / 20% overall). Both splits use a fixed seed.
train_val_df, test_df = train_test_split(genome_neg_merged, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.2, random_state=42)

In [None]:
# Persist the three genome splits as CSV files in the working directory.
for frame, filename in (
    (train_df, 'train_df_genome.csv'),
    (test_df, 'test_df_genome.csv'),
    (val_df, 'val_df_genome.csv'),
):
  frame.to_csv(filename)