In [None]:
import random
from random import randrange
import numpy as np
import pandas as pd
import os

# alphabet that random sequences are drawn from
nucleotides = list('ATCG')
# number of nucleotides in every generated sequence
seq_length = 128
# total number of sequence pairs in the generated dataset
size = 2500
# largest number of mutations applied to the second sequence of a pair
max_mutations = 1

def mutate_at_index(sequence, index):

  # replaces the nucleotide at position index of sequence with a different, randomly chosen nucleotide
  # and returns the new string (the input string itself is not modified)
  # e.g. mutate_at_index('ACTG', 1) may return 'AGTG', 'AATG', or 'ATTG'
  #
  # sequence: string made up of the characters 'A', 'T', 'C', 'G'
  # index: position to mutate; negative values count from the end, as in normal Python indexing
  # raises IndexError if index is outside the sequence
  # raises ValueError if the character at index is not one of 'A', 'T', 'C', 'G'

  alphabet = ('A', 'T', 'C', 'G')

  # plain indexing validates the position (IndexError when out of range)
  nucleotide_mutated = sequence[index]

  # convert a negative index to its positive equivalent: with index == -1 the slice
  # sequence[index+1:] would be sequence[0:], i.e. the whole sequence instead of an empty tail
  if index < 0:
    index += len(sequence)

  if nucleotide_mutated not in alphabet:
    raise ValueError(f"cannot mutate {nucleotide_mutated!r}: not one of {alphabet}")

  # candidates are every nucleotide except the one currently at this position
  replacement_candidates = [n for n in alphabet if n != nucleotide_mutated]

  return sequence[:index] + random.choice(replacement_candidates) + sequence[index + 1:]


def mutate(sequence, no_of_nucleotides):

  # mutates no_of_nucleotides distinct, randomly chosen positions of sequence and returns the new string
  # e.g. mutate('ACTG', 2) could return 'GGTG', 'AGAG', 'TCTA'... or any other combinations with 2 nucleotides mutated
  #
  # positions are drawn from the length of the sequence that was passed in (not from a global length),
  # and without replacement, so no position is mutated twice and the result differs from the
  # input in exactly no_of_nucleotides positions
  # raises ValueError if no_of_nucleotides is negative or larger than len(sequence)

  mutate_indices = random.sample(range(len(sequence)), no_of_nucleotides)

  for i in mutate_indices:
    sequence = mutate_at_index(sequence, i)

  return sequence


def generate_seq(number_of_seq, no_of_nucleotides):

  # builds an np object array of shape (number_of_seq, 2): column 0 holds a random sequence of
  # length seq_length, column 1 holds the same sequence after mutate(..., no_of_nucleotides)
  # e.g. generate_seq(2, 1) could return [['ATTCGGGAGA' 'ATCCGGGAGA'] ['ACACACAACT' 'ACCCACAACT']]

  pairs = np.zeros(shape=(number_of_seq, 2), dtype=object)

  for row in range(number_of_seq):

    # draw the original sequence first, then derive its mutated partner
    original = ''.join(random.choices(nucleotides, k=seq_length))
    mutated = mutate(original, no_of_nucleotides)

    # fill both columns of this row
    pairs[row, 0] = original
    pairs[row, 1] = mutated

  return pairs


def generate_dataset(max_mutations):

  # makes np array of size sequence pairs, split as evenly as possible between pairs with 1 nucleotide mutated,
  # 2 nucleotides mutated, 3 nucleotides mutated... up to and including max_mutations mutated
  # if size cannot be divided evenly, all groups will have a rounded down number of sequences except
  # the 1-mutation group, which will carry all remainders
  # i.e. if size = 10 and max_mutations = 3, 4 pairs will have 1 nucleotide mutated, 3 pairs will have
  # 2 nucleotides mutated, and 3 pairs will have 3 nucleotides mutated
  #
  # rows are ordered by group: all 1-mutation pairs first, then 2-mutation pairs, and so on
  # raises ValueError if max_mutations is smaller than 1

  if max_mutations < 1:
    raise ValueError("max_mutations must be at least 1")

  # integer floor division keeps the group sizes exact integers
  group_size = size // max_mutations
  first_group_size = size - (max_mutations - 1) * group_size

  # collect every group and join them once, rather than re-copying a growing array in each pass
  groups = [generate_seq(first_group_size, 1)]
  groups.extend(generate_seq(group_size, mutations) for mutations in range(2, max_mutations + 1))

  return np.concatenate(groups, axis=0)


def main():

  # build the array of (original, mutated) sequence pairs
  pairs = generate_dataset(max_mutations)

  # wrap it in a two-column Pandas dataframe and show it
  frame = pd.DataFrame(data=pairs, columns=['seq1', 'seq2'])
  print(frame)

  # mount Google Drive (the import only works inside Colab), switch to the project folder
  # and write the dataframe there as csv
  from google.colab import drive
  drive.mount('/content/gdrive')
  os.chdir('/content/gdrive/My Drive/divergence_hashing')
  frame.to_csv('/content/gdrive/My Drive/divergence_hashing/2500.csv')


# run main() only when this code is executed directly, not when it is imported as a module
if __name__ == "__main__":
    main()

                                                   seq1                                               seq2
0     TGCAGCTCAATTGCCTGAGGGAGAAAAGGAGCATCGATCGTAAGCC...  TGCAGCTCAATTGCCTGAGGGAGGAAAGGAGCATCGATCGTAAGCC...
1     GATATAGGAAATCGCCCAATCCGCGAATCCATCTATAACGCACCAG...  GATATAGGAAATCGCCCAATCCGCGAATCTATCTATAACGCACCAG...
2     ACTTGAAGTTTGAAGTCAATGTTCAAACCACGGAAAGGAAAGGGAC...  ACTTGAAGTTTGAAGTCAATGTTCAAACCACGGAAAGGAATGGGAC...
3     AGCGGCCGTACGACGAGAACAGTACTGACCCGCTTAGGCGGGTCCT...  AGCGGCCGTACGACGAGAACAGTACTGACCCGCTTAGGCGGGTCCT...
4     TTTTCGCGTCTAAGTTGTCGGGGCGGATACGCGAGAGGGGCGTCGG...  TTTTCGCGTCTAAGTTGTCGGGGCGGATACGCGAGAGGGGCGTCGG...
...                                                 ...                                                ...
2495  GGTCGTTGAGTGACAGTGAAAGGCACGTGGCTTGGTTTCGATCCGA...  GGTCGTTGAGTGACAGTGAAAGGCACGTGGCTTGGTTTCGATCCGA...
2496  ATGCGTGTGCTCCCCGGACAGAACTATATCCAGTCCAGATACAAGC...  ATGCGTGGGCTCCCCGGACAGAACTATATCCAGTCCAGATACAAGC...
2497  CACAGACGCAACGTAGTCGTTGCCCGTTACT