### Create Directory Tree

In [None]:
import os

def create_folder_if_not_exists(folder):
    if not os.path.exists(folder):
        os.makedirs(folder)

create_folder_if_not_exists('raw')
create_folder_if_not_exists('processed')
create_folder_if_not_exists('raw/chr-fa-data')
create_folder_if_not_exists('processed/chr-fa-data')

### Download FASTA Files and Annotations 

In [None]:
import requests
import gzip

def download_chromosome(chromosome):
    url = f"http://hgdownload.soe.ucsc.edu/goldenPath/hg38/chromosomes/chr{chromosome}.fa.gz"
    response = requests.get(url)
    if response.status_code == 200:
        decompressed_content = gzip.decompress(response.content)
        with open(f"raw/chr-fa-data/chr{chromosome}.fa", "wb") as file:
            file.write(decompressed_content)
        print("Download and unzip complete")
    else:
        print("Failed to download chromosome", chromosome)



for i in range(1, 23):
    print("Downloading chromosome", i)
    download_chromosome(i)

In [None]:
import requests
import gzip

def download_annotations():
    url = "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_46/gencode.v46.annotation.gtf.gz"
    response = requests.get(url)
    if response.status_code == 200:
        # Decompress the content
        decompressed_content = gzip.decompress(response.content)
        # Write the decompressed content to a file
        with open("raw/gencode.v46.annotation.gtf", "wb") as file:
            file.write(decompressed_content)
        print("Download and unzip complete")
    else:
        print("Failed to download annotations")

download_annotations()

### Process FASTA file to leave only raw dna data

In [1]:
def process_chr_fa_data(chr_id):
    # Read the content of the file
    with open(f"raw/chr-fa-data/chr{chr_id}.fa", "r") as file:
        content = file.read()
    # Remove lines starting with ">"
    sequence = ''.join([line for line in content.split('\n') if not line.startswith('>')])
    sequence = sequence.replace('\n', '')
    # Annotations are one indexed so add a space at the beginning and end of the sequence
    sequence = ' '+sequence+' '
    with open(f"processed/chr-fa-data/chr{chr_id}.txt", "w") as file:
        file.write(sequence)

for i in range(1, 23):
    print("Processing chromosome", i)
    process_chr_fa_data(i)

Processing chromosome 1
Processing chromosome 2
Processing chromosome 3
Processing chromosome 4
Processing chromosome 5
Processing chromosome 6
Processing chromosome 7
Processing chromosome 8
Processing chromosome 9
Processing chromosome 10
Processing chromosome 11
Processing chromosome 12
Processing chromosome 13
Processing chromosome 14
Processing chromosome 15
Processing chromosome 16
Processing chromosome 17
Processing chromosome 18
Processing chromosome 19
Processing chromosome 20
Processing chromosome 21
Processing chromosome 22


### Read the annotation file and create a separate annotation csv file for each chromosome

In [2]:
import pandas as pd

def create_chromosome_annotations(chrm_id):
    annotation_file = "raw/gencode.v46.annotation.gtf"
    col_names = ['chr_name', 'source', 'type', 'start', 'end', '.', 'strand', ',', 'other']
    
    ann = pd.read_csv(annotation_file, sep='\t', comment='#', header=None)
    ann.rename(columns={i: col_name for i, col_name in enumerate(col_names)}, inplace=True)
    ann = ann[col_names]
    
    chrm = f'chr{chrm_id}'
    chr_no = ann[ann['chr_name'] == f'chr{chrm_id}']
    del chr_no['.']
    del chr_no[',']
    
    # Extract gene names
    chr_no['gene_id'] = chr_no['other'].apply(lambda x: x.split(';')[0].split(' ')[1])
    
    # Extract transcript ids
    def get_transcript_id(x):
        transcript_part = x.split(';')[1].split(' ')
        
        if transcript_part[1] == 'transcript_id':
            return transcript_part[2]
        return None
            
        
    chr_no['transcript_id'] = chr_no['other'].apply(lambda x: get_transcript_id(x))
    
    del chr_no['other']
    
    chr_no.reset_index(drop=True, inplace=True)
    chr_no.to_csv(f"processed/chr-fa-data/{chrm}_annotations.csv", index=False, sep='\t')
    

for i in range(1, 23):
    print("Creating annotations for chromosome", i)
    create_chromosome_annotations(i)

    

Creating annotations for chromosome 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['gene_id'] = chr_no['other'].apply(lambda x: x.split(';')[0].split(' ')[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['transcript_id'] = chr_no['other'].apply(lambda x: get_transcript_id(x))


Creating annotations for chromosome 2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['gene_id'] = chr_no['other'].apply(lambda x: x.split(';')[0].split(' ')[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['transcript_id'] = chr_no['other'].apply(lambda x: get_transcript_id(x))


Creating annotations for chromosome 3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['gene_id'] = chr_no['other'].apply(lambda x: x.split(';')[0].split(' ')[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['transcript_id'] = chr_no['other'].apply(lambda x: get_transcript_id(x))


Creating annotations for chromosome 4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['gene_id'] = chr_no['other'].apply(lambda x: x.split(';')[0].split(' ')[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['transcript_id'] = chr_no['other'].apply(lambda x: get_transcript_id(x))


Creating annotations for chromosome 5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['gene_id'] = chr_no['other'].apply(lambda x: x.split(';')[0].split(' ')[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['transcript_id'] = chr_no['other'].apply(lambda x: get_transcript_id(x))


Creating annotations for chromosome 6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['gene_id'] = chr_no['other'].apply(lambda x: x.split(';')[0].split(' ')[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['transcript_id'] = chr_no['other'].apply(lambda x: get_transcript_id(x))


Creating annotations for chromosome 7


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['gene_id'] = chr_no['other'].apply(lambda x: x.split(';')[0].split(' ')[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['transcript_id'] = chr_no['other'].apply(lambda x: get_transcript_id(x))


Creating annotations for chromosome 8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['gene_id'] = chr_no['other'].apply(lambda x: x.split(';')[0].split(' ')[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['transcript_id'] = chr_no['other'].apply(lambda x: get_transcript_id(x))


Creating annotations for chromosome 9


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['gene_id'] = chr_no['other'].apply(lambda x: x.split(';')[0].split(' ')[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['transcript_id'] = chr_no['other'].apply(lambda x: get_transcript_id(x))


Creating annotations for chromosome 10


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['gene_id'] = chr_no['other'].apply(lambda x: x.split(';')[0].split(' ')[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['transcript_id'] = chr_no['other'].apply(lambda x: get_transcript_id(x))


Creating annotations for chromosome 11


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['gene_id'] = chr_no['other'].apply(lambda x: x.split(';')[0].split(' ')[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['transcript_id'] = chr_no['other'].apply(lambda x: get_transcript_id(x))


Creating annotations for chromosome 12


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['gene_id'] = chr_no['other'].apply(lambda x: x.split(';')[0].split(' ')[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['transcript_id'] = chr_no['other'].apply(lambda x: get_transcript_id(x))


Creating annotations for chromosome 13


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['gene_id'] = chr_no['other'].apply(lambda x: x.split(';')[0].split(' ')[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['transcript_id'] = chr_no['other'].apply(lambda x: get_transcript_id(x))


Creating annotations for chromosome 14


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['gene_id'] = chr_no['other'].apply(lambda x: x.split(';')[0].split(' ')[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['transcript_id'] = chr_no['other'].apply(lambda x: get_transcript_id(x))


Creating annotations for chromosome 15


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['gene_id'] = chr_no['other'].apply(lambda x: x.split(';')[0].split(' ')[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['transcript_id'] = chr_no['other'].apply(lambda x: get_transcript_id(x))


Creating annotations for chromosome 16


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['gene_id'] = chr_no['other'].apply(lambda x: x.split(';')[0].split(' ')[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['transcript_id'] = chr_no['other'].apply(lambda x: get_transcript_id(x))


Creating annotations for chromosome 17


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['gene_id'] = chr_no['other'].apply(lambda x: x.split(';')[0].split(' ')[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['transcript_id'] = chr_no['other'].apply(lambda x: get_transcript_id(x))


Creating annotations for chromosome 18


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['gene_id'] = chr_no['other'].apply(lambda x: x.split(';')[0].split(' ')[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['transcript_id'] = chr_no['other'].apply(lambda x: get_transcript_id(x))


Creating annotations for chromosome 19


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['gene_id'] = chr_no['other'].apply(lambda x: x.split(';')[0].split(' ')[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['transcript_id'] = chr_no['other'].apply(lambda x: get_transcript_id(x))


Creating annotations for chromosome 20


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['gene_id'] = chr_no['other'].apply(lambda x: x.split(';')[0].split(' ')[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['transcript_id'] = chr_no['other'].apply(lambda x: get_transcript_id(x))


Creating annotations for chromosome 21


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['gene_id'] = chr_no['other'].apply(lambda x: x.split(';')[0].split(' ')[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['transcript_id'] = chr_no['other'].apply(lambda x: get_transcript_id(x))


Creating annotations for chromosome 22


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['gene_id'] = chr_no['other'].apply(lambda x: x.split(';')[0].split(' ')[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chr_no['transcript_id'] = chr_no['other'].apply(lambda x: get_transcript_id(x))


### Generating Dataset

For each exon in a transcript of a gene we generate a random fully exon subsequence, a fully intron subsequence and a subsequence that contains both an intron part and an exon part (splice site)

In [23]:
import pandas as pd
import random

SEQUENCE_LENGTH = 100
MIN_MIX = 20
MAX_MIX = 70

random.seed(42)

COMPLEMENTS = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'a': 't', 't': 'a', 'c': 'g', 'g': 'c'}

def get_nonoverlapping_exon_bounds(exon_ranges_in_transcripts):
    '''
    Function to get non-overlapping exon bounds
    :param exon_ranges_in_transcripts: list of lists (list of ranges)
    :return: list of lists (list of ranges)
    '''

    exon_ranges_in_transcripts.sort(key=lambda x: x[0])
    merged = []
    for interval in exon_ranges_in_transcripts:
        if len(merged) == 0 or merged[-1][1] < interval[0]:
            merged.append(interval)
        else:
            merged[-1][1] = max(merged[-1][1], interval[1])
    
    return merged
    

def deal_with_negative_strand(sequence):
    sequence = sequence[::-1] # Reverse
    sequence = ''.join([COMPLEMENTS[base] for base in sequence]) # Take complements
    return sequence

def generate_dataset(chrm_id):
    chrm_txt_file = f"processed/chr-fa-data/chr{chrm_id}.txt"
    chrm_ann_file = f"processed/chr-fa-data/chr{chrm_id}_annotations.csv"
    
    COLUMNS = ['gene_id','transcript_id', 'start', 'end', 'sequence', 'label']
    final_df = pd.DataFrame(columns=COLUMNS)
    
    with open(chrm_txt_file, "r") as file:
        sequence = file.read()
    ann_grouped = pd.read_csv(chrm_ann_file, sep='\t').groupby('transcript_id')
    
    for transcript_id, group in ann_grouped:
        gene_id = group['gene_id'].values[0]
        transcript_start = group[group['type'] == 'transcript']['start'].min()
        transcript_end = group[group['type'] == 'transcript']['end'].max()
        strand = group['strand'].values[0]
        
        exon_ranges = []
        for _, row in group[group['type'] == 'exon'].iterrows():
            exon_ranges.append((row['start'], row['end']))
    
        if len(exon_ranges) >= 0:
            nonoverlapping_exon_ranges_for_gene = get_nonoverlapping_exon_bounds(exon_ranges)
            
            for i in range(len(nonoverlapping_exon_ranges_for_gene)):
                exon_start = nonoverlapping_exon_ranges_for_gene[i][0]
                exon_end = nonoverlapping_exon_ranges_for_gene[i][1]
                
                if exon_end - exon_start  <= SEQUENCE_LENGTH or exon_start - transcript_start <= SEQUENCE_LENGTH or transcript_end - exon_end <= SEQUENCE_LENGTH:
                    continue
                
                if i > 0 and exon_start - nonoverlapping_exon_ranges_for_gene[i-1][1] <= SEQUENCE_LENGTH:
                    continue
                
                random_exon_start = random.randint(exon_start, exon_end-SEQUENCE_LENGTH)
                random_exon_end = random_exon_start + SEQUENCE_LENGTH
                
                random_fully_exon_part = sequence[random_exon_start:random_exon_end]
                
                # if first just get random intron from transcript start
                if i == 0:
                    intron_part_start = random.randint(transcript_start, exon_start - SEQUENCE_LENGTH)
                    intron_part_end = intron_part_start + SEQUENCE_LENGTH
                    random_fully_intron_part = sequence[intron_part_start:intron_part_end]
                else: # get intron from between exons
                    intron_part_start = random.randint(nonoverlapping_exon_ranges_for_gene[i-1][1], exon_start - SEQUENCE_LENGTH)
                    intron_part_end = intron_part_start + SEQUENCE_LENGTH
                    random_fully_intron_part = sequence[intron_part_start:intron_part_end]
                    
                
                if random.choice([True, False]): # sequence starts in exon
                    boundary_part_end = random.randint(exon_start + MIN_MIX, exon_start + MAX_MIX)
                    boundary_part_start = boundary_part_end - SEQUENCE_LENGTH
                    random_boundary_part = sequence[boundary_part_start:boundary_part_end]
                else: # sequence ends in exon
                    boundary_part_start = random.randint(exon_end - MAX_MIX, exon_end - MIN_MIX)
                    boundary_part_end = boundary_part_start + SEQUENCE_LENGTH
                    random_boundary_part = sequence[boundary_part_start:boundary_part_end]
                
                if strand == '-': # take complement for negative strand
                    random_fully_exon_part = deal_with_negative_strand(random_fully_exon_part)
                    random_fully_intron_part = deal_with_negative_strand(random_fully_intron_part)
                    random_boundary_part = deal_with_negative_strand(random_boundary_part)
                
                exon_row = pd.DataFrame([[gene_id, transcript_id, random_exon_start, random_exon_end, random_fully_exon_part, 'exon']], columns=COLUMNS)
                intron_row = pd.DataFrame([[gene_id, transcript_id, intron_part_start, intron_part_end, random_fully_intron_part, 'intron']], columns=COLUMNS)
                boundary_row = pd.DataFrame([[gene_id, transcript_id, boundary_part_start, boundary_part_end, random_boundary_part, 'boundary']], columns=COLUMNS)
                
                final_df = pd.concat([final_df, exon_row, intron_row, boundary_row])
    
    final_df.to_csv(f"processed/chr-fa-data/chr{chrm_id}_dataset.csv", index=False)

generate_dataset(21)    

In [27]:
def make_training_dataset(chrm_id):
    dataset_file = f"processed/chr-fa-data/chr{chrm_id}_dataset.csv"
    
    dataset = pd.read_csv(dataset_file)
    
    new_dataset = dataset[['sequence', 'label']]
        
    new_dataset['label'] = new_dataset['label'].apply(lambda x: 1 if x == 'boundary' else 0)
    new_dataset.to_csv(f"processed/chr-fa-data/chr{chrm_id}_training_dataset.csv", index=False)
    
make_training_dataset(21)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_dataset['label'] = new_dataset['label'].apply(lambda x: 1 if x == 'boundary' else 0)
