# Create the 'padding'

We extend each 2003-nt sequence in `seqs_df` by three nucleotides before and three after it, which gives sequences of length 2009.

In [1]:
import pandas as pd
from gnomad_db.database import gnomAD_DB
from pathlib import Path
from Bio import SeqIO
import numpy as np
import pyranges as pr
import math 

  import pkg_resources


In [9]:
# Load the table of 2003-nt windows (coordinates plus the extracted sequence).
seqs_df = pd.read_csv(
    filepath_or_buffer="/s/project/ml4rg_students/2025/project07/data/gtf_start_extended_ints_df_2003_seq.csv"
)
seqs_df.head(n=5)

Unnamed: 0,Chromosome,Start,End,Strand,seq,seq_len
0,chr1,63564,65567,+,TATCGATGGGCACCTTCTTTTTCTTAATTGTATCATACATTTTTAT...,2003
1,chr1,922431,924434,+,AGAAGACACAGACTTCAGGAGAGGAAGGCACAGGAACTCACTGGCA...,2003
2,chr1,923941,925944,+,TCCCCGCCGGGCGGGCGCGCGCCAGTGGACGCGGGTGCACGACTGA...,2003
3,chr1,958693,960696,+,TCGGGAAGAGATTTTTGCACAACTCACCAACATACGCTCCCTGCCT...,2003
4,chr1,964531,966534,+,TCCGCAGTGGGGCTGCGGGGAGGGGGGCGCGGGTCCGCAGTGGGGC...,2003


In [29]:
# Single-row experiment: take the first region of seqs_df and widen it by one
# base on each side. `row` is built from seqs_df explicitly so the cell also
# works on a fresh kernel (it previously relied on a `row` left over from a
# cell that no longer exists).
row = seqs_df.loc[[0], ['Chromosome', 'Start', 'End']].copy()
row.at[0, 'Start'] = 63563
row.at[0, 'End'] = 65568
row


Unnamed: 0,Chromosome,Start,End
0,chr1,63563,65568


In [34]:
# Build the padded intervals: keep only the coordinates and widen every
# region by three bases on each side (2003 -> 2009 positions).
# NOTE(review): 'Strand' is not carried over into input_df — confirm that the
# downstream sequence retrieval does not need it.
input_df = seqs_df.loc[:, ['Chromosome', 'Start', 'End']].assign(
    Start=lambda regions: regions['Start'] - 3,
    End=lambda regions: regions['End'] + 3,
)
input_df

Unnamed: 0,Chromosome,Start,End
0,chr1,63561,65570
1,chr1,922428,924437
2,chr1,923938,925947
3,chr1,958690,960699
4,chr1,964528,966537
...,...,...,...
30055,chr22,50577909,50579918
30056,chr22,50582775,50584784
30057,chr22,50627773,50629782
30058,chr22,50627366,50629375


In [14]:
fasta_path = Path("/s/project/ml4rg_students/2025/project07/data/GRCh38.primary_assembly.genome.fa")

def fetch_sequence_from_region_df(regions_df, model_window_size, fasta_path=fasta_path):
    '''
    Fetches DNA sequences from genomic regions using a reference FASTA file.
    Input:
        - regions_df (pd.DataFrame): A DataFrame containing genomic regions with at least
                                     'Chromosome', 'Start', and 'End' columns. It is not modified.
        - model_window_size (int): The expected length of each DNA sequence to extract.
        - fasta_path (str or pathlib.Path): Path to the reference genome FASTA file from which
                                     sequences are extracted. Defaults to the notebook-level
                                     `fasta_path`.

    Output:
        - seqs_df (pd.DataFrame): A new DataFrame (index reset to 0..n-1, rows in the same order
                                  as regions_df) including the original region information
                                  plus the following additional columns:
                                  * 'seq': the uppercase DNA sequence
                                  * 'seq_len': the length of the sequence (equals model_window_size)

    Raises:
        - AssertionError: if any fetched sequence does not have length model_window_size.
    '''

    # Work on a copy so the caller's DataFrame is never modified.
    seqs_df = regions_df.copy()

    # Nothing to fetch: return the (empty) regions with empty result columns
    # instead of sending an empty frame through PyRanges.
    if len(seqs_df) == 0:
        return seqs_df.assign(
            seq=pd.Series(dtype="object", index=seqs_df.index),
            seq_len=pd.Series(dtype="int64", index=seqs_df.index),
        )

    # Pyranges changes the ordering of the rows when converting to PyRanges, so
    # remember the original order in a helper column. The helper name is chosen
    # so that it can never overwrite a column the caller passed in.
    order_col = "row_order_tmp"
    while order_col in seqs_df.columns:
        order_col += "_"
    seqs_df[order_col] = np.arange(len(seqs_df))

    # Convert to a PyRanges object and retrieve the sequence for each region
    # based on its start and end columns.
    seqs_pr = pr.PyRanges(seqs_df)
    seqs_pr.seq = pr.get_sequence(seqs_pr, path=fasta_path)

    # Back to pandas, restore the original row order and drop the helper column.
    seqs_df = (
        seqs_pr.df
        .sort_values(order_col)
        .reset_index(drop=True)
        .drop(columns=[order_col])
    )

    # Normalise to uppercase and record the length of every sequence.
    seqs_df["seq"] = seqs_df["seq"].str.upper()
    seqs_df["seq_len"] = seqs_df["seq"].str.len()

    # Every sequence must have exactly the model window size. Raised explicitly
    # (rather than via `assert`) so the check also runs under `python -O`, and
    # the message lists only the offending lengths instead of the whole column.
    wrong_len = seqs_df.loc[seqs_df["seq_len"] != model_window_size, "seq_len"]
    if len(wrong_len) > 0:
        raise AssertionError(
            f"{len(wrong_len)} of {len(seqs_df)} sequences do not have length "
            f"{model_window_size}; observed lengths: {sorted(wrong_len.unique().tolist())}"
        )

    return seqs_df

In [35]:
# Fetch the padded sequences; each one must be 2009 bases long (2003 + 3 + 3).
new_seqs_df = fetch_sequence_from_region_df(
    input_df,
    model_window_size=2009,
)
new_seqs_df

Unnamed: 0,Chromosome,Start,End,seq,seq_len
0,chr1,63561,65570,TCATATCGATGGGCACCTTCTTTTTCTTAATTGTATCATACATTTT...,2009
1,chr1,922428,924437,GGGAGAAGACACAGACTTCAGGAGAGGAAGGCACAGGAACTCACTG...,2009
2,chr1,923938,925947,AAGTCCCCGCCGGGCGGGCGCGCGCCAGTGGACGCGGGTGCACGAC...,2009
3,chr1,958690,960699,AGATCGGGAAGAGATTTTTGCACAACTCACCAACATACGCTCCCTG...,2009
4,chr1,964528,966537,AGGTCCGCAGTGGGGCTGCGGGGAGGGGGGCGCGGGTCCGCAGTGG...,2009
...,...,...,...,...,...
30055,chr22,50577909,50579918,CGCCATCCTGGGGGTTGGTCGGCACCTAGGACGGGGGCAGATGGGT...,2009
30056,chr22,50582775,50584784,CGCCATGGCGCGGGCTCGACCGGGCCCCAGGCCAGGCTGCGCTCCG...,2009
30057,chr22,50627773,50629782,GGACATGGGACCGAGGGGTCTGTCCCAAGAGAGGGAGGGCTACTTG...,2009
30058,chr22,50627366,50629375,GCCCATCCGAACCGGGAGCCGGCCGGTCAGGAGGGCGGCCCTGCGG...,2009


In [36]:
# Sanity check on the data itself (the previous version compared a string
# literal with itself, which is always True): stripping the three padding
# bases from both ends of every new sequence must give back the original one.
(new_seqs_df["seq"].str[3:-3] == seqs_df["seq"]).all()

True

In [37]:
# Persist the padded sequences (without the DataFrame index) for later notebooks.
new_seqs_df.to_csv(
    path_or_buf="/s/project/ml4rg_students/2025/project07/group_2/data/seqs_2009.csv",
    index=False,
)


# Trying out gnomAD queries for the neutral model notebook

In [7]:
# Inputs for the gnomAD experiments below.
# Padded 2009-nt sequences written earlier in this notebook.
seqs = pd.read_csv(
    filepath_or_buffer="/s/project/ml4rg_students/2025/project07/group_2/data/seqs_2009.csv"
)
# Reference genome FASTA.
fasta = Path("/s/project/ml4rg_students/2025/project07/data/GRCh38.primary_assembly.genome.fa")
# Directory passed to gnomAD_DB in the next cell.
data_location = '/s/project/benchmark-lm/ssd-cache'

In [8]:
# Create the gnomAD_DB handle for the data under `data_location`, requesting
# gnomAD version "v4". `db` is used as a notebook-level global by `prep` below.
db = gnomAD_DB(data_location, gnomad_version="v4")

In [28]:
def prep(chrom, start, end, database=None):
    '''
    Queries gnomAD for an interval and keeps only PASS single-nucleotide variants.
    Input:
        - chrom: chromosome identifier, passed through to `get_info_for_interval`.
        - start (int): interval start, passed as `interval_start`.
        - end (int): interval end, passed as `interval_end`.
        - database: object providing `get_info_for_interval`; defaults to the
                    notebook-level `db` handle when None.

    Output:
        - pd.DataFrame: all queried columns for variants whose 'filter' is 'PASS' and whose
                        'ref' and 'alt' are both one character long, with a fresh 0..n-1 index
                        and the helper columns 'len_ref', 'len_alt' and 'is_snv'.
    '''
    if database is None:
        database = db  # fall back to the global gnomAD handle

    variants = database.get_info_for_interval(
        chrom=chrom, interval_start=start, interval_end=end, query="*"
    )

    # Copy after filtering so the new columns are added to an independent frame
    # and not to a slice of the query result (avoids SettingWithCopyWarning).
    variants = variants[variants['filter'] == 'PASS'].copy()

    variants['len_ref'] = variants['ref'].str.len()
    variants['len_alt'] = variants['alt'].str.len()
    # A SNV replaces exactly one reference base by exactly one alternative base.
    variants['is_snv'] = (variants['len_ref'] == 1) & (variants['len_alt'] == 1)

    return variants[variants['is_snv']].reset_index(drop=True)

In [29]:
# Example query: PASS SNVs on chromosome 1 between positions 10000 and 20000.
region = prep(chrom=1, start=10000, end=20000)
region

Unnamed: 0,chrom,pos,ref,alt,filter,AC,AN,AF,MQ,QD,...,AN_grpmax,AF_grpmax,AF_eas,AF_nfe,AF_fin,AF_afr,AF_asj,len_ref,len_alt,is_snv
0,1,10111,C,A,PASS,1.0,44330.0,0.000023,35.0416,1.38889,...,,,0.000000,0.000000,0.000487,0.000000,0.000000,1,1,True
1,1,10131,C,A,PASS,1.0,105956.0,0.000009,37.0788,2.20376,...,46046.0,0.000022,0.000000,0.000022,0.000000,0.000000,0.000000,1,1,True
2,1,10139,A,T,PASS,1.0,58784.0,0.000017,36.9422,4.42722,...,14494.0,0.000069,0.000000,0.000000,0.000000,0.000069,0.000000,1,1,True
3,1,10140,A,C,PASS,2.0,72760.0,0.000027,36.4297,11.88040,...,35206.0,0.000057,0.000000,0.000057,0.000000,0.000000,0.000000,1,1,True
4,1,10141,C,G,PASS,1.0,50842.0,0.000020,36.3017,4.64246,...,26636.0,0.000038,0.000000,0.000038,0.000000,0.000000,0.000000,1,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
661,1,19591,G,A,PASS,9.0,70134.0,0.000128,23.8685,4.30368,...,3876.0,0.001548,0.001548,0.000000,0.000000,0.000116,0.000000,1,1,True
662,1,19600,A,T,PASS,1.0,66084.0,0.000015,23.7667,4.18158,...,24606.0,0.000041,0.000000,0.000000,0.000000,0.000041,0.000000,1,1,True
663,1,19664,G,T,PASS,1.0,67516.0,0.000015,24.6777,7.29310,...,,,0.000000,0.000000,0.000000,0.000000,0.000000,1,1,True
664,1,19911,G,A,PASS,10.0,50892.0,0.000196,25.626,5.01126,...,3586.0,0.001115,0.000000,0.000052,0.000000,0.000196,0.001289,1,1,True
