# SCRIPT TO OBTAIN START AND END OF DRAGEN PVCF BLOCKS

## This script should only be run once
## It does not need to be run on the UK Biobank RAP, but its output has to be uploaded there
## The output is also available in the data/misc folder

#### Initialization 
##### Load packages

In [1]:
import pandas as pd

##### Load file
##### This file can be obtained from: https://biobank.ctsu.ox.ac.uk/showcase/refer.cgi?id=2009

In [2]:
# Path to the pVCF block-coordinates CSV
file_path = 'dragen_pvcf_coordinates.csv'

# Load the table; the file's own header row (row 0) is replaced by these column names
column_names = ["filename", "chromosome", "starting_position"]
data = pd.read_csv(file_path, names=column_names, header=0)

##### Formatting

In [13]:
# Keep only complete rows: any row with a missing value in any column is discarded
data = data.dropna(axis=0, how="any")

In [45]:
# Helper: pull one tagged field out of a pVCF file name
def _tag_value(name, tag):
    """Return the text between the first occurrence of *tag* and the next underscore."""
    return name.split(tag)[1].split('_')[0]


# One record per input row: row label + 1, chromosome label (after '_c'),
# block number (after '_b', as an integer) and the block's starting position
output = [
    [label + 1, _tag_value(name, '_c'), int(_tag_value(name, '_b')), start]
    for label, name, start in zip(data.index, data["filename"], data["starting_position"])
]

# Collect the records in a DataFrame
output_df = pd.DataFrame(
    output,
    columns=["Row_Number", "Chromosome", "Block", "Starting_Position"],
)

# Encode the sex chromosomes numerically ('X' -> 23, 'Y' -> 24), then make the whole
# column numeric so it sorts by value; labels that cannot be parsed become NaN
chromosome_codes = output_df["Chromosome"].replace({"X": 23, "Y": 24})
output_df["Chromosome"] = pd.to_numeric(chromosome_codes, errors='coerce')

# Order the blocks by chromosome, then by block number
sorted_df = output_df.sort_values(by=["Chromosome", "Block"])


In [52]:
# Calculate 'Ending_Position' for every block and export the final table.
#
# The rows are expected to be ordered by chromosome and block already, so a block
# ends one position before the next block of the same chromosome starts.

# Placeholder end for the last block of each chromosome (no next block to look at)
LAST_BLOCK_END = 999999999

# Discard rows whose chromosome could not be parsed (NaN) before grouping, so that
# every remaining row belongs to exactly one chromosome group.
# .copy() gives an independent frame, so the column assignments below are safe.
blocks = sorted_df.dropna(subset=["Chromosome", "Block", "Starting_Position"]).copy()

# Starting position of the following block within the same chromosome
# (NaN for the last block of each chromosome)
next_start = blocks.groupby("Chromosome")["Starting_Position"].shift(-1)
blocks["Ending_Position"] = (next_start - 1).fillna(LAST_BLOCK_END)

# Number the rows consecutively in their sorted order
blocks["Row_Number"] = range(1, len(blocks) + 1)

# Store chromosome and positions as integers; 'Chromosome' is float whenever an
# unparseable label was coerced to NaN earlier, which would be written as e.g. '1.0'
blocks = blocks.astype({"Chromosome": int, "Starting_Position": int, "Ending_Position": int})

# Reorder the columns
sorted_df = blocks[["Row_Number", "Chromosome", "Block", "Starting_Position", "Ending_Position"]]

# Save the output to a new TSV file
sorted_df.to_csv('dragen_pvcf_blocks.tsv', sep='\t', index=False)