# SCRIPT TO GENERATE THE DRAGEN pVCF BLOCK COORDINATES TABLE

## This script should only be run once
## This script can be run from a terminal, and its output can then be submitted to the UK Biobank project

#### Initialization 
##### Load packages

In [1]:
import pandas as pd

##### Load file

In [2]:
# Location of the input CSV (relative to the working directory)
file_path = 'dragen_pvcf_coordinates.csv'

# Load the table; header=0 consumes the file's own header row, which is then
# replaced by the explicit column names given here.
data = pd.read_csv(
    file_path,
    names=["filename", "chromosome", "starting_position"],
    header=0,
)

##### Formatting

In [3]:
# Build one output row per input file: 1-based row number, chromosome label,
# block number, and the position read from the input.
# The chromosome label is the text between '_c' and the next '_' in the file
# name; the block number is the integer between '_b' and the next '_'.
# Only the "filename" and "starting_position" columns are needed, so the
# columns are iterated directly instead of materializing every row with
# iterrows().
output = [
    [
        row_number,
        filename.split('_c')[1].split('_')[0],
        int(filename.split('_b')[1].split('_')[0]),
        starting_position,
    ]
    for row_number, (filename, starting_position) in enumerate(
        zip(data["filename"], data["starting_position"]), start=1
    )
]

# Convert the output list to a DataFrame for easy viewing
# NOTE(review): the input column is named "starting_position" but is stored
# here as "Ending_Position" -- confirm this matches the coordinate file's
# semantics.
output_df = pd.DataFrame(output, columns=["Row_Number", "Chromosome", "Block", "Ending_Position"])

# Replace 'X' with 23 and 'Y' with 24 in the 'Chromosome' column
output_df["Chromosome"] = output_df["Chromosome"].replace({"X": 23, "Y": 24})

# Convert 'Chromosome' to numeric for correct sorting; any label that is not
# numeric after the X/Y mapping becomes NaN (errors='coerce')
output_df["Chromosome"] = pd.to_numeric(output_df["Chromosome"], errors='coerce')

# Sort the DataFrame by 'Chromosome' and 'Block'
sorted_df = output_df.sort_values(by=["Chromosome", "Block"])

# Initialize the Starting_Position column; the real values are filled in by
# a later step
sorted_df["Starting_Position"] = 0


In [11]:

# Calculate 'Starting_Position' from the previous block's 'Ending_Position'.
#
# Map (chromosome, block) -> Ending_Position once, instead of re-scanning the
# whole frame with a boolean mask for every row. setdefault keeps the first
# occurrence, so duplicated (chromosome, block) keys resolve to the first
# matching row.
ending_by_block = {}
for chrom, block, ending_pos in zip(
    sorted_df["Chromosome"], sorted_df["Block"], sorted_df["Ending_Position"]
):
    ending_by_block.setdefault((chrom, block), ending_pos)

starting_positions = []
for chrom, block in zip(sorted_df["Chromosome"], sorted_df["Block"]):
    prev_ending_pos = ending_by_block.get((chrom, block - 1))
    if block == 0 or prev_ending_pos is None:
        # Block 0 starts at 1. A block whose predecessor is missing also falls
        # back to 1.
        # NOTE(review): the fallback silently hides gaps in block numbering --
        # confirm that is acceptable.
        starting_positions.append(1)
    else:
        starting_positions.append(prev_ending_pos + 1)

# Positional assignment: the list is in the same order as sorted_df's rows
sorted_df["Starting_Position"] = starting_positions

# Remove NaN values BEFORE the integer casts: casting a column that still
# holds NaN raises, and a 'Chromosome' column left as float would be written
# to the TSV as 1.0, 2.0, ...
sorted_df = sorted_df.dropna()

# Add a new 'Row_Number' column, numbering only the rows that are kept
sorted_df = sorted_df.assign(Row_Number=range(1, len(sorted_df) + 1))

# Convert the chromosome and position columns to integers
sorted_df = sorted_df.astype(
    {"Chromosome": int, "Starting_Position": int, "Ending_Position": int}
)

# Reorder the columns
sorted_df = sorted_df[["Row_Number", "Chromosome", "Block", "Starting_Position", "Ending_Position"]]

# Save the output to a new TSV file
sorted_df.to_csv('dragen_pvcf_blocks.tsv', sep='\t', index=False)
