# Task 1. Preprocessing genomic data

In [2]:
import pandas as pd
import numpy as np

from utils import *

In [3]:
# Read the tab-separated sample metadata table and preview its first rows.
# `parameters` is not defined in this notebook; presumably it is provided by
# the `utils` star-import -- TODO confirm.
tsv = pd.read_csv(parameters['tsv_file'], delimiter='\t')
tsv.head()

Unnamed: 0,Sample name,Sex,Biosample ID,Population code,Population name,Superpopulation code,Superpopulation name,Population elastic ID,Data collections
0,NA19625,female,SAME123655,ASW,African Ancestry SW,AFR,African Ancestry,ASW,"1000 Genomes on GRCh38,1000 Genomes 30x on GRC..."
1,NA19835,female,SAME125029,ASW,African Ancestry SW,AFR,African Ancestry,ASW,"1000 Genomes on GRCh38,1000 Genomes 30x on GRC..."
2,NA19900,male,SAME125050,ASW,African Ancestry SW,AFR,African Ancestry,ASW,"1000 Genomes on GRCh38,1000 Genomes 30x on GRC..."
3,NA19917,female,SAME125272,ASW,African Ancestry SW,AFR,African Ancestry,ASW,"1000 Genomes on GRCh38,1000 Genomes 30x on GRC..."
4,NA19703,male,SAME124230,ASW,African Ancestry SW,AFR,African Ancestry,ASW,"1000 Genomes on GRCh38,1000 Genomes 30x on GRC..."


In [4]:
# Summarise the metadata frame: column names, non-null counts and dtypes
tsv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2504 entries, 0 to 2503
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Sample name            2504 non-null   object
 1   Sex                    2504 non-null   object
 2   Biosample ID           2504 non-null   object
 3   Population code        2504 non-null   object
 4   Population name        2504 non-null   object
 5   Superpopulation code   2504 non-null   object
 6   Superpopulation name   2504 non-null   object
 7   Population elastic ID  2504 non-null   object
 8   Data collections       2504 non-null   object
dtypes: object(9)
memory usage: 176.2+ KB


In [5]:
# Pick one sample name at random from the distinct names in the metadata table.
# The draw is unseeded, so a different sample may be chosen on every run.
sample_names = tsv['Sample name'].unique()
random_patient = np.random.choice(sample_names)
print(random_patient)

NA19670


In [6]:
# Read the CSV belonging to the randomly chosen sample and preview it.
# The file name is "<sample_data_path><sample name>.csv".
sample_csv_path = ''.join((parameters['sample_data_path'], random_patient, '.csv'))
df_sample = pd.read_csv(sample_csv_path, sep=',')
df_sample.head()

Unnamed: 0.1,Unnamed: 0,ALT,NA19670
0,5;195139,T,0
1,5;336952,C,1
2,5;389603,C,0
3,5;851582,A,1
4,5;1144802,C,1


In [7]:
# Summarise the single-sample frame: row count, column names and dtypes
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10028 entries, 0 to 10027
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  10028 non-null  object
 1   ALT         10028 non-null  object
 2   NA19670     10028 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 235.2+ KB


In [8]:
# Sanity check: every per-sample CSV is expected to have the same number of
# rows. Stop at the first file whose row count differs and show it.
EXPECTED_ROWS = 10028  # row count reported by df_sample.info() for the sample inspected above
for file in tsv['Sample name']:
    # NOTE: `df_sample` is intentionally rebound here; the later cell that
    # seeds `master_df` reads whatever frame this loop leaves behind.
    df_sample = pd.read_csv(parameters['sample_data_path'] + file + '.csv', sep=',')
    if len(df_sample) != EXPECTED_ROWS:
        # A bare `df_sample.head()` inside a loop is evaluated and discarded
        # (only a cell's last top-level expression is auto-displayed), so the
        # preview has to be emitted explicitly.
        print(df_sample.head())
        df_sample.info()
        break

In [9]:
# Start from an empty frame; the key columns and the per-sample columns are
# attached to it by the following cells
master_df = pd.DataFrame()

In [10]:
# Seed the master frame with the two key columns of the sample frame,
# renaming them on the way: 'Unnamed: 0' -> 'chr:location', 'ALT' -> 'alternative'
column_map = {'Unnamed: 0': 'chr:location', 'ALT': 'alternative'}
for source_col, target_col in column_map.items():
    master_df[target_col] = df_sample[source_col]
master_df.head()

Unnamed: 0,chr:location,alternative
0,5;195139,T
1,5;336952,C
2,5;389603,C
3,5;851582,A
4,5;1144802,C


In [11]:
# Build the master table: one column per sample, aligned on the
# (chr:location, alternative) key pair.
key_cols = ['chr:location', 'alternative']

# Always restart from the key columns only. Merging onto a `master_df` that
# already carries sample columns (i.e. re-running this cell) would otherwise
# produce duplicated, `_x`/`_y`-suffixed sample columns.
# NOTE: this cell must run before ';' is replaced with ':' in 'chr:location';
# after that rewrite the keys no longer match the raw files and the inner
# merge would return no rows.
merged = master_df[key_cols]

for patient in tsv['Sample name']:
    # Each file is "<sample_data_path><sample name>.csv"
    df_patient = pd.read_csv(parameters['sample_data_path'] + patient + '.csv', sep=',')
    # Rename the three columns: the two keys plus the sample's own value column
    df_patient.columns = key_cols + [patient]
    # Inner merge (the default): only rows whose key pair is present on both
    # sides are kept
    merged = merged.merge(df_patient, on=key_cols)

master_df = merged

In [12]:
# Replace the ';' separator with ':' in the chr:location column.
# `regex=False` makes this an explicit literal replacement: the default value
# of `regex` in `Series.str.replace` differs between pandas versions.
master_df['chr:location'] = master_df['chr:location'].str.replace(';', ':', regex=False)

In [13]:
# Preview the merged table: the two key columns followed by one column per sample
master_df.head()

Unnamed: 0,chr:location,alternative,NA19625,NA19835,NA19900,NA19917,NA19703,NA20274,NA20351,NA20356,...,NA19117,NA19129,NA19131,NA19256,NA19198,NA19201,NA19206,NA19213,NA19225,NA19143
0,5:195139,T,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5:336952,C,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,5:389603,C,1,1,1,1,1,0,1,1,...,1,1,1,1,1,1,1,0,1,1
3,5:851582,A,1,1,0,0,0,1,0,1,...,1,0,1,1,1,1,1,0,0,1
4,5:1144802,C,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [14]:
# Write the merged table to "<output_data_path>master.csv" as a
# comma-separated file, leaving out the row index
master_csv_path = parameters['output_data_path'] + 'master.csv'
master_df.to_csv(master_csv_path, sep=',', index=False)