# JVJ65 Merge all needed samples data in one CSV file

In [37]:
import pandas as pd
import os
import seaborn as sns, numpy as np

In [21]:
def load_sample (Name_file):
    '''
    Read one VCF file into a pandas data frame.

    The column names come from the VCF header line, i.e. the first line that
    starts with a single '#' (lines starting with '##' are meta-information
    and are skipped). The last header column is taken as the sample name.

    Input: path to a VCF file
    Output1: VCF data (pandas data frame)
    Output2: Sample name (name of the last column, e.g. W123456789)
    Raises: ValueError if the file contains no single-'#' header line
    '''
    columns = None
    with open(Name_file, 'r') as f:
        for line in f:
            if line.startswith('#') and len(line)>2 and line[1] != '#':
                # Strip only the line terminator: slicing with [1:-1] would eat a
                # real character when the header is the last line without a newline.
                columns = line[1:].rstrip('\r\n').split('\t')
                break

    if columns is None:
        # Without this guard the function failed with an UnboundLocalError that
        # did not tell the user which file was at fault.
        raise ValueError("No VCF header line (starting with a single '#') found in " + str(Name_file))

    # NOTE: comment='#' makes pandas drop everything after any '#' on a line, so a
    # '#' inside a data field (e.g. INFO) would truncate that row.
    file_data = pd.read_csv(Name_file, comment='#', delimiter='\t', names=columns)
    name_sample = file_data.columns[-1]
    return file_data, name_sample

In [22]:
def data_manipulation(file_data,name_sample):
    '''
    Select the per-variant values needed for the comparison.

    Input: VCF data (pandas data frame with CHROM, POS, REF, ALT, QUAL, FILTER,
           FORMAT and one column named `name_sample` holding the genotype fields)
           and the sample name.
    Output: A new data frame with the columns
            Sample_ID, QUAL, FILTER, GT, AD, SB (same index as the input).
            The input data frame is left untouched.
    Raises: KeyError if the FORMAT column of a non-empty file does not define
            GT, AD and SB.
    '''
    wanted_fields = ["GT", "AD", "SB"]

    # 1) + 2) Build the variant ID (name_sample+CHROM+":"+POS+REF+">"+ALT).
    # It is built as a standalone Series so the caller's data frame is not modified.
    # CHROM is cast to str because chromosomes named "1", "2", ... are parsed as integers.
    sample_id = (
        name_sample
        + file_data["CHROM"].astype(str)
        + ":"
        + file_data["POS"].astype(str)
        + file_data["REF"]
        + ">"
        + file_data["ALT"]
    ).rename('Sample_ID')

    # 3) Split the FORMAT keys and the sample values into one column per key.
    ### VERY CAREFUL AT THIS POINT. DIFFERENT VCF FILES CAN HAVE DIFFERENT FORMAT STRUCTURE IN THIS COLUMN ###
    # Pairing keys and values row by row copes with FORMAT strings that differ between rows.
    records = [
        dict(zip(format_keys.split(":"), sample_values.split(":")))
        for format_keys, sample_values in zip(file_data["FORMAT"], file_data[name_sample])
    ]
    format_fields = pd.DataFrame(records, index=file_data.index)

    missing = [field for field in wanted_fields if field not in format_fields.columns]
    if missing and len(file_data) > 0:
        raise KeyError(
            "FORMAT fields " + str(missing) + " not found for sample " + str(name_sample)
        )
    # For a file without variants there is nothing to split; reindex still
    # provides the (empty) GT/AD/SB columns so the output schema is constant.
    format_fields = format_fields.reindex(columns=wanted_fields)

    # 4) Return the columns we need
    file_data_selec = pd.concat(
        [sample_id, file_data[["QUAL", "FILTER"]], format_fields], axis=1
    )
    return file_data_selec

In [35]:
def itinerante_by_files(input_dir="./Data_input/", output_csv='./Data_output/JVJ65_Win10_Stitch_NoReal.csv'):
    '''
    Iterate over the samples in the input folder and merge them into one CSV file.

    Every regular, non-hidden file in `input_dir` is loaded with load_sample,
    reduced with data_manipulation, and the results are concatenated and
    written to `output_csv`.

    Input: input_dir  - folder holding the VCF files (default ./Data_input/)
           output_csv - path of the CSV file to write. Set it explicitly to make
                        sure the desired file is created.
    Output: whatever DataFrame.to_csv returns (None when a path is given)
    '''
    colNames = ['Sample_ID',"QUAL","FILTER","GT","AD","SB"]

    # Sorted so the row order of the output is reproducible (os.listdir order is
    # arbitrary); hidden entries (.DS_Store, .ipynb_checkpoints) and
    # sub-directories are skipped because they are not VCF files.
    PATH_FileNames = [
        os.path.join(input_dir, file_name)
        for file_name in sorted(os.listdir(input_dir))
        if not file_name.startswith('.')
    ]
    PATH_FileNames = [path for path in PATH_FileNames if os.path.isfile(path)]

    # Collect the per-sample frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole table on every iteration.
    per_sample = []
    for Name_file in PATH_FileNames:
        print(Name_file)
        file_data,name_sample = load_sample(Name_file)
        per_sample.append(data_manipulation(file_data,name_sample))

    if per_sample:
        masterVCF = pd.concat(per_sample, ignore_index=True)
    else:
        # No input files: still write a CSV with the expected header.
        masterVCF = pd.DataFrame(columns = colNames)
    print("Total number of variants: ",len(masterVCF))

    return masterVCF.to_csv(output_csv)


In [36]:
# Merge every sample found in the input folder and write the combined CSV.
# Each processed file path is printed, followed by the total number of variants.
itinerante_by_files()

./Data_input/W2114579_S4.vcf
./Data_input/W2114447_S13.vcf
./Data_input/W2114249_S3.vcf
./Data_input/W2113856_S16.vcf
./Data_input/W2112623_S7.vcf
./Data_input/W2114554_S14.vcf
./Data_input/W2114485_S9.vcf
./Data_input/W2114559_S2.vcf
./Data_input/W2114486_S10.vcf
./Data_input/W2114480_S1.vcf
./Data_input/W2112426_S8.vcf
./Data_input/W2114529_S15.vcf
./Data_input/W2114578_S5.vcf
./Data_input/W2114280_S12.vcf
./Data_input/W2114514_S11.vcf
./Data_input/W2114560_S6.vcf
Total number of variants:  13655


This should have generated a CSV file in the output folder.
Check that the generated file is the intended one: all subsequent analysis is based on it!

For JVJ65, check against SampleSheetUsed that the Win10 runs are the ones you expect.

In [13]:
# Load a single sample on its own to inspect the parsed VCF columns.
# load_sample returns (data frame, sample name); [0] keeps only the data frame.
test = load_sample ("./Data_input/W2112623_S7.vcf")[0]
test.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,W2112623_S7.bam
0,chr1,36931729,.,A,G,6,q20,DP=737;phyloP=-0.648;CSQT=1|MRPS15|NM_031280.3...,GT:GQ:AD:DP:VF:NL:SB:NC,"0/1:6:729,8:737:0.0109:21:-11.5169:0.0041"
1,chr1,36931843,.,A,G,6,q20,DP=729;phyloP=-0.02;CSQT=1|MRPS15|NM_031280.3|...,GT:GQ:AD:DP:VF:NL:SB:NC,"0/1:6:721,8:729:0.0110:21:-8.2498:0.0149"
2,chr1,36933096,rs3918000,T,C,100,PASS,DP=543;GMAF=C|0.1344;AA=C;AF1000G=0.134385;phy...,GT:GQ:AD:DP:VF:NL:SB:NC,"0/1:100:266,276:543:0.5083:22:-100.0000:0.0216"
3,chr1,36933120,.,TG,T,27,PASS,DP=553;GMAF=A|0.0007987;CSQT=1|MRPS15|NM_03128...,GT:GQ:AD:DP:VF:NL:SB:NC,"0/1:27:545,8:553:0.0145:24:-15.7127:0.0000"
4,chr1,36933288,rs754325073,C,T,20,PASS,DP=324;phyloP=0.234;CSQT=1|MRPS15|NM_031280.3|...,GT:GQ:AD:DP:VF:NL:SB:NC,"0/1:20:319,5:324:0.0154:24:-8.7917:0.0122"


In [20]:
# Look up one position in the loaded sample to check its raw VCF row.
display(test[test["POS"] == 15808795])

# Sample_ID that data_manipulation builds for this row: W2112623_S7.bamchrX:15808795T>.

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,W2112623_S7.bam
572,chrX,15808795,.,T,.,100,PASS,DP=790;RefMinor;GMAF=T|0.002649;phyloP=-2.424;...,GT:GQ:AD:DP:VF:NL:SB:NC,0/0:18:788:790:0.0025:24:-100.0000:0.1023
