In [1]:
## Open the input file
import pandas
from datetime import date
from collections import Counter
from pyliftover import LiftOver

# Date stamp embedded in the output file name.
today = date.today()

# Loads LiftOver's hg18 -> hg19 conversion tables.
lo = LiftOver('hg18', 'hg19')

# Input workbook and dated, tab-separated output file.
input_file = 'Breakpoints_cancer_all.xlsx'
output_file = 'dataset_Mix_{}.txt'.format(today)

# First line of the output file: records which input produced which output.
comment_line = (
    '#This file was generated using parse_Mix.ipynb'
    ' using {} as input and {} as output.\n'
).format(input_file, output_file)

# Second line of the output file: '##' followed by the tab-separated column
# names, in the same order as the fields written for every record.
header = '##' + '\t'.join([
    'chr1', 's1', 'e1', 'o1',
    'chr2', 's2', 'e2', 'o2',
    'source', 'sample_name', 'sv_type', 'cancer_type',
]) + '\n'

# study:cancer_type table, declared as (cancer_type, [studies]) groups and then
# flattened into the study -> cancer_type lookup used by the parser.
# Labels are written verbatim to the output file, so their spelling is kept
# exactly as is (including 'pedriatic').
_studies_by_cancer_type = [
    ('prostate cancer',           ['Baca_Cell_2013',
                                   'Berger_Nature_2011',
                                   'Weischenfeldt_CancerCell_2013',
                                   'Zhen_GenesDev_2013']),
    ('meningioma',                ['Brastianos_NatGenet_2013']),
    ('colorectal cancer',         ['Bass_NatGenet_2011']),
    ('multiple',                  ['Malhotra_GenomeRes_2013',
                                   'Drier_GenomeRes_2013']),
    ('breast/ovary',              ['Campbell_NatGenet_2008',
                                   'McBride_Jpathology_2012']),
    ('pancreatic cancer',         ['Campbell_Nature_2010']),
    ('breast/gastric',            ['Hillmer_GenomeRes_2011']),
    ('breast cancer',             ['Jiao_BMCGenomics_2013',
                                   'Stephens_Nature_2009']),
    ('uterine leiomyomas',        ['Mehine_NEJM_2013']),
    ('large B-cell lymphoma',     ['Morin_Blood_2013']),
    ('chromothripsis',            ['Nazaryan_EJHG_2013']),
    ('pedriatic medulloblastoma', ['Rausch_Cell_2012']),
    ('neuroblastoma',             ['Sausen_NatGenet_2013']),
    ('ovarian cancer',            ['Yang_Cell_2013']),
]

cancer_type_dict = {study: cancer_type
                    for cancer_type, studies in _studies_by_cancer_type
                    for study in studies}

# Parse the Excel workbook into a DataFrame (default sheet, default options).
data = pandas.read_excel(input_file)

In [2]:
# Preview the first five rows of the parsed sheet.
data.head(n=5)

Unnamed: 0,Study,Sample_name,Chr1,start1,stop1,orientation1,Chr2,start2,stop2,orientation2,TH/HT,type,assembly,platform,Unnamed: 14
0,Baca_Cell_2013,P01-28,1,2380382,2380382,-,19,49682633,49682633,-,,inter_chr,hg19,Illumina,
1,Baca_Cell_2013,P01-28,1,2392425,2392425,+,19,47899862,47899862,+,,inter_chr,hg19,Illumina,
2,Baca_Cell_2013,P01-28,1,6205156,6205156,-,17,15257808,15257808,-,,inter_chr,hg19,Illumina,
3,Baca_Cell_2013,P01-28,1,8621065,8621065,+,17,7853514,7853514,-,,inter_chr,hg19,Illumina,
4,Baca_Cell_2013,P01-28,1,9659853,9659853,-,17,6917291,6917291,-,,inter_chr,hg19,Illumina,


In [3]:
# Preview the last five rows of the parsed sheet.
data.tail(n=5)

Unnamed: 0,Study,Sample_name,Chr1,start1,stop1,orientation1,Chr2,start2,stop2,orientation2,TH/HT,type,assembly,platform,Unnamed: 14
69973,Zhen_GenesDev_2013,TCGA-41-5651-01A-01D-1696-08,12,57844268,57844268,,12,58188705,58188705,,,?,hg19,?,
69974,Zhen_GenesDev_2013,TCGA-41-5651-01A-01D-1696-08,12,58020083,58020083,,12,58190232,58190232,,,?,hg19,?,
69975,Zhen_GenesDev_2013,TCGA-41-5651-01A-01D-1696-08,12,58195843,58195843,,12,58199635,58199635,,,?,hg19,?,
69976,Zhen_GenesDev_2013,TCGA-41-5651-01A-01D-1696-08,12,77453388,77453388,,12,78408686,78408686,,,?,hg19,?,
69977,Zhen_GenesDev_2013,TCGA-41-5651-01A-01D-1696-08,15,83551619,83551619,,15,83557672,83557672,,,?,hg19,?,


In [4]:
## Extract data
# Keep only the columns the parser needs; rows are read by column label below.
lines = data[['Chr1','start1','stop1','orientation1',
              'Chr2','start2','stop2','orientation2',
              'assembly', 'Study', 'type', 'Sample_name']]

breakpoints = {} #store breakpoints by chromosome (not filled in by this cell)
total_lines = 0 #number of lines in the file
failed = 0 #lines dropped because at least one coordinate failed LiftOver
success = 0 #lines written to the output file
skipped = 0 #individual coordinates that failed LiftOver (can exceed `failed`)
unique_id = Counter() #number of written lines per sample name

# Exact-match renaming of chromosome labels.
# Found 3 breaks relative to Chr6_qbl_hap2 and 2 relative to Chr6_cox_hap1:
# checked on the input file, seems to be ok to just convert them to Chr6.
_CHROMOSOME_ALIASES = {'23': 'X',
                       '24': 'Y',
                       '6_qbl_hap2': '6',
                       '6_cox_hap1': '6'}


def _normalise_chromosome(raw):
    """Return the chromosome label of `raw` as a stripped string, renamed
    through `_CHROMOSOME_ALIASES` when the whole label matches an alias.

    Matching the whole label (instead of replacing substrings) guarantees that
    a label which merely contains '23' or '24' is left untouched.
    """
    label = str(raw).strip()
    return _CHROMOSOME_ALIASES.get(label, label)


def _lift_position(converter, chromosome, position):
    """Convert one position with `converter` (a LiftOver instance).

    Returns the converted position, or None when the conversion is not a
    single, well-formed hit: `convert_coordinate` returns None for a chromosome
    it does not know, an empty list when the position cannot be mapped and
    several tuples when the mapping is ambiguous.
    """
    # NOTE(review): pyliftover documents its coordinates as 0-based; positions
    # are passed through unchanged here - confirm the spreadsheet's convention.
    hits = converter.convert_coordinate('chr' + chromosome, position)
    if hits is not None and len(hits) == 1 and len(hits[0]) == 4:
        return hits[0][1]
    return None


with open(output_file, 'w') as f:
    f.write(comment_line)
    f.write(header)

    for _, row in lines.iterrows():
        chr1 = _normalise_chromosome(row['Chr1'])
        s1   = int(row['start1'])
        e1   = int(row['stop1'])
        o1   = row['orientation1']

        chr2 = _normalise_chromosome(row['Chr2'])
        s2   = int(row['start2'])
        e2   = int(row['stop2'])
        o2   = row['orientation2']

        assembly    = row['assembly']
        source      = row['Study']
        sv_type     = row['type']
        sample_name = row['Sample_name']

        cancer_type = cancer_type_dict[source]

        ## LiftOver
        skip_line = False #To skip lines with coordinates that fail LiftOver
        if '19' not in assembly: #if not hg19 (the others are hg18, also Berger 2011)
            lifted = [_lift_position(lo, chromosome, position)
                      for chromosome, position in ((chr1, s1), (chr1, e1),
                                                   (chr2, s2), (chr2, e2))]
            unmapped = lifted.count(None)
            if unmapped:
                # Let's just not consider any line with one or more dubious
                # coordinates; every failed coordinate is counted.
                skip_line = True
                skipped += unmapped
            else:
                s1, e1, s2, e2 = lifted

        if skip_line:
            failed += 1
        else:
            out_line = '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(chr1,s1,e1,o1,
                                                                         chr2,s2,e2,o2,
                                                                         source,sample_name,sv_type,cancer_type)
            f.write(out_line)
            unique_id[sample_name] += 1
            success += 1
        total_lines += 1



# Summary of the parsing run. `failed` counts dropped lines, while `skipped`
# counts individual coordinates that failed LiftOver, so it can be larger.
report_template = '''
####REPORT####
dataset name :Mix
input file   :{}
output file  :{}

lines parsed :{}
succesfully  :{}
with errors  :{}
skipped      :{}

total breaks saved  :{}
unique IDs          :{}
'''

# Values in the order the template consumes them.
report_values = (
    input_file,
    output_file,
    total_lines,
    success,
    failed,
    skipped,
    success * 2,      # every written line carries two breakpoints
    len(unique_id),   # distinct sample names among the written lines
)

report = report_template.format(*report_values)
print(report)


####REPORT####
dataset name :Mix
input file   :Breakpoints_cancer_all.xlsx
output file  :dataset_Mix_2015-10-27.txt

lines parsed :69978
succesfully  :69938
with errors  :40
skipped      :61

total breaks saved  :139876
unique IDs          :562

