In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import re
import requests
import json

In [8]:
# Load the raw CNV table: one variant string per row together with its genome build
da = pd.read_csv('NCH-microarray-CNVs.csv')

In [9]:
da.head()

Unnamed: 0,variant,build
0,22q11.21q11.23(21798705_23751189)x3,GRCh37
1,7q35(146581450_146661741)x3,GRCh37
2,18p11.32(146484_408044)x1,GRCh37
3,3q25.1(150353937_150944378)x1,GRCh37
4,16p12.1(27864452_27995317)x3,GRCh37


In [10]:
len(da)

19106

### Remove duplicates

In [11]:
# Collapse repeated variant strings, keeping the first row seen for each
da = da.drop_duplicates(subset=['variant'])
da.shape[0]

14710

## Compute and check chromosome values

In [12]:
# Chromosome label = first run of digits/X/Y that is directly followed by an arm letter (p or q)
chromosome_pattern = re.compile(r"([\dXY]+)[pq]")
da['chromosome'] = da['variant'].map(lambda variant: chromosome_pattern.findall(variant)[0])

In [13]:
da['chromosome'].unique()

array(['22', '7', '18', '3', '16', 'X', '2', '17', '1', '20', '5', '6',
       '9', '14', '10', '11', '13', '12', '21', '4', '8', '15', '19', 'Y',
       '1919'], dtype=object)

In [14]:
# '1919' is not a valid chromosome label; normalise it to '19'
da['chromosome'] = da['chromosome'].replace({'1919': '19'})

## Compute start/stop values

In [15]:
# Start coordinate: the first digit/comma run that follows a "(", with the commas stripped
start_text = da['variant'].map(lambda variant: re.findall(r"\(([\d,]+)", variant)[0])
da['start'] = start_text.str.replace(',', '', regex=False).astype(int)

In [16]:
# Stop coordinate: the last digit/comma run that is closed by a ")", with the commas stripped
stop_text = da['variant'].map(lambda variant: re.findall(r"([\d,]+)\)", variant)[-1])
da['stop'] = stop_text.str.replace(',', '', regex=False).astype(int)

### Identify errors in start/stop values

In [17]:
# Inclusive span length; a value below 1 means stop lies before start
da['var_len'] = da['stop'] - da['start'] + 1

In [18]:
def count_endpoints(v):
    """Count the separate numbers inside the first parenthesised coordinate group of ``v``.

    Commas are removed before counting, so ``(1,234-5,678)`` counts as two
    numbers while ``(1,234,5,678)`` collapses into one.

    Parameters
    ----------
    v : str
        Variant string such as ``"7q35(146581450_146661741)x3"``.

    Returns
    -------
    int
        Number of digit runs found in the group; 0 when ``v`` contains no
        parenthesised group made up only of digits, commas, hyphens and
        underscores.
    """
    # Raw strings keep the regex escapes out of Python's own string-escape
    # processing (non-raw "\(" / "\d" are invalid escape sequences).
    groups = re.findall(r"\([\d,\-_]+\)", v)
    if not groups:
        return 0
    without_commas = groups[0].replace(',', '')
    return len(re.findall(r"\d+", without_commas))

# Number of coordinate values parsed from each variant.
# The column is kept unconditionally: the cells that follow filter on it, so
# conditionally deleting it here would make them fail with a KeyError.
da['#position_values'] = da['variant'].apply(count_endpoints)

#### Remove malformed variants: incorrect number of positions

In [19]:
da[(da['#position_values']!=2)]

Unnamed: 0,variant,build,chromosome,start,stop,var_len,#position_values
5252,"13q31.3(90,653,47091,471,740)x3",GRCh38,13,9065347091471740,9065347091471740,1,1
5803,"15q21.3(55,499,672,55,593,588)x1",GRCh38,15,5549967255593588,5549967255593588,1,1
5940,"1p36.33p34.3(1,341,185-35,888,147-35,888,147)x...",GRCh38,1,1341185,35888147,34546963,3
12101,"1p33(47,070,474-47,282-269)x3",GRCh37,1,47070474,269,-47070204,3
13758,"1q21.1(144,998,070-146,179-215)x1",GRCh37,1,144998070,215,-144997854,3
15978,"12q13.12(49,370,149-49-678,707)x3",GRCh38,12,49370149,678707,-48691441,3
16859,"20q13.2q13.32(52,096,110-57-389,480)x2 hmz",GRCh38,20,52096110,389480,-51706629,3
17417,"8q24.21q24.22(129,951,769-133-621,197)x2 hmz",GRCh38,8,129951769,621197,-129330571,3


In [20]:
# Retain only variants whose coordinate group parsed into exactly two numbers
has_two_positions = da['#position_values'].eq(2)
da = da.loc[has_two_positions].copy()
da.shape[0]

14702

#### Remove Malformed variants: start > stop

In [21]:
da[
    # negative inclusive length => start lies beyond stop
    (da['var_len']<0)].drop_duplicates('variant')

Unnamed: 0,variant,build,chromosome,start,stop,var_len,#position_values
4295,"6q22.1q23.2(1117,662,682-134,504,204)x2 hmz",GRCh38,6,1117662682,134504204,-983158477,2
4962,"6q16.1(95,032,6225-95,297,694)x3",GRCh38,6,950326225,95297694,-855028530,2
6081,10q22.3(793155275_79370971)x1,GRCh37,10,793155275,79370971,-713784303,2
6193,22q13.31q13.3(345611578_51178150)x1,GRCh37,22,345611578,51178150,-294433427,2
8303,9p24.1(9014741_5165768)x1,GRCh37,9,9014741,5165768,-3848972,2
10562,"9q22.2(914,974,185-92,471,302)x1",GRCh37,9,914974185,92471302,-822502882,2
13673,"15q26.3(98,111,496-97,413,981)x3",GRCh37,15,98111496,97413981,-697514,2
14905,"2q34(212,681,263-21,889,508)x1",GRCh37,2,212681263,21889508,-190791754,2
14938,"8q11.1q11.21(47,042,437-48,401,60)x3",GRCh37,8,47042437,4840160,-42202276,2
15142,"5q35.3(180,380,290-100,448,046)x3",GRCh37,5,180380290,100448046,-79932243,2


In [22]:
# var_len is the inclusive length (stop - start + 1), so any value below 1 means
# stop < start. Require a positive length so the stop == start - 1 case
# (var_len == 0) is dropped along with the negative ones.
da = da[da['var_len'] > 0].copy()
len(da)

14683

## Compute copy number/ranges for each variant

In [23]:
def _copy_number_bounds(copy_number):
    """Return ``(min, max)`` copies parsed from a string such as ``'3'``, ``'1~2'`` or ``'0-2'``.

    Raises
    ------
    ValueError
        If ``copy_number`` contains no digits at all.
    """
    # Read whole digit runs rather than single characters, so a multi-digit
    # copy number (e.g. '10') is not truncated to its first/last digit.
    counts = [int(token) for token in re.findall(r"\d+", copy_number)]
    if not counts:
        raise ValueError(f"no copy number found in {copy_number!r}")
    return counts[0], counts[-1]

# Text following ")" and an optional "x": a single count or a "~"/"-" separated range
da['copy_number'] = da['variant'].apply(lambda v: re.findall(r"\) ?x?[ ]?([\d~\-]*)",v)[0])
da['copy_number_min'] = da['copy_number'].apply(lambda cn: _copy_number_bounds(cn)[0])
da['copy_number_max'] = da['copy_number'].apply(lambda cn: _copy_number_bounds(cn)[1])

### Check copy number values

In [24]:
sorted(set(da['copy_number']))

['0',
 '0-2',
 '0~1',
 '1',
 '1-2',
 '1~2',
 '1~3',
 '2',
 '2-3',
 '2~3',
 '3',
 '3~4',
 '4',
 '4~5',
 '5~6']

In [25]:
sorted(set(da['copy_number_min']))

[0, 1, 2, 3, 4, 5]

In [26]:
sorted(set(da['copy_number_max']))

[0, 1, 2, 3, 4, 5, 6]

### Regions of Homozygosity ("hmz") variants not to be considered in this analysis
#### How many variants in our data with CN=2 are autosomal and not regions of homozygosity?

In [27]:
# Flag rows whose variant text contains the literal marker "hmz"
da['region_of_homozygosity'] = da['variant'].str.contains('hmz', regex=False)

In [28]:
da['region_of_homozygosity'].value_counts()

False    9017
True     5666
Name: region_of_homozygosity, dtype: int64

In [29]:
# Drop the rows flagged as regions of homozygosity
is_roh = da['region_of_homozygosity']
da = da.loc[~is_roh].copy()
da.shape[0]

9017

In [30]:
da[
    (da['copy_number']=='2') &
    (~da['chromosome'].isin(['X','Y']))
]

Unnamed: 0,variant,build,chromosome,start,stop,var_len,#position_values,copy_number,copy_number_min,copy_number_max,region_of_homozygosity
122,2p13.2p12(73227186_79042754)x2,GRCh37,2,73227186,79042754,5815569,2,2,2,2,False
4122,"7p22.2p12.3(3,264,728-46,071,829)x2",GRCh38,7,3264728,46071829,42807102,2,2,2,2,False
4396,"3p25.3p24.1(10,514,008-29,532,824)x2",GRCh38,3,10514008,29532824,19018817,2,2,2,2,False
5301,"8q12.3q21.13(64,242,384-83,258,323)x2",GRCh38,8,64242384,83258323,19015940,2,2,2,2,False
5329,"20q11.21q13.2(29,991,219-50,728,172)x2",GRCh38,20,29991219,50728172,20736954,2,2,2,2,False
5341,"10p15.3p15.1(307,807-5,146,667)x2",GRCh38,10,307807,5146667,4838861,2,2,2,2,False
7192,2q11.2q12.2(98703659_106160498)x2,GRCh37,2,98703659,106160498,7456840,2,2,2,2,False
7595,12q23.3q24.31(108904090_124118489)x2,GRCh37,12,108904090,124118489,15214400,2,2,2,2,False
15220,"11q24.1(122,984,093-123,106,251)x2",GRCh37,11,122984093,123106251,122159,2,2,2,2,False
15416,"6q21q22.3(114,543,815-129,022,028)x2",GRCh38,6,114543815,129022028,14478214,2,2,2,2,False


#### After inspection of the original reports, it appears that these are all actually regions of homozygosity; minor typos and defects in the regex matching strategy caused them not to be recognized as such
* "hmm" instead of "hmz"
* "," between "x2" and rest of variant, but "," was used as a tokenizing character between variant strings

In [31]:
# Discard autosomal rows reported with exactly two copies; X/Y rows are always kept
autosomal_two_copies = da['copy_number'].eq('2') & ~da['chromosome'].isin(['X', 'Y'])
da = da.loc[~autosomal_two_copies].copy()
da.shape[0]

9000

### Remove any variants in build GRCh36 (keep only GRCh37 and GRCh38)

In [32]:
# Keep only rows annotated with one of these two builds
supported_builds = ['GRCh37', 'GRCh38']
da = da.loc[da['build'].isin(supported_builds)]
da.shape[0]

8989

In [33]:
# Renumber rows 0..n-1 and discard the old index rather than keeping it as a column
da = da.reset_index(drop=True)

# Normalize NCH CNV Data

In [27]:
# Service endpoint (parsed_to_cn_var) that each CNV row is posted to
endpoint_cn = "http://variation-normalization-dev-eb.us-east-2.elasticbeanstalk.com/variation/parsed_to_cn_var?"

In [28]:
# Seconds to wait on a single request before giving up
REQUEST_TIMEOUT_S = 60

# One record per CNV: the input row merged with the fields returned by the service
norm_responses = []
# A single Session reuses the HTTP connection across the per-variant requests
with requests.Session() as session:
    for _, cnv in tqdm(da.iterrows(), total=len(da)):
        args = {
            'assembly': cnv['build'],
            'chromosome': f"chr{cnv['chromosome']}",
            'start0': int(cnv['start']),
            'start_pos_type': 'Number',
            'end0': int(cnv['stop']),
            'end_pos_type': 'Number'
        }
        # A single copy count is sent as a Number, a min/max pair as a DefiniteRange
        args['copies0'] = int(cnv['copy_number_min'])
        if cnv['copy_number_min'] == cnv['copy_number_max']:
            args['copies_type'] = 'Number'
        else:
            args['copies1'] = int(cnv['copy_number_max'])
            args['copies_type'] = 'DefiniteRange'

        args['do_liftover'] = True
        args['untranslatable_returns_text'] = False

        # Without a timeout a stalled connection would block the loop indefinitely
        response = session.post(endpoint_cn, json=args, timeout=REQUEST_TIMEOUT_S)

        rec = dict(cnv)
        rec.update(response.json())
        norm_responses.append(rec)

  0%|          | 0/8989 [00:00<?, ?it/s]

In [29]:
# Save the collected per-variant records as JSON
with open('NCH-normalizer-results.json', 'w') as results_file:
    json.dump(norm_responses, results_file)

In [30]:
# Flatten the nested response records into a table with dot-separated column names
dr = pd.json_normalize(norm_responses)
dr.head()

Unnamed: 0,variant,build,chromosome,start,stop,var_len,#position_values,copy_number,copy_number_min,copy_number_max,...,copy_number_count.subject.sequence_id,copy_number_count.subject.start.type,copy_number_count.subject.start.value,copy_number_count.subject.end.type,copy_number_count.subject.end.value,copy_number_count.copies.type,copy_number_count.copies.value,copy_number_count.copies.min,copy_number_count.copies.max,copy_number_count
0,22q11.21q11.23(21798705_23751189)x3,GRCh37,22,21798705,23751189,1952485,2,3,3,3,...,ga4gh:SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ,Number,21444415.0,Number,23409002.0,Number,3.0,,,
1,7q35(146581450_146661741)x3,GRCh37,7,146581450,146661741,80292,2,3,3,3,...,ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul,Number,146884357.0,Number,146964649.0,Number,3.0,,,
2,18p11.32(146484_408044)x1,GRCh37,18,146484,408044,261561,2,1,1,1,...,ga4gh:SQ.vWwFhJ5lQDMhh-czg06YtlWqu0lvFAZV,Number,146483.0,Number,408044.0,Number,1.0,,,
3,3q25.1(150353937_150944378)x1,GRCh37,3,150353937,150944378,590442,2,1,1,1,...,ga4gh:SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX,Number,150636149.0,Number,151226590.0,Number,1.0,,,
4,16p12.1(27864452_27995317)x3,GRCh37,16,27864452,27995317,130866,2,3,3,3,...,ga4gh:SQ.yC_0RBj3fgBlvgyAuycbzdubtLxq-rE0,Number,27853130.0,Number,27983996.0,Number,3.0,,,


In [31]:
# Join each row's warnings into a single ';'-separated string
# NOTE(review): assumes every response carries a 'warnings' list of strings — confirm
dr['warning_string'] = dr['warnings'].apply(';'.join)

In [32]:
def reduce_warning(warning):
    """Replace variant-specific details in a warning with placeholders.

    This lets warnings that differ only in their coordinates, chromosome or
    sequence identifier compare equal.

    Parameters
    ----------
    warning : str
        Warning text (possibly several warnings joined together).

    Returns
    -------
    str
        The text with ``ga4gh:SQ.<digest>`` replaced by ``ga4gh:SQ.*``,
        ``chr<name>`` by ``chr#``, ``(<digits>)`` by ``#`` and
        ``pos <digits>`` by ``pos #``.
    """
    # Raw strings: the non-raw originals relied on invalid escape sequences
    # such as '\(' and '\d', which newer Python versions warn about.
    warning = re.sub(r'ga4gh:SQ\.[A-Za-z0-9_-]+', 'ga4gh:SQ.*', warning)
    warning = re.sub(r'chr[XY\d]+', 'chr#', warning)
    warning = re.sub(r'\(\d+\)', '#', warning)
    warning = re.sub(r'pos \d+', 'pos #', warning)

    return warning

In [33]:
# Tally how often each generalised warning text occurs
reduced_warnings = dr['warning_string'].map(reduce_warning)
dr['warning_string_reduce'] = reduced_warnings
dr['warning_string_reduce'].value_counts()

                                         8718
Unable to liftover: chr# with pos #       208
Position # is not valid on ga4gh:SQ.*      63

In [134]:
### get the normalized lifted over coordinates
# These assignments align on index labels. da was renumbered 0..n-1 before the
# request loop, and dr is built from one record per da row in iteration order,
# so label i in dr corresponds to label i in da.
da['start_38'] = dr['copy_number_count.subject.start.value']
da['stop_38'] = dr['copy_number_count.subject.end.value']

In [142]:
### restrict to variants with valid normalized position
# Rows missing either lifted coordinate are dropped; the remaining values are cast to int
lifted_columns = ['start_38', 'stop_38']
da = da.dropna(subset=lifted_columns).astype({column: int for column in lifted_columns})

In [143]:
len(da)

8718

In [144]:
# Write the cleaned table: source and lifted-over coordinates plus the copy-number fields
export_columns = [
    'variant', 'build', 'chromosome',
    'start', 'stop', 'start_38', 'stop_38',
    'copy_number', 'copy_number_min', 'copy_number_max',
]
da[export_columns].to_csv('NCH-microarray-CNVs-cleaned.csv', index=False)