In [1]:
import os
import sys
import argparse
import pandas as pd
import csv
import numpy as np

from IPython.display import display, HTML
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
# Allow up to 2000 items when long sequences (e.g. a column Index) are printed.
pd.options.display.max_seq_items = 2000
# Widen the notebook container to 90% of the browser window.
display(HTML("<style>.container {width:90% !important;}</style>"))
# Machine-specific absolute path added to the import search path; presumably
# where the all_funx and maplib modules live -- adjust when running elsewhere.
sys.path.append("/Users/mariapalafox/Desktop/Toolbox")
from all_funx import *
from maplib import *

# MAP CADDv1.4 annotations (for detected & non-detected CK pos) to dbNSFP score files

(12.27.19 mfpfox on local, other M part on hoffman)

### VEP VERSION NOTE: 
    - CADDv1.4 using VEPv92
    - dbNSFP using VEPv94

**hoffman PATH : /u/home/m/mfpalafo/project-arboleda/CADD/PULL_annotations**
    * DIR downloaded locally for part 3 QC "RESULT_pos_overlap_dbNSFPcoordinates/"
    * hoffman files have all consequences

**hoffman M markdown has code for:** 
- [1] chr chunks
- [2] search pos against CADD files

**this M markdown has code for:** 
- [3] filter CADDv1.4 files for MISSENSE consequence only, format AA col and pos_id col
- [4] simplify dbNSFP cols
- [5] merge missense cadd files & merge with dbNSFP score files
- [6] QC merge CADD+dbNSFP annotation files

| [1] | [2] | [3] | 
|----|----|----- |
| break up search files with det / not det coordinates into chr chunks | search dbNSFP derived coordinates for CK codon positions against CADDannotations(GRCh37 and GRCh38 model) | filter CADDv1.4 files w/ dbNSFP CK positions for MISSENSE consequence only, add A/A column, and pos_id |


| [4] | [5] | [6] |
| ----|-----|----|
 |simplify dbNSFP detected and notdect files for select columns | merge missense cadd files & merge with dbNSFP score files | QC merge CADD+dbNSFP annotation files |

# [5] CADDv1.4 Missense merge chr files together 

#### from append.sh
```bash
#!/bin/bash
# keeps header and appends all rows together
# for Pmap dbNSFP CADD mapped detected and non detected chr merge

# 19 NOT DETECTED FILES
awk -F ',' 'FNR==1 && NR!=1 { while (/^pos_hg19/) getline; } 1 {print}' MISSENSE_chr*_CADD_GRCh37_NOT_DETECTED_CK.csv > SCORE_CADD_37_NOT_detected_CK.csv

# 19 DETECTED FILES
awk -F ',' 'FNR==1 && NR!=1 { while (/^pos_hg19/) getline; } 1 {print}' MISSENSE_chr*_CADD_GRCh37_DETECTED_CK.csv > SCORE_CADD_37_detected_CK.csv

# same for hg38 files just replace hg19 with hg38 and file names

```

---


# CADD results {line count} {filename}

### DETECTED hg19 : 
* 106,176 SCORE_CADD_37_detected_CK.csv (1,700 more rows than dbNSFP)

### DETECTED hg38  : 
* 106,816 SCORE_CADD_38_detected_CK.csv (2,340 more rows than dbNSFP)

### NOT DECT hg19 : 
* 1,234,967 SCORE_CADD_37_NOT_detected_CK.csv (12,055 more rows than dbNSFP)

### NOT DECT hg38 :
* 1,240,470 SCORE_CADD_38_NOT_detected_CK.csv (17,558 more rows than dbNSFP)


# dbNSFP results:
### DETECTED : 
* 104,476 SCORE_dbNSFP_selectcols_detected_CK_104475.csv

### NOT DECT : 
* 1,222,912 SCORE_dbNSFP_selectcols_NOT_detected_CK_1222911.csv

In [2]:
# Work from the MERGE_CHR directory and list the files it contains.
# The path is absolute and machine-specific.
os.chdir("/Users/mariapalafox/Box Sync/CODE_DATA/dir_MAPpaper/CADDmapped/RESULT_CADDv14_pos_overlap_dbNSFPcoordinates/MISSENSE_FILTERED/MERGE_CHR")
print(os.listdir())

['SCORE2_dbNSFP_selectcols_NOT_detected_CK_1222911.csv', 'SCORE_dbNSFP_selectcols_detected_CK_104475.csv', 'SCORE2_CADD_37_NOT_detected_CK.csv', 'DUPLICATE_great_posid_detected19_cadd_1786.csv', 'SCORE2_CADD_38_NOT_detected_CK.csv', 'SCORE_CADD_37_detected_CK.csv', 'SCORE_CADD_38_detected_CK.csv', 'SCORE2_CADD_37_detected_CK.csv', 'dbnsfp_featureID_notdetected_description.csv', 'SCORE_dbNSFP_selectcols_NOT_detected_CK_1222911.csv', 'SCORE_CADD_38_NOT_detected_CK.csv', 'SCORE2_CADD_38_detected_CK.csv', 'dbnsfp_featureID_detected_description.csv', 'SCORE_CADD_37_NOT_detected_CK.csv', 'SCORE2_dbNSFP_selectcols_detected_CK_104475.csv']


---
---

# REDUNDANCY in the CADD pos_id columns is a problem for merging with dbNSFP files

In [4]:
# Load the four merged CADD score files: detected / not detected,
# each for the 37 (hg19) and 38 (hg38) coordinate sets.
det19=pd.read_csv('SCORE_CADD_37_detected_CK.csv', low_memory=False)
det38=pd.read_csv('SCORE_CADD_38_detected_CK.csv',  low_memory=False)
not19=pd.read_csv('SCORE_CADD_37_NOT_detected_CK.csv', low_memory=False)
not38=pd.read_csv('SCORE_CADD_38_NOT_detected_CK.csv',  low_memory=False)

In [6]:
# For each hg19 CADD frame print the number of rows, then the number of
# distinct pos_id19 values; a smaller second number means pos_id19 alone
# does not identify a row uniquely.
dfls = [det19, not19]
for f in dfls:
    print(len(list(f.pos_id19)))
    print(len(set(f.pos_id19)))
    


106175
104225
1234966
1217997


In [7]:
# Same check for the hg38 frames: row count followed by the number of
# distinct pos_id38 values.
dfls = [det38,  not38]
for f in dfls:
    print(len(list(f.pos_id38)))
    print(len(set(f.pos_id38)))

106815
104185
1240469
1218415


In [5]:
dfls = [det19, det38, not19, not38]

# Annotation columns removed from every CADD frame before de-duplicating rows.
dropme = ['Type', 'Length', 'AnnoType','ConsScore', 'ConsDetail','oAA', 'nAA', 'GeneID',
       'FeatureID', 'GeneName', 'CCDS', 'Intron', 'Exon', 'cDNApos',
       'relcDNApos', 'CDSpos', 'relCDSpos', 'protPos', 'relProtPos', 'Domain',
       'Dst2Splice', 'Dst2SplType', 'minDistTSS', 'minDistTSE', 'SIFTcat',
       'SIFTval', 'PolyPhenCat', 'PolyPhenVal', 'Amino_acids']

# For each frame print its shape before the column drop, after it, and after
# removing duplicate rows. Both steps use inplace=True, so det19, det38, not19
# and not38 themselves are modified.
# NOTE(review): re-running this cell on already-trimmed frames presumably
# fails because the dropped columns no longer exist -- confirm.
for i in dfls:
    print(i.shape)
    i.drop(dropme, axis=1, inplace=True)
    print(i.shape)
    i.drop_duplicates(keep='first', inplace = True)
    print(i.shape)
    print()

Index(['pos_hg19', 'chr', 'pos_ID', 'Ref', 'Alt', 'Type', 'Length', 'AnnoType',
       'Consequence', 'ConsScore', 'ConsDetail', 'GC', 'CpG', 'motifECount',
       'motifEName', 'motifEHIPos', 'motifEScoreChng', 'oAA', 'nAA', 'GeneID',
       'FeatureID', 'GeneName', 'CCDS', 'Intron', 'Exon', 'cDNApos',
       'relcDNApos', 'CDSpos', 'relCDSpos', 'protPos', 'relProtPos', 'Domain',
       'Dst2Splice', 'Dst2SplType', 'minDistTSS', 'minDistTSE', 'SIFTcat',
       'SIFTval', 'PolyPhenCat', 'PolyPhenVal', 'priPhCons', 'mamPhCons',
       'verPhCons', 'priPhyloP', 'mamPhyloP', 'verPhyloP', 'GerpRS',
       'GerpRSpval', 'GerpN', 'GerpS', 'Grantham', 'Dist2Mutation',
       'Freq100bp', 'Rare100bp', 'Sngl100bp', 'Freq1000bp', 'Rare1000bp',
       'Sngl1000bp', 'Freq10000bp', 'Rare10000bp', 'Sngl10000bp', 'RawScore',
       'PHRED', 'pos_id19', 'Amino_acids'],
      dtype='object')

Index(['pos_hg38', 'chr', 'pos_ID', 'Ref', 'Alt', 'Type', 'Length', 'AnnoType',
       'Consequence', 'ConsScor

---
---
---


# key versions
### super format == { posid# _ AApos _ ENST }
### great format == { posid# _ AApos _ aminoacids }
### alright format == { posid# _ aminoacids }

**goal 1:1 mapping btw CADD and dbnsfp, recovering all positions in dbNSFP file too**

### column names:

**cadd files:**
- 'pos_id19' or 'pos_id38' & 'protPos' & 'FeatureID'

**dbNSFP files:**
- 'pos_id19' or 'pos_id38' & 'matched_aapos' & 'FeatureID'

In [24]:
def super_pos_id(df, posid, aapos, assembly):
    """Add a ``super_pos_id19`` / ``super_pos_id38`` key column to ``df``.

    The key has the format ``<posid>_<aapos>_<FeatureID>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate. It is modified in place and must contain a
        ``FeatureID`` column.
    posid : str
        Name of the column holding the position id.
    aapos : str
        Name of the column holding the amino-acid position.
    assembly : int
        37 writes ``super_pos_id19``; 38 writes ``super_pos_id38``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the key column added.

    Raises
    ------
    ValueError
        If ``assembly`` is neither 37 nor 38.
    """
    key_columns = {37: 'super_pos_id19', 38: 'super_pos_id38'}
    if assembly not in key_columns:
        # Fail loudly: returning None here would silently overwrite the
        # caller's frame in the usual ``df = super_pos_id(df, ...)`` pattern.
        raise ValueError("assembly must be 37 or 38, got %r" % (assembly,))
    df.loc[:, key_columns[assembly]] = (
        df[posid].astype(str) + '_' + df[aapos].astype(str) + '_' + df['FeatureID']
    )
    return df

def great_pos_id(df, posid, aapos, assembly):
    """Add a ``great_pos_id19`` / ``great_pos_id38`` key column to ``df``.

    The key has the format ``<posid>_<aapos>_<Amino_acids>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate. It is modified in place and must contain an
        ``Amino_acids`` column.
    posid : str
        Name of the column holding the position id.
    aapos : str
        Name of the column holding the amino-acid position.
    assembly : int
        37 writes ``great_pos_id19``; 38 writes ``great_pos_id38``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the key column added.

    Raises
    ------
    ValueError
        If ``assembly`` is neither 37 nor 38.
    """
    key_columns = {37: 'great_pos_id19', 38: 'great_pos_id38'}
    if assembly not in key_columns:
        # Fail loudly: returning None here would silently overwrite the
        # caller's frame in the usual ``df = great_pos_id(df, ...)`` pattern.
        raise ValueError("assembly must be 37 or 38, got %r" % (assembly,))
    df.loc[:, key_columns[assembly]] = (
        df[posid].astype(str) + '_' + df[aapos].astype(str) + '_' + df['Amino_acids']
    )
    return df

def alright_pos_id(df, posid, assembly):
    """Add an ``alright_pos_id19`` / ``alright_pos_id38`` key column to ``df``.

    The key has the format ``<posid>_<Amino_acids>``; no amino-acid position
    is included.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate. It is modified in place and must contain an
        ``Amino_acids`` column.
    posid : str
        Name of the column holding the position id.
    assembly : int
        37 writes ``alright_pos_id19``; 38 writes ``alright_pos_id38``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the key column added.

    Raises
    ------
    ValueError
        If ``assembly`` is neither 37 nor 38.
    """
    key_columns = {37: 'alright_pos_id19', 38: 'alright_pos_id38'}
    if assembly not in key_columns:
        # Fail loudly: returning None here would silently overwrite the
        # caller's frame in the usual ``df = alright_pos_id(df, ...)`` pattern.
        raise ValueError("assembly must be 37 or 38, got %r" % (assembly,))
    df.loc[:, key_columns[assembly]] = df[posid].astype(str) + '_' + df['Amino_acids']
    return df
    
def unique_df(df19, df38, dbnsfp, colname19, colname38):
    """Report key counts for both CADD frames and for the dbNSFP frame.

    Calls ``uniqueCount`` on ``df19[colname19]`` and ``df38[colname38]``,
    then on both key columns of ``dbnsfp``, printing a banner before each
    group. Returns None.
    """
    cadd_reports = (
        ("hg19: ", df19, colname19),
        ("hg38: ", df38, colname38),
    )
    for banner, frame, column in cadd_reports:
        print(banner)
        uniqueCount(frame, column)
    print("dbNSFP file both assemblies, 19 and 38: ")
    for column in (colname19, colname38):
        uniqueCount(dbnsfp, column)

## dbNSFP adding KEYS :

```python
# Load the detected / not-detected dbNSFP score files, add the super, great
# and alright key columns for both coordinate sets, and write the frames back
# to the same SCORE2 file names (the input files are overwritten).
dbnsfp = pd.read_csv('SCORE2_dbNSFP_selectcols_detected_CK_104475.csv', low_memory=False)
print(dbnsfp.shape)
notdbnsfp = pd.read_csv('SCORE2_dbNSFP_selectcols_NOT_detected_CK_1222911.csv', low_memory=False)
print(notdbnsfp.shape)

# SUPER DETECTED
dbnsfp = super_pos_id(dbnsfp, 'pos_id19', 'matched_aapos', 37)
dbnsfp = super_pos_id(dbnsfp, 'pos_id38', 'matched_aapos', 38)
# SUPER NOTDETECTED
notdbnsfp = super_pos_id(notdbnsfp, 'pos_id19', 'matched_aapos', 37)
notdbnsfp = super_pos_id(notdbnsfp, 'pos_id38', 'matched_aapos', 38)

# GREAT DETECTED
dbnsfp = great_pos_id(dbnsfp, 'pos_id19', 'matched_aapos', 37)
dbnsfp = great_pos_id(dbnsfp, 'pos_id38', 'matched_aapos', 38)
# GREAT NOTDETECTED
notdbnsfp = great_pos_id(notdbnsfp, 'pos_id19', 'matched_aapos', 37)
notdbnsfp = great_pos_id(notdbnsfp, 'pos_id38', 'matched_aapos', 38)

# ALRIGHT DETECTED
dbnsfp = alright_pos_id(dbnsfp, 'pos_id19', 37)
dbnsfp = alright_pos_id(dbnsfp, 'pos_id38', 38)
# ALRIGHT NOTDETECTED
notdbnsfp = alright_pos_id(notdbnsfp, 'pos_id19', 37)
notdbnsfp = alright_pos_id(notdbnsfp, 'pos_id38', 38)

# Save with the new key columns; index=False keeps the row index out of the CSV.
dbnsfp.to_csv('SCORE2_dbNSFP_selectcols_detected_CK_104475.csv', index=False)
notdbnsfp.to_csv('SCORE2_dbNSFP_selectcols_NOT_detected_CK_1222911.csv', index=False)
```


## CADD adding keys:

### DETECTED FILES
```python
# Load the detected CADD files, add the great and alright key columns, and
# save the result under the SCORE2_ prefix.
det19 = pd.read_csv('SCORE_CADD_37_detected_CK.csv', low_memory=False)
det38 = pd.read_csv('SCORE_CADD_38_detected_CK.csv',  low_memory=False)
# aa pos as int
# NOTE(review): astype(int) assumes protPos has no missing values -- confirm.
det19.protPos = det19.protPos.astype(int)
det38.protPos = det38.protPos.astype(int)

# CADD DETECTED GREAT
det19 = great_pos_id(det19, 'pos_id19', 'protPos', 37)
det38 = great_pos_id(det38, 'pos_id38', 'protPos', 38)

# CADD DETECTED ALRIGHT
det19 = alright_pos_id(det19, 'pos_id19', 37)
det38 = alright_pos_id(det38, 'pos_id38', 38)

# saving 
det19.to_csv('SCORE2_CADD_37_detected_CK.csv', index=False)
det38.to_csv('SCORE2_CADD_38_detected_CK.csv',  index=False)
```

### NOT DETECTED FILES
```python
# Load the not-detected CADD files, add the great and alright key columns,
# and save the result under the SCORE2_ prefix.
not19 = pd.read_csv('SCORE_CADD_37_NOT_detected_CK.csv', low_memory=False)
not38 = pd.read_csv('SCORE_CADD_38_NOT_detected_CK.csv',  low_memory=False)
# aa pos as int
# NOTE(review): astype(int) assumes protPos has no missing values -- confirm.
not19.protPos = not19.protPos.astype(int)
not38.protPos = not38.protPos.astype(int)

# CADD NOT DETECTED GREAT
not19 = great_pos_id(not19, 'pos_id19', 'protPos', 37)
not38 = great_pos_id(not38, 'pos_id38', 'protPos', 38)

# CADD NOT DETECTED ALRIGHT
not19 = alright_pos_id(not19, 'pos_id19',  37)
not38 = alright_pos_id(not38, 'pos_id38',  38)

# saving 
not19.to_csv('SCORE2_CADD_37_NOT_detected_CK.csv', index=False)
not38.to_csv('SCORE2_CADD_38_NOT_detected_CK.csv',  index=False)
```

In [46]:
# simple df for merge: keep only the alright key column of each frame
dbnsfpkey19 = dbnsfp[['alright_pos_id19']]
dbnsfpkey38 = dbnsfp[['alright_pos_id38']]
det19key = det19[['alright_pos_id19']]
det38key = det38[['alright_pos_id38']]

# Inner join on the key: only keys present in both frames are kept, and a key
# that occurs several times on either side produces several merged rows.
det19merge = pd.merge(dbnsfpkey19, det19key, on=['alright_pos_id19'], how='inner')
print('detected 19 merge')
print(dbnsfpkey19.shape)
print(det19merge.shape)
print()
print('detected 38 merge')
det38merge = pd.merge(dbnsfpkey38, det38key, on=['alright_pos_id38'], how='inner')
print(dbnsfpkey38.shape)
print(det38merge.shape)

detected 19 merge
(104475, 1)
(105091, 1)

detected 38 merge
(104475, 1)
(105623, 1)


# GREAT KEY MERGER Resulted in 

### detected CK dbnsfp - cadd19
### 104475 - 95543 = 8932

### detected CK dbnsfp - cadd38
### 104475 - 92834 = 11641

### CADD key results (SUCCESS that all great keys are unique):
```python
unique_df(det19, det38, dbnsfp, 'great_pos_id19', 'great_pos_id38')
# all unique same for notdetected set
```

### MERGE dbNSFP + CADD on 'great_pos_id19' | 'great_pos_id38'
```python
# simple df for merge: keep only the great key column of each frame
dbnsfpkey19 = dbnsfp[['great_pos_id19']]
dbnsfpkey38 = dbnsfp[['great_pos_id38']]
det19key = det19[['great_pos_id19']]
det38key = det38[['great_pos_id38']]
# Inner join on the key: only keys present in both frames are kept.
det19merge = pd.merge(dbnsfpkey19, det19key, on=['great_pos_id19'], how='inner')
print('det 19 merge')
print(dbnsfpkey19.shape)
print(det19merge.shape)
print()
print('det 38 merge')
det38merge = pd.merge(dbnsfpkey38, det38key, on=['great_pos_id38'], how='inner')
print(dbnsfpkey38.shape)
print(det38merge.shape)
det 19 merge
(104475, 1)
(95543, 1)

det 38 merge
(104475, 1)
(92834, 1)
```


# SUPER KEY MERGER Resulted in 20k positions lost

### DETECTED CADD to dbNSFP merge inner on super_pos_id KEY:
    #### 25,158 less rows with det19 cadd merge to dbnsfp (79317 rows in merge)
    #### 26,131 less rows with det38 cadd merge to dbnsfp (78344 rows in merge)
    
### CADD key results (SUCCESS that all super keys are unique):
```python
unique_df(det19, det38, dbnsfp, 'super_pos_id19', 'super_pos_id38')
# output:
    hg19: 
    len of col:  106175
    len of col set:  106175
    hg38: 
    len of col:  106815
    len of col set:  106815
    dbNSFP file both assemblies, 19 and 38: 
    len of col:  104475
    len of col set:  104475
```

### MERGE dbNSFP + CADD on 'super_pos_id19' | 'super_pos_id38'
```python
# simple df for merge: keep only the super key column of each frame
dbkey19 = dbnsfp[['super_pos_id19']]
dbkey38 = dbnsfp[['super_pos_id38']]
det19key = det19[['super_pos_id19']]
det38key = det38[['super_pos_id38']]
# Inner join; no `on` is given, so the merge uses the column name the two
# single-column frames share (the super key).
det_19_mer = pd.merge(dbkey19, det19key, how='inner')
det_38_mer = pd.merge(dbkey38, det38key, how='inner')
print(det_19_mer)
print()
print(det_38_mer)

```

### CADD processing- check for duplicate values:
```python

# all rows with duplicated values: group by the great key and keep every
# group that has more than one row
# NOTE(review): pd.concat presumably raises if no group qualifies -- confirm.
dup38 = pd.concat(d for _, d in det38.groupby("great_pos_id38") if len(d) > 1)
dup38
# all rows with duplicated values, same check for the hg19 frame
dup = pd.concat(d for _, d in det19.groupby("great_pos_id19") if len(d) > 1)
dup
#dup.to_csv("DUPLICATE_great_posid_detected19_cadd_1786.csv", index=False)
```

```python
## CADD adding keys:
# Reload each SCORE2 CADD file, rebuild its great key, drop the columns listed
# below, remove duplicate rows, and overwrite the SCORE2 file.
# Columns dropped from the hg19 frames.
dropme19 = ['AnnoType','ConsScore', 'ConsDetail', 'GeneID','FeatureID', 'GeneName', 'CCDS', 'Intron', 'Exon', 'cDNApos',
       'relcDNApos', 'CDSpos', 'relCDSpos', 'relProtPos', 'Domain','Dst2Splice', 'Dst2SplType', 'minDistTSS', 'minDistTSE', 'SIFTcat',
       'SIFTval', 'PolyPhenCat', 'PolyPhenVal', 'super_pos_id19']

# Columns dropped from the hg38 frames (same list, hg38 key name).
dropme38 = ['AnnoType','ConsScore', 'ConsDetail', 'GeneID','FeatureID', 'GeneName', 'CCDS', 'Intron', 'Exon', 'cDNApos',
       'relcDNApos', 'CDSpos', 'relCDSpos', 'relProtPos', 'Domain','Dst2Splice', 'Dst2SplType', 'minDistTSS', 'minDistTSE', 'SIFTcat',
       'SIFTval', 'PolyPhenCat', 'PolyPhenVal', 'super_pos_id38']

det19 = pd.read_csv('SCORE2_CADD_37_detected_CK.csv', low_memory=False)
det19.protPos = det19.protPos.astype(int)
det19 = great_pos_id(det19, 'pos_id19', 'protPos', 37)
det38 = pd.read_csv('SCORE2_CADD_38_detected_CK.csv',  low_memory=False)
det38.protPos = det38.protPos.astype(int)
det38 = great_pos_id(det38, 'pos_id38', 'protPos', 38)

dfls = [det19]
for i in dfls:
    i.drop(dropme19, axis = 1, inplace=True) 
    print(i.shape)
    # drop rows that are identical in every column, keeping the first occurrence
    i.drop_duplicates(keep='first', inplace = True)
    print(i.shape)
    print()
dfls = [det38]
for i in dfls:
    i.drop(dropme38, axis = 1, inplace=True) 
    print(i.shape)
    # drop rows that are identical in every column, keeping the first occurrence
    i.drop_duplicates(keep='first', inplace = True)
    print(i.shape)
    print()

### NOT DETECTED FILES
not19 = pd.read_csv('SCORE2_CADD_37_NOT_detected_CK.csv', low_memory=False)
not19.protPos = not19.protPos.astype(int)
not19 = great_pos_id(not19, 'pos_id19', 'protPos', 37)
not38 = pd.read_csv('SCORE2_CADD_38_NOT_detected_CK.csv',  low_memory=False)
not38.protPos = not38.protPos.astype(int)
not38 = great_pos_id(not38, 'pos_id38', 'protPos', 38)

dfls = [not19]
for i in dfls:
    i.drop(dropme19, axis = 1, inplace=True) 
    print(i.shape)
    # drop rows that are identical in every column, keeping the first occurrence
    i.drop_duplicates(keep='first', inplace = True)
    print(i.shape)
    print()
dfls = [not38]
for i in dfls:
    i.drop(dropme38, axis = 1, inplace=True) 
    print(i.shape)
    # drop rows that are identical in every column, keeping the first occurrence
    i.drop_duplicates(keep='first', inplace = True)
    print(i.shape)
    print()

# SAVING ALL FILES (overwrites the SCORE2 inputs read above)
det19.to_csv('SCORE2_CADD_37_detected_CK.csv', index=False)
det38.to_csv('SCORE2_CADD_38_detected_CK.csv',  index=False)
not19.to_csv('SCORE2_CADD_37_NOT_detected_CK.csv', index=False)
not38.to_csv('SCORE2_CADD_38_NOT_detected_CK.csv',  index=False)
```

## unique great_pos_id counts post dropping 'dropme' columns and dropping duplicate rows

|  | dbNSFP count | CADD19 | CADD38 |
|--|---------------| -------| -----|
| DETECTED | 104475 | 105250 | 105793 |
| NOT DECT | 1222911 | 1226678 | 1231490 |

- all CADD rows greater than dbNSFP row count
- all df's from CADD and dbNSFP have unique great_pos_id, ready to merge!

# (done) CADD file correcting pos_id # format (add leading 0's)
- 9 digits with leading zeros for pos_id19 or pos_id38

```python 
def create_pos_id19(df, chrr, pos, ref, alt):
    """Write a ``pos_id19`` column of the form ``<chr>_<pos>_<ref>_<alt>``.

    ``chrr``, ``pos``, ``ref`` and ``alt`` are column names of ``df``; each
    column is converted to str before joining. ``df`` is modified in place
    and returned.
    """
    pieces = [df[name].astype(str) for name in (chrr, pos, ref, alt)]
    key = pieces[0]
    for piece in pieces[1:]:
        key = key + '_' + piece
    df.loc[:, 'pos_id19'] = key
    return df

def create_pos_id38(df, chrr, pos, ref, alt):
    """Write a ``pos_id38`` column of the form ``<chr>_<pos>_<ref>_<alt>``.

    ``chrr``, ``pos``, ``ref`` and ``alt`` are column names of ``df``; each
    column is converted to str before joining. ``df`` is modified in place
    and returned.
    """
    pieces = [df[name].astype(str) for name in (chrr, pos, ref, alt)]
    key = pieces[0]
    for piece in pieces[1:]:
        key = key + '_' + piece
    df.loc[:, 'pos_id38'] = key
    return df

# dbNSFP FILES
dbnsfp = pd.read_csv('SCORE_dbNSFP_selectcols_detected_CK_104475.csv', low_memory=False)
notdbnsfp = pd.read_csv('SCORE_dbNSFP_selectcols_NOT_detected_CK_1222911.csv', low_memory=False)

# CADD FILES
# The converter left-pads the position column with '0' to a width of 9
# characters while reading, so the column is loaded as text.
det19 = pd.read_csv('SCORE_CADD_37_detected_CK.csv', low_memory=False, converters={'pos_hg19': '{:0>9}'.format})
det38 = pd.read_csv('SCORE_CADD_38_detected_CK.csv', low_memory=False, converters={'pos_hg38': '{:0>9}'.format})
not19 = pd.read_csv('SCORE_CADD_37_NOT_detected_CK.csv', low_memory=False, converters={'pos_hg19': '{:0>9}'.format})
not38 = pd.read_csv('SCORE_CADD_38_NOT_detected_CK.csv', low_memory=False, converters={'pos_hg38': '{:0>9}'.format})
```

## calling function to create pos id with correct # format:
```python
# cadd detected files: build pos_id19 / pos_id38 from chr, position, Ref, Alt
det19 = create_pos_id19(det19, 'chr', 'pos_hg19', 'Ref', 'Alt')
det38 = create_pos_id38(det38, 'chr', 'pos_hg38', 'Ref', 'Alt')

# cadd NOTdetected files
not19 = create_pos_id19(not19, 'chr', 'pos_hg19', 'Ref', 'Alt')
not38 = create_pos_id38(not38, 'chr', 'pos_hg38', 'Ref', 'Alt')

# saving files with correct pos_id (writes to the SCORE_CADD_* file names)
det19.to_csv('SCORE_CADD_37_detected_CK.csv', index=False)
det38.to_csv('SCORE_CADD_38_detected_CK.csv',  index=False)
not19.to_csv('SCORE_CADD_37_NOT_detected_CK.csv', index=False)
not38.to_csv('SCORE_CADD_38_NOT_detected_CK.csv',  index=False)
```

---
---
---


---

# (done) pt.5  1st making ENST column for dbNSFP files, 
### 'FeatureID' col already in rows of CADD file and may help make super_posid key a 1:1 mapping


```python
# header col index positions: map each column name to its 0-based position
# in the detected (d) and not-detected (d2) dbNSFP frames
header = dbnsfp.columns
header2 = notdbnsfp.columns
d = {header[i] : i for i in range(0, len(header))}
# print(d) # 'matched_index': 7, 'Ensembl_transcriptid': 8,
d2 = {header2[i] : i for i in range(0, len(header2))}
# print(d2) # 'matched_index': 5, 'Ensembl_transcriptid': 6, 


def feature_id_col(filename, outfile, subset):
    """Copy a CSV to ``outfile`` with an extra ``FeatureID`` column appended.

    For every data row the ';'-separated transcript-id field is split and the
    entry at the row's matched index is appended as ``FeatureID``.

    Parameters
    ----------
    filename : str
        Input CSV with a header row.
    outfile : str
        Output CSV path; created or overwritten.
    subset : str
        'dect' or 'notdect'. Selects the fallback column positions used when
        the header does not contain both 'matched_index' and
        'Ensembl_transcriptid' (7/8 for 'dect', 5/6 for 'notdect').

    Raises
    ------
    ValueError
        If ``subset`` is not 'dect' or 'notdect'.
    """
    fallback_positions = {'dect': (7, 8), 'notdect': (5, 6)}
    if subset not in fallback_positions:
        raise ValueError("subset must be 'dect' or 'notdect', got %r" % (subset,))

    # Open both files once. newline='' is what the csv module expects and
    # avoids doubled line endings on platforms that translate '\n'.
    with open(filename, newline='') as src, open(outfile, 'w', newline='') as dst:
        reader = csv.reader(src)
        writer = csv.writer(dst)
        header = next(reader)
        try:
            # Prefer locating the columns by name so a reordered file still works.
            match_col = header.index('matched_index')
            enst_col = header.index('Ensembl_transcriptid')
        except ValueError:
            match_col, enst_col = fallback_positions[subset]
        writer.writerow(header + ['FeatureID'])
        for row in reader:
            transcript_ids = row[enst_col].split(";")
            row.append(transcript_ids[int(row[match_col])])
            writer.writerow(row)
    print("done with : ", outfile)
        
        
# calling function: read each SCORE_ dbNSFP file and write a SCORE2_ copy
# that has the extra FeatureID column
feature_id_col('SCORE_dbNSFP_selectcols_detected_CK_104475.csv', 'SCORE2_dbNSFP_selectcols_detected_CK_104475.csv', 'dect')
feature_id_col('SCORE_dbNSFP_selectcols_NOT_detected_CK_1222911.csv', 'SCORE2_dbNSFP_selectcols_NOT_detected_CK_1222911.csv', 'notdect')
```

## result of feature_id_col() =
- all dbnsfp file rows have 1 ENST ID matching index of canonical ukb ID

Description files for the newly added FeatureID column were created with the code below:
```python
saveColumnValues(dbnsfp, 'FeatureID', 'dbnsfp_featureID_detected_description.csv')
saveColumnValues(notdbnsfp, 'FeatureID', 'dbnsfp_featureID_notdetected_description.csv')
```

---
---
---
---


# [4] simplifying dbNSFP score files by COLUMNS OF INTEREST

rules: 
1. merge on pos_id coordinates
2. match on Amino_acids 'A/A'
3. protein positions match- not required since most annotations refer to coordinates for annotation

In [47]:
# Switch to the BUG_FIX directory for the following cells (absolute, machine-specific path).
os.chdir("/Users/mariapalafox/Box Sync/CODE_DATA/dir_MAPpaper/CADDmapped/BUG_FIX/")

In [49]:
##  detected dbNSFP score file:
det_scores = 'SCORE_annotation_3840_CYS_LYS_detected.csv'
detscore = pd.read_csv(det_scores, low_memory=False)
print(detscore.shape)
# (104475, 400)

### simplify df with select columns:
det_cols = ['pos_id19', 'pos_id38_x', 'aaref_x', 'aaalt_x', 'matched_aapos_x',
       'CADD_phred_hg38', 'matched_UKBID_x', 'pos_dict', 'matched_target',
       'CADD_phred_hg19', 'CADDdiff_38minus19', 'Amino_acids', 'pos_ID',
       'Cys_reactivity', 'Cys_react_threshold', 'Cys_target_label',
       'Lys_reactivity', 'Lys_react_threshold', 'Lys_target_label', 'rs_dbSNP151', 'genename','Ensembl_transcriptid','cds_strand', 'refcodon', 'codonpos', 'codon_degeneracy','LRT_score', 'LRT_converted_rankscore',
       'LRT_pred', 'LRT_Omega', 'MutationTaster_score',
       'FATHMM_score','FATHMM_converted_rankscore', 'FATHMM_pred', 'PROVEAN_score','PROVEAN_converted_rankscore', 'PROVEAN_pred', 'VEST4_score','VEST4_rankscore', 'MetaSVM_score', 'MetaSVM_rankscore', 'MetaSVM_pred','MetaLR_score', 'MetaLR_rankscore', 'MetaLR_pred', 'Reliability_index','M-CAP_score', 'M-CAP_rankscore', 'M-CAP_pred', 'REVEL_score','REVEL_rankscore', 'MutPred_score', 'MutPred_rankscore',
       'MutPred_protID', 'MutPred_AAchange', 'MutPred_Top5features',
        'MPC_score','MPC_rankscore','PrimateAI_score', 'PrimateAI_rankscore', 'PrimateAI_pred',
       'DANN_score', 'DANN_rankscore', 'fathmm-MKL_coding_score',
       'fathmm-MKL_coding_rankscore', 'fathmm-MKL_coding_pred',
       'fathmm-MKL_coding_group', 'fathmm-XF_coding_score',
       'fathmm-XF_coding_rankscore', 'fathmm-XF_coding_pred',
       'Eigen-raw_coding', 'Eigen-raw_coding_rankscore', 'Eigen-pred_coding',
       'Eigen-PC-raw_coding', 'Eigen-PC-raw_coding_rankscore',
       'Eigen-PC-phred_coding', 'matched_index']
# Explicit old -> new names for the '_x'-suffixed input columns. Renaming by
# name means the relabelling cannot drift out of step with the order of det_cols.
rename = {
    'pos_id38_x': 'pos_id38',
    'aaref_x': 'aaref',
    'aaalt_x': 'aaalt',
    'matched_aapos_x': 'matched_aapos',
    'matched_UKBID_x': 'matched_UKBID',
}
# filter + rename; rename() returns a new frame, so detscore is left untouched
simpleD = detscore[det_cols].rename(columns=rename)
# change order
neworder = ['pos_id19', 'pos_id38', 'pos_ID', 'matched_UKBID', 'pos_dict', 'matched_target', 'matched_aapos', 'matched_index', 'Ensembl_transcriptid',
        'Amino_acids', 'aaref', 'aaalt', 'CADD_phred_hg38',  'CADD_phred_hg19', 'CADDdiff_38minus19', 
       'Cys_reactivity', 'Cys_react_threshold', 'Cys_target_label',
       'Lys_reactivity', 'Lys_react_threshold', 'Lys_target_label', 
       'rs_dbSNP151', 'genename','cds_strand', 'refcodon', 'codonpos', 'codon_degeneracy','LRT_score', 'LRT_converted_rankscore',
       'LRT_pred', 'LRT_Omega', 'MutationTaster_score',
       'FATHMM_score','FATHMM_converted_rankscore', 'FATHMM_pred', 'PROVEAN_score','PROVEAN_converted_rankscore', 'PROVEAN_pred', 'VEST4_score','VEST4_rankscore', 'MetaSVM_score', 'MetaSVM_rankscore', 'MetaSVM_pred','MetaLR_score', 'MetaLR_rankscore', 'MetaLR_pred', 'Reliability_index','M-CAP_score', 'M-CAP_rankscore', 'M-CAP_pred', 'REVEL_score','REVEL_rankscore', 'MutPred_score', 'MutPred_rankscore',
       'MutPred_protID', 'MutPred_AAchange', 'MutPred_Top5features',
       'MPC_score','MPC_rankscore', 'PrimateAI_score', 'PrimateAI_rankscore', 'PrimateAI_pred',
       'DANN_score', 'DANN_rankscore', 'fathmm-MKL_coding_score',
       'fathmm-MKL_coding_rankscore', 'fathmm-MKL_coding_pred',
       'fathmm-MKL_coding_group', 'fathmm-XF_coding_score',
       'fathmm-XF_coding_rankscore', 'fathmm-XF_coding_pred',
       'Eigen-raw_coding', 'Eigen-raw_coding_rankscore', 'Eigen-pred_coding',
       'Eigen-PC-raw_coding', 'Eigen-PC-raw_coding_rankscore',
       'Eigen-PC-phred_coding']
simpleD = simpleD[neworder]

(104475, 400)


In [51]:
# Print the shape of the simplified detected frame, then save it without the row index.
print(simpleD.shape)
simpleD.to_csv("SCORE_dbNSFP_selectcols_detected_CK_104475.csv", index=False)

(104475, 77)


In [52]:
## not detected dbNSFP score file:
not_scores = 'SCORE_annotation_3840_CYS_LYS_NOT_allcol_detected.csv'
notscore = pd.read_csv(not_scores, low_memory=False)
print(notscore.shape)
# (1222911, 385)

### simplify df with select columns:
not_cols = ['pos_id19', 'CADD_phred_hg19', 'CADDdiff_38minus19', 'Amino_acids',
       'pos_ID_falseCKtarget', 'aaref',
       'aaalt', 'rs_dbSNP151', 'genename', 'Ensembl_transcriptid','cds_strand', 'refcodon', 'codonpos','codon_degeneracy', 'LRT_score',
       'LRT_converted_rankscore', 'LRT_pred', 'LRT_Omega',
       'MutationTaster_score', 'MutationTaster_converted_rankscore',
       'MutationTaster_pred', 'MutationTaster_model', 'MutationTaster_AAE',
       'MutationAssessor_score', 'MutationAssessor_rankscore',
       'MutationAssessor_pred', 'FATHMM_score', 'FATHMM_converted_rankscore',
       'FATHMM_pred', 'PROVEAN_score', 'PROVEAN_converted_rankscore',
       'PROVEAN_pred', 'VEST4_score', 'VEST4_rankscore', 'MetaSVM_score',
       'MetaSVM_rankscore', 'MetaSVM_pred', 'MetaLR_score', 'MetaLR_rankscore',
       'MetaLR_pred', 'Reliability_index', 'M-CAP_score', 'M-CAP_rankscore',
       'M-CAP_pred', 'REVEL_score', 'REVEL_rankscore', 'MutPred_score',
       'MutPred_rankscore', 'MutPred_protID', 'MutPred_AAchange',
       'MutPred_Top5features', 'MPC_score','MPC_rankscore', 'PrimateAI_score', 'PrimateAI_rankscore',
       'PrimateAI_pred','CADD_phred','DANN_score', 'DANN_rankscore', 'fathmm-MKL_coding_score',
       'fathmm-MKL_coding_rankscore', 'fathmm-MKL_coding_pred',
       'fathmm-MKL_coding_group', 'fathmm-XF_coding_score',
       'fathmm-XF_coding_rankscore', 'fathmm-XF_coding_pred',
       'Eigen-raw_coding', 'Eigen-raw_coding_rankscore', 'Eigen-pred_coding',
       'Eigen-PC-raw_coding', 'Eigen-PC-raw_coding_rankscore',
       'Eigen-PC-phred_coding', 'matched_UKBID',
       'matched_aapos', 'matched_index', 'pos_id38']
# filter + rename. Only two columns change name; renaming by name means the
# relabelling cannot drift out of step with the order of not_cols.
# rename() returns a new frame, so notscore is left untouched.
simpleN = notscore[not_cols].rename(columns={
    'pos_ID_falseCKtarget': 'pos_ID',
    'CADD_phred': 'CADD_phred_hg38',
})
# change order
neworder = ['pos_id19', 'pos_id38', 'pos_ID', 'matched_UKBID', 'matched_aapos', 'matched_index','Ensembl_transcriptid', 'Amino_acids','aaref','aaalt', 
'CADD_phred_hg38', 'CADD_phred_hg19', 'CADDdiff_38minus19', 'rs_dbSNP151', 'genename', 'cds_strand', 'refcodon', 'codonpos','codon_degeneracy', 'LRT_score',
       'LRT_converted_rankscore', 'LRT_pred', 'LRT_Omega',
       'MutationTaster_score', 'MutationTaster_converted_rankscore',
       'MutationTaster_pred', 'MutationTaster_model', 'MutationTaster_AAE',
       'MutationAssessor_score', 'MutationAssessor_rankscore',
       'MutationAssessor_pred', 'FATHMM_score', 'FATHMM_converted_rankscore',
       'FATHMM_pred', 'PROVEAN_score', 'PROVEAN_converted_rankscore',
       'PROVEAN_pred', 'VEST4_score', 'VEST4_rankscore', 'MetaSVM_score',
       'MetaSVM_rankscore', 'MetaSVM_pred', 'MetaLR_score', 'MetaLR_rankscore',
       'MetaLR_pred', 'Reliability_index', 'M-CAP_score', 'M-CAP_rankscore',
       'M-CAP_pred', 'REVEL_score', 'REVEL_rankscore', 'MutPred_score',
       'MutPred_rankscore', 'MutPred_protID', 'MutPred_AAchange',
       'MutPred_Top5features', 'MPC_score','MPC_rankscore', 'PrimateAI_score', 'PrimateAI_rankscore',
       'PrimateAI_pred','DANN_score', 'DANN_rankscore', 'fathmm-MKL_coding_score',
       'fathmm-MKL_coding_rankscore', 'fathmm-MKL_coding_pred',
       'fathmm-MKL_coding_group', 'fathmm-XF_coding_score',
       'fathmm-XF_coding_rankscore', 'fathmm-XF_coding_pred',
       'Eigen-raw_coding', 'Eigen-raw_coding_rankscore', 'Eigen-pred_coding',
       'Eigen-PC-raw_coding', 'Eigen-PC-raw_coding_rankscore',
       'Eigen-PC-phred_coding']
simpleN = simpleN[neworder]

(1222911, 385)


In [57]:
# Write the simpleN table to CSV without the DataFrame index column.
simpleN.to_csv("SCORE_dbNSFP_selectcols_NOT_detected_CK_1222911.csv", index=False)
# shape (1222911, 75)

In [55]:
# Preview the first three rows of simpleN.
simpleN.head(3)

Unnamed: 0,pos_id19,pos_id38,pos_ID,matched_UKBID,matched_aapos,matched_index,Ensembl_transcriptid,Amino_acids,aaref,aaalt,CADD_phred_hg38,CADD_phred_hg19,CADDdiff_38minus19,rs_dbSNP151,genename,cds_strand,refcodon,codonpos,codon_degeneracy,LRT_score,LRT_converted_rankscore,LRT_pred,LRT_Omega,MutationTaster_score,MutationTaster_converted_rankscore,MutationTaster_pred,MutationTaster_model,MutationTaster_AAE,MutationAssessor_score,MutationAssessor_rankscore,MutationAssessor_pred,FATHMM_score,FATHMM_converted_rankscore,FATHMM_pred,PROVEAN_score,PROVEAN_converted_rankscore,PROVEAN_pred,VEST4_score,VEST4_rankscore,MetaSVM_score,MetaSVM_rankscore,MetaSVM_pred,MetaLR_score,MetaLR_rankscore,MetaLR_pred,Reliability_index,M-CAP_score,M-CAP_rankscore,M-CAP_pred,REVEL_score,REVEL_rankscore,MutPred_score,MutPred_rankscore,MutPred_protID,MutPred_AAchange,MutPred_Top5features,MPC_score,MPC_rankscore,PrimateAI_score,PrimateAI_rankscore,PrimateAI_pred,DANN_score,DANN_rankscore,fathmm-MKL_coding_score,fathmm-MKL_coding_rankscore,fathmm-MKL_coding_pred,fathmm-MKL_coding_group,fathmm-XF_coding_score,fathmm-XF_coding_rankscore,fathmm-XF_coding_pred,Eigen-raw_coding,Eigen-raw_coding_rankscore,Eigen-pred_coding,Eigen-PC-raw_coding,Eigen-PC-raw_coding_rankscore,Eigen-PC-phred_coding
0,10_000093156_C_A,10_000047216_C_A,Q3ZCM7_K392,Q3ZCM7,392,2,ENST00000564130;ENST00000568866;ENST00000568584,Lys/Asn,K,N,23.0,22.9,0.1,.,TUBB8,-,AAG,3,2,1e-05,0.62929,U,0.0,0.654618;0.654618;0.654618;0.654618;1,0.81001,D;D;D;D;D,simple_aae;simple_aae;simple_aae;simple_aae;wi...,K358N;K355N;K392N;K320N;.,.;.;.,.,.;.;.,-1.98;-1.98;-1.98,0.85247,D;D;D,-2.77;-2.55;-2.6,0.58733,D;D;D,0.579;0.574;0.53,0.6007100000000001,0.3991,0.8908299999999999,D,0.7111,0.90075,D,9,0.009489,0.24855,T,0.491,0.78035,0.718,0.85313,Q3ZCM7,K392N,Loss of methylation at K392 (P = 0.003); Loss ...,.;.;.,.,0.751451253891,0.7466,T,0.984728,0.41873,0.38681,0.26043,N,AEFBI,0.295876,0.40587,N,0.0676056746461709,0.44955,2.76036,-0.207483129432051,0.31266,1.767854
1,10_000093156_C_G,10_000047216_C_G,Q3ZCM7_K392,Q3ZCM7,392,2,ENST00000564130;ENST00000568866;ENST00000568584,Lys/Asn,K,N,23.1,22.8,0.3,.,TUBB8,-,AAG,3,2,1e-05,0.62929,U,0.0,0.654618;0.654618;0.654618;0.654618;1,0.81001,D;D;D;D;D,simple_aae;simple_aae;simple_aae;simple_aae;wi...,K358N;K355N;K392N;K320N;.,.;.;.,.,.;.;.,-1.98;-1.98;-1.98,0.85247,D;D;D,-2.77;-2.55;-2.6,0.58733,D;D;D,0.579;0.574;0.53,0.6007100000000001,0.3991,0.8908299999999999,D,0.7111,0.90075,D,9,0.008795,0.23196,T,0.491,0.78035,0.718,0.85313,Q3ZCM7,K392N,Loss of methylation at K392 (P = 0.003); Loss ...,.;.;.,.,0.751451253891,0.7466,T,0.984663,0.41803,0.40406,0.26429,N,AEFBI,0.302556,0.41077,N,0.0676056746461709,0.44955,2.76036,-0.207483129432051,0.31266,1.767854
2,10_000093157_T_A,10_000047217_T_A,Q3ZCM7_K392,Q3ZCM7,392,2,ENST00000564130;ENST00000568866;ENST00000568584,Lys/Met,K,M,23.2,23.2,0.0,.,TUBB8,-,AAG,2,0,1e-05,0.62929,U,0.0,0.820833;0.820833;0.820833;0.820833;1,0.81001,D;D;D;D;D,simple_aae;simple_aae;simple_aae;simple_aae;wi...,K358M;K355M;K392M;K320M;.,.;.;.,.,.;.;.,-2.08;-2.08;-2.08,0.8601,D;D;D,-3.36;-3.08;-3.14,0.66549,D;D;D,0.601;0.607;0.568,0.62442,0.5527,0.9127,D,0.8388,0.9461,D,9,0.01492,0.35345,T,0.505,0.78906,0.698,0.83491,Q3ZCM7,K392M,Loss of methylation at K392 (P = 0.003); Loss ...,.;.;.,.,0.727143764496,0.71088,T,0.964042,0.29699,0.52831,0.29225,D,AEFBI,0.364324,0.45291,N,0.3107623869878599,0.56701,3.835221,-0.0235518622390073,0.38654,2.280427


In [56]:
# Preview the first three rows of simpleD for side-by-side comparison with simpleN.
simpleD.head(3)

Unnamed: 0,pos_id19,pos_id38,pos_ID,matched_UKBID,pos_dict,matched_target,matched_aapos,matched_index,Ensembl_transcriptid,Amino_acids,aaref,aaalt,CADD_phred_hg38,CADD_phred_hg19,CADDdiff_38minus19,Cys_reactivity,Cys_react_threshold,Cys_target_label,Lys_reactivity,Lys_react_threshold,Lys_target_label,rs_dbSNP151,genename,cds_strand,refcodon,codonpos,codon_degeneracy,LRT_score,LRT_converted_rankscore,LRT_pred,LRT_Omega,MutationTaster_score,FATHMM_score,FATHMM_converted_rankscore,FATHMM_pred,PROVEAN_score,PROVEAN_converted_rankscore,PROVEAN_pred,VEST4_score,VEST4_rankscore,MetaSVM_score,MetaSVM_rankscore,MetaSVM_pred,MetaLR_score,MetaLR_rankscore,MetaLR_pred,Reliability_index,M-CAP_score,M-CAP_rankscore,M-CAP_pred,REVEL_score,REVEL_rankscore,MutPred_score,MutPred_rankscore,MutPred_protID,MutPred_AAchange,MutPred_Top5features,MPC_score,MPC_rankscore,PrimateAI_score,PrimateAI_rankscore,PrimateAI_pred,DANN_score,DANN_rankscore,fathmm-MKL_coding_score,fathmm-MKL_coding_rankscore,fathmm-MKL_coding_pred,fathmm-MKL_coding_group,fathmm-XF_coding_score,fathmm-XF_coding_rankscore,fathmm-XF_coding_pred,Eigen-raw_coding,Eigen-raw_coding_rankscore,Eigen-pred_coding,Eigen-PC-raw_coding,Eigen-PC-raw_coding_rankscore,Eigen-PC-phred_coding
0,10_000093270_A_C,10_000047330_A_C,Q3ZCM7_C354,Q3ZCM7,"{12: 'C', 239: 'C', 303: 'C', 354: 'C'}",C354,354,2,ENST00000564130;ENST00000568866;ENST00000568584,Cys/Trp,C,W,22.6,22.3,0.3,5.37,Low,,,,,.,TUBB8,-,TGT,3,2,3e-06,0.62929,U,0.0,0.999882;0.999882;0.999882;0.999882;1,-1.98;-1.98;-1.98,0.85247,D;D;D,-7.21;-6.69;-6.84,0.94249,D;D;D,0.753;0.756;0.751,0.75466,0.3131,0.87764,D,0.818,0.93873,D,9,0.020603,0.43227,T,0.437,0.74446,0.868,0.96212,Q3ZCM7,C354W,Gain of MoRF binding (P = 0.0706); Loss of she...,.;.;.,.,0.76919400692,0.77304,T,0.851523,0.15717,0.19276,0.20597,N,AEFBI,0.422079,0.48823,N,0.0406181179914726,0.43713,2.658815,-0.363070660159128,0.26108,1.439964
1,10_000093271_C_A,10_000047331_C_A,Q3ZCM7_C354,Q3ZCM7,"{12: 'C', 239: 'C', 303: 'C', 354: 'C'}",C354,354,2,ENST00000564130;ENST00000568866;ENST00000568584,Cys/Phe,C,F,22.6,21.8,0.8,5.37,Low,,,,,.,TUBB8,-,TGT,2,0,3e-06,0.62929,U,0.0,0.986821;0.986821;0.986821;0.986821;1,-1.95;-1.95;-1.95,0.85003,D;D;D,-7.2;-6.67;-6.83,0.94223,D;D;D,0.779;0.782;0.782,0.77883,0.4271,0.895,D,0.7835,0.92649,D,9,0.015962,0.36975,T,0.374,0.69594,0.808,0.92509,Q3ZCM7,C354F,Gain of sheet (P = 0.0827); Loss of methylatio...,.;.;.,.,0.727896571159,0.71199,T,0.895918,0.18929,0.57914,0.30502,D,AEFBI,0.537556,0.55531,D,0.0988420731411684,0.46409,2.88164,-0.263815814260368,0.293,1.640415
2,10_000093271_C_G,10_000047331_C_G,Q3ZCM7_C354,Q3ZCM7,"{12: 'C', 239: 'C', 303: 'C', 354: 'C'}",C354,354,2,ENST00000564130;ENST00000568866;ENST00000568584,Cys/Ser,C,S,22.4,19.67,2.73,5.37,Low,,,,,.,TUBB8,-,TGT,2,0,3e-06,0.62929,U,0.0,0.930492;0.930492;0.930492;0.930492;1,-1.87;-1.87;-1.87,0.84415,D;D;D,-6.55;-6.06;-6.21,0.91827,D;D;D,0.74;0.748;0.739,0.74735,0.4621,0.90007,D,0.7769,0.92419,D,9,0.022348,0.4523,T,0.377,0.69846,0.781,0.90537,Q3ZCM7,C354S,Gain of disorder (P = 0.0216); Gain of loop (P...,.;.;.,.,0.713165700436,0.69052,T,0.799045,0.12954,0.5948,0.30927,D,AEFBI,0.537556,0.55531,D,0.0966724300357427,0.46307,2.873107,-0.265973651554631,0.29227,1.635795


# [3] filter CADDv1.4 files with dbNSFP CK positions for MISSENSE consequence only

dbNSFP SCORE file column names (the CADD files will be mapped to these files using the columns they share):

**columns detected file:**
    - Amino_acids
    - pos_ID 'Q3ZCM7_C354'
    - pos_id19 or pos_id38_x

**columns not detected file:**
    - pos_id19 or pos_id38
    - Amino_acids
    - pos_ID_falseCKtarget


```python
# !/usr/bin/env python3
# -*- coding: utf-8 -*-

# Pmap M local ipynb code, Pmap_missense_annotations_QC.py
# markdown M local QC of positions from dbNSFP overlapped with CADD37 or 38 annotations

import os
import sys
import pandas as pd


def create_coordinate_id(df, chrr, pos, ref, alt, assembly):
    """Add a '<chr>_<pos>_<ref>_<alt>' identifier column to df.

    The identifier is stored in 'pos_id19' when assembly is 37 and in
    'pos_id38' when assembly is 38.  Any other assembly value leaves df
    untouched.  df is modified in place and also returned.

    Parameters
    ----------
    df : DataFrame holding the four source columns.
    chrr, pos, ref, alt : names of the chromosome, position, reference
        allele and alternate allele columns.
    assembly : 37 or 38, selecting the name of the new column.
    """
    for build, id_column in ((37, 'pos_id19'), (38, 'pos_id38')):
        if assembly == build:
            # Cast every component to str, then join with underscores.
            pieces = [df[name].astype(str) for name in (chrr, pos, ref, alt)]
            variant_id = pieces[0]
            for piece in pieces[1:]:
                variant_id = variant_id + '_' + piece
            df.loc[:, id_column] = variant_id
    return df


def format_missense_triple(df, oaacol, naacol):
    """Convert one-letter amino-acid codes to three-letter codes and add 'Amino_acids'.

    The values in df[oaacol] and df[naacol] are translated with a fixed
    one-letter -> three-letter table covering the 20 standard amino acids
    (e.g. 'A' -> 'Ala'); values not in the table are left unchanged.  A new
    'Amino_acids' column is then built as '<oaacol>/<naacol>', e.g. 'Lys/Asn'.

    Parameters
    ----------
    df : DataFrame containing both amino-acid columns.
    oaacol : name of the column with the original (reference) amino acid.
    naacol : name of the column with the new (alternate) amino acid.

    Returns
    -------
    The same DataFrame object, modified in place.
    """
    amino_dict = {
        'A': 'Ala', 'G': 'Gly', 'I': 'Ile', 'L': 'Leu', 'P': 'Pro',
        'V': 'Val', 'F': 'Phe', 'W': 'Trp', 'Y': 'Tyr', 'D': 'Asp',
        'E': 'Glu', 'R': 'Arg', 'H': 'His', 'K': 'Lys', 'S': 'Ser',
        'T': 'Thr', 'C': 'Cys', 'M': 'Met', 'N': 'Asn', 'Q': 'Gln',
    }
    # Assign the result back instead of calling replace(inplace=True) on
    # df[col]: the chained in-place form mutates a temporary Series and does
    # not update df when pandas Copy-on-Write is active.
    df[oaacol] = df[oaacol].replace(amino_dict)
    df[naacol] = df[naacol].replace(amino_dict)
    df['Amino_acids'] = df[oaacol].str.cat(df[naacol], sep='/')
    return df


def filter_cadd_overlap(df, assembly):
    """Reduce a CADD overlap table to missense rows and annotate them.

    Steps:
      1. keep only rows whose 'Consequence' equals 'NON_SYNONYMOUS'
         (working on a copy, so df itself is not modified);
      2. add the coordinate ID column via create_coordinate_id, built from
         'chr', 'pos_hg19' (assembly 37) or 'pos_hg38' (assembly 38),
         'Ref' and 'Alt';
      3. add the three-letter 'Amino_acids' column from 'oAA' and 'nAA'
         via format_missense_triple.

    Returns the filtered, annotated DataFrame.
    """
    is_missense = df['Consequence'] == 'NON_SYNONYMOUS'
    missense_rows = df[is_missense].copy()

    for build, position_column in ((37, 'pos_hg19'), (38, 'pos_hg38')):
        if assembly == build:
            missense_rows = create_coordinate_id(
                missense_rows, 'chr', position_column, 'Ref', 'Alt', assembly)

    return format_missense_triple(missense_rows, 'oAA', 'nAA')

    # [4] concat all files from 37: DECT or NOT ... 38: DECT or NOT


def main():
    """Filter every per-chromosome CADD overlap file down to missense rows.

    Changes the working directory to the local results folder, then handles
    GRCh37 followed by GRCh38, walking chromosomes 1-22, X and Y for each.
    For every chromosome the DETECTED and NOT_DETECTED CSVs are read, passed
    through filter_cadd_overlap, and written out with a 'MISSENSE_' prefix.
    """
    os.chdir('/Users/mariapalafox/Box Sync/CODE_DATA/dir_MAPpaper/CADDmapped/RESULT_pos_overlap_dbNSFPcoordinates')

    chromosomes = list(range(1, 23)) + ['X', 'Y']

    # One entry per genome build: assembly code, (input, output) file-name
    # templates for the detected and not-detected sets, progress message,
    # and extra read_csv options (only GRCh37 reads pass low_memory=False).
    builds = (
        (37,
         ('{}_CADD_GRCh37_DETECTED_CK.csv', 'MISSENSE_{}_CADD_GRCh37_DETECTED_CK.csv'),
         ('{}_CADD_GRCh37_NOT_DETECTED_CK.csv', 'MISSENSE_{}_CADD_GRCh37_NOT_DETECTED_CK.csv'),
         "saving detected and not detected GRCh37 ",
         {'low_memory': False}),
        (38,
         ('{}_CADD_GRCh38_DETECTED_CK.csv', 'MISSENSE_{}_CADD_GRCh38_DETECTED_CK.csv'),
         ('{}_CADD_GRCh38_NOT_DETECTED_CK.csv', 'MISSENSE_{}_CADD_GRCh38_NOT_DETECTED_CK.csv'),
         "saving detected and not detected GRCh38 ",
         {}),
    )

    for assembly, detected, not_detected, message, read_options in builds:
        for label in chromosomes:
            chrID = 'chr{}'.format(label)
            detected_raw = pd.read_csv(detected[0].format(chrID), **read_options)
            not_detected_raw = pd.read_csv(not_detected[0].format(chrID), **read_options)
            detected_missense = filter_cadd_overlap(detected_raw, assembly)
            not_detected_missense = filter_cadd_overlap(not_detected_raw, assembly)
            print(message, chrID)
            print()
            detected_missense.to_csv(detected[1].format(chrID), index=False)
            not_detected_missense.to_csv(not_detected[1].format(chrID), index=False)

main()

```

# Re-running GRCh38 sex chromosomes (X, Y) after fixing the CADD source files on hoffman

```python
# !/usr/bin/env python3
# -*- coding: utf-8 -*-

# Pmap M local ipynb code, Pmap_missense_annotations_QC.py
# markdown M local QC of positions from dbNSFP overlapped with CADD37 or 38 annotations

import os
import sys
import pandas as pd

""" 
dbNSFP SCORE file column names:

columns detected file:
    Amino_acids
    pos_ID 'Q3ZCM7_C354'
    pos_id19 or pos_id38_x

columns not detected file:
    pos_id19 or pos_id38
    Amino_acids,
    pos_ID_falseCKtarget
"""

def create_coordinate_id(df, chrr, pos, ref, alt, assembly):
    """Add a '<chr>_<pos>_<ref>_<alt>' identifier column to df.

    The identifier is stored in 'pos_id19' when assembly is 37 and in
    'pos_id38' when assembly is 38.  Any other assembly value leaves df
    untouched.  df is modified in place and also returned.

    Parameters
    ----------
    df : DataFrame holding the four source columns.
    chrr, pos, ref, alt : names of the chromosome, position, reference
        allele and alternate allele columns.
    assembly : 37 or 38, selecting the name of the new column.
    """
    for build, id_column in ((37, 'pos_id19'), (38, 'pos_id38')):
        if assembly == build:
            # Cast every component to str, then join with underscores.
            pieces = [df[name].astype(str) for name in (chrr, pos, ref, alt)]
            variant_id = pieces[0]
            for piece in pieces[1:]:
                variant_id = variant_id + '_' + piece
            df.loc[:, id_column] = variant_id
    return df


def format_missense_triple(df, oaacol, naacol):
    """Convert one-letter amino-acid codes to three-letter codes and add 'Amino_acids'.

    The values in df[oaacol] and df[naacol] are translated with a fixed
    one-letter -> three-letter table covering the 20 standard amino acids
    (e.g. 'A' -> 'Ala'); values not in the table are left unchanged.  A new
    'Amino_acids' column is then built as '<oaacol>/<naacol>', e.g. 'Lys/Asn'.

    Parameters
    ----------
    df : DataFrame containing both amino-acid columns.
    oaacol : name of the column with the original (reference) amino acid.
    naacol : name of the column with the new (alternate) amino acid.

    Returns
    -------
    The same DataFrame object, modified in place.
    """
    amino_dict = {
        'A': 'Ala', 'G': 'Gly', 'I': 'Ile', 'L': 'Leu', 'P': 'Pro',
        'V': 'Val', 'F': 'Phe', 'W': 'Trp', 'Y': 'Tyr', 'D': 'Asp',
        'E': 'Glu', 'R': 'Arg', 'H': 'His', 'K': 'Lys', 'S': 'Ser',
        'T': 'Thr', 'C': 'Cys', 'M': 'Met', 'N': 'Asn', 'Q': 'Gln',
    }
    # Assign the result back instead of calling replace(inplace=True) on
    # df[col]: the chained in-place form mutates a temporary Series and does
    # not update df when pandas Copy-on-Write is active.
    df[oaacol] = df[oaacol].replace(amino_dict)
    df[naacol] = df[naacol].replace(amino_dict)
    df['Amino_acids'] = df[oaacol].str.cat(df[naacol], sep='/')
    return df


def filter_cadd_overlap(df, assembly):
    """Reduce a CADD overlap table to missense rows and annotate them.

    Steps:
      1. keep only rows whose 'Consequence' equals 'NON_SYNONYMOUS'
         (working on a copy, so df itself is not modified);
      2. add the coordinate ID column via create_coordinate_id, built from
         'chr', 'pos_hg19' (assembly 37) or 'pos_hg38' (assembly 38),
         'Ref' and 'Alt';
      3. add the three-letter 'Amino_acids' column from 'oAA' and 'nAA'
         via format_missense_triple.

    Returns the filtered, annotated DataFrame.
    """
    is_missense = df['Consequence'] == 'NON_SYNONYMOUS'
    missense_rows = df[is_missense].copy()

    for build, position_column in ((37, 'pos_hg19'), (38, 'pos_hg38')):
        if assembly == build:
            missense_rows = create_coordinate_id(
                missense_rows, 'chr', position_column, 'Ref', 'Alt', assembly)

    return format_missense_triple(missense_rows, 'oAA', 'nAA')

    # [4] concat all files from 37: DECT or NOT ... 38: DECT or NOT


def main():
    """Redo the GRCh38 missense filtering for chromosomes X and Y only.

    Changes the working directory to the local results folder, then for
    each sex chromosome reads the DETECTED and NOT_DETECTED GRCh38 CSVs,
    passes them through filter_cadd_overlap with assembly 38, and writes
    the results out with a 'MISSENSE_' prefix.
    """
    os.chdir('/Users/mariapalafox/Box Sync/CODE_DATA/dir_MAPpaper/CADDmapped/RESULT_pos_overlap_dbNSFPcoordinates')

    # GRCh38, sex chromosomes only
    for label in ['X', 'Y']:
        chrID = 'chr{}'.format(label)
        detected_in = '{}_CADD_GRCh38_DETECTED_CK.csv'.format(chrID)
        detected_out = 'MISSENSE_{}_CADD_GRCh38_DETECTED_CK.csv'.format(chrID)
        not_detected_in = '{}_CADD_GRCh38_NOT_DETECTED_CK.csv'.format(chrID)
        not_detected_out = 'MISSENSE_{}_CADD_GRCh38_NOT_DETECTED_CK.csv'.format(chrID)

        detected_raw = pd.read_csv(detected_in)
        not_detected_raw = pd.read_csv(not_detected_in)
        detected_missense = filter_cadd_overlap(detected_raw, 38)
        not_detected_missense = filter_cadd_overlap(not_detected_raw, 38)

        print("saving detected and not detected GRCh38 ", chrID)
        print()
        detected_missense.to_csv(detected_out, index=False)
        not_detected_missense.to_csv(not_detected_out, index=False)

main()
```