# Preparation of variant effect score data

Variant effect scores are numerical values or metrics used to quantitatively assess the potential functional impact of genetic variants on genes or proteins. These scores are based on computational predictions which estimate how likely a genetic variant is to have a deleterious (pathogenic) effect.

Variant effect scores and predictions for the variants identified in-house were retrieved from [PredictSNP2](https://loschmidt.chemi.muni.cz/predictsnp2/). The retrieved data are stored at `Data/Raw/PredictSNP2`.

The data was prepared for analysis by: 
1. Selecting features of interest, such as variant ID, position, reference and alternate allele, and consequence scores and predictions
2. Adding further features, including information on the gene in which a particular variant was found

## Import libraries and modules

In [1]:
import os

# Make the Notebooks folder of the pipeline the current working directory.
# NOTE(review): this is a machine-specific absolute Windows path; it must be
# edited before the notebook can run on another machine.
os.chdir(
    r"C:\Users\User\Desktop\Megan\MSC2\Results\5._Posthoc_analysis\Pipeline_GnomAD_14032023\Notebooks"
)

In [2]:
import sys

# Make the pipeline root importable so that the project-local `Utils` package
# (imported below) can be found.
# NOTE(review): machine-specific absolute Windows path; edit before running
# the notebook elsewhere.
PROJECT_ROOT = r"C:\Users\User\Desktop\Megan\MSC2\Results\5._Posthoc_analysis\Pipeline_GnomAD_14032023"

# Only append once: re-running this cell would otherwise add a duplicate
# entry to sys.path on every execution.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
import pandas as pd
import Utils.constants as constants
import Utils.functions as functions
import numpy as np

## Import PredictSNP2 variant effect data

In [3]:
# Location of the raw PredictSNP2 export (a semicolon-delimited CSV file).
vep_path = os.path.join(
    constants.HOME_PATH, "Data", "Raw", "PredictSNP2", "PredictSNP2.csv"
)

# Fail at the source if the export is missing. Silently continuing with an
# empty DataFrame only postpones the failure to the column selection in the
# next step, where it shows up as an unrelated KeyError.
if not os.path.exists(vep_path):
    raise FileNotFoundError(f"PredictSNP2 data file not found: {vep_path}")

pdtsnp2_vep_data = pd.read_csv(vep_path, sep=";")

# Preview the first rows of the imported data.
pdtsnp2_vep_data.head(5)

Unnamed: 0,CHROM,POS,ID,REF,ALT,CONS,EXON,SGNF,DBSNP,HAPLO,...,DANNC,FATE,FATS,FATC,FUNE,FUNS,FUNC,GWAVAE,GWAVAS,GWAVAC
0,13,110148891,.,C,G,downstream,,.,.,.,...,0.82,neutral,0.18997,0.77,.,.,.,neutral,0.3,0.68
1,13,110148917,rs59409892,C,G,downstream,,.,http://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?se...,http://www.broadinstitute.org/mammals/haploreg...,...,0.62,neutral,0.18963,0.77,neutral,8.51599164992024E-6,0.80,neutral,0.32,0.71
2,13,110148920,.,G,C,downstream,,.,.,.,...,0.65,neutral,0.21184,0.75,?,0.622785975437062,0.49,neutral,0.32,0.71
3,13,110148959,rs56406633,A,G,UTR3,,Likely benign,http://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?se...,http://www.broadinstitute.org/mammals/haploreg...,...,0.71,deleterious,0.82375,0.82,neutral,7.52439751165574E-20,0.80,neutral,0.44,0.53
4,13,110148971,.,G,C,UTR3,,.,.,.,...,0.65,deleterious,0.95264,0.91,deleterious,1.19368630042138,0.74,neutral,0.44,0.53


## Select features of interest
Select variant ID, position, reference and alternate allele, and consequence score and prediction data from various algorithms including FATHMM (FAT), CADD, PredictSNP (PSNP), DANN, FunSeq2 (FUN), and GWAVA. 

In [4]:
# Variant identity columns plus the SGNF annotation.
variant_columns = ["CHROM", "POS", "ID", "REF", "ALT", "SGNF"]

# Each prediction tool contributes three columns, named with the tool prefix
# followed by the suffix E, S or C (e.g. PSNPE, PSNPS, PSNPC).
tool_prefixes = ("PSNP", "CADD", "DANN", "FAT", "FUN", "GWAVA")
tool_columns = [
    f"{prefix}{suffix}" for prefix in tool_prefixes for suffix in "ESC"
]

# Keep only the features of interest, as an independent copy of the data.
pdtsnp2_vep_data = pdtsnp2_vep_data[variant_columns + tool_columns].copy()

# Preview the first rows of the reduced table.
pdtsnp2_vep_data.head(5)

Unnamed: 0,CHROM,POS,ID,REF,ALT,SGNF,PSNPE,PSNPS,PSNPC,CADDE,...,DANNC,FATE,FATS,FATC,FUNE,FUNS,FUNC,GWAVAE,GWAVAS,GWAVAC
0,13,110148891,.,C,G,.,neutral,-1.0,0.88,neutral,...,0.82,neutral,0.18997,0.77,.,.,.,neutral,0.3,0.68
1,13,110148917,rs59409892,C,G,.,neutral,-0.319012,0.74,deleterious,...,0.62,neutral,0.18963,0.77,neutral,8.51599164992024E-6,0.80,neutral,0.32,0.71
2,13,110148920,.,G,C,.,neutral,-0.320755,0.74,deleterious,...,0.65,neutral,0.21184,0.75,?,0.622785975437062,0.49,neutral,0.32,0.71
3,13,110148959,rs56406633,A,G,Likely benign,deleterious,0.456207,0.91,deleterious,...,0.71,deleterious,0.82375,0.82,neutral,7.52439751165574E-20,0.80,neutral,0.44,0.53
4,13,110148971,.,G,C,.,neutral,-0.155078,0.73,neutral,...,0.65,deleterious,0.95264,0.91,deleterious,1.19368630042138,0.74,neutral,0.44,0.53


## Handle missing values
Replace missing values, which PredictSNP2 represents with a period (`.`), with NumPy NaN values

In [5]:
# Cells whose entire value is "." are missing-value placeholders; swap them for NaN.
# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0
# and raises AttributeError there.
pdtsnp2_vep_data = pdtsnp2_vep_data.replace(".", np.nan)

## Add features
Add information on the gene in which a particular variant was found

In [6]:
# Load the variant ID, alleles, position and gene name for each variant from
# the processed variant consequences file
variant_consequences_path = os.path.join(
    constants.HOME_PATH, "Data", "Processed", "Variant_consequences.csv"
)
gene_positional_data = pd.read_csv(
    variant_consequences_path,
    usecols=["ID", "REF", "ALT", "POS", "GENE"],
)

# Attach the gene (and variant ID) to the variant effect data by matching on
# position and alleles. The PredictSNP2 ID column is discarded first so that
# the ID in the result is the one from the gene and positional data; the left
# join keeps every variant effect row even when no match is found.
merge_keys = ["POS", "REF", "ALT"]
pdtsnp2_vep_data = pdtsnp2_vep_data.drop(columns=["ID"]).merge(
    gene_positional_data,
    how="left",
    on=merge_keys,
)

# Preview the first rows of the merged table.
pdtsnp2_vep_data.head(5)

Unnamed: 0,CHROM,POS,REF,ALT,SGNF,PSNPE,PSNPS,PSNPC,CADDE,CADDS,...,FATS,FATC,FUNE,FUNS,FUNC,GWAVAE,GWAVAS,GWAVAC,ID,GENE
0,13,110148891,C,G,,neutral,-1.0,0.88,neutral,2.258,...,0.18997,0.77,,,,neutral,0.3,0.68,rs552586867,COL4A1
1,13,110148917,C,G,,neutral,-0.319012,0.74,deleterious,9.461,...,0.18963,0.77,neutral,8.51599164992024e-06,0.8,neutral,0.32,0.71,rs59409892,COL4A1
2,13,110148920,G,C,,neutral,-0.320755,0.74,deleterious,9.779,...,0.21184,0.75,?,0.622785975437062,0.49,neutral,0.32,0.71,rs535182970,COL4A1
3,13,110148959,A,G,Likely benign,deleterious,0.456207,0.91,deleterious,13.69,...,0.82375,0.82,neutral,7.52439751165574e-20,0.8,neutral,0.44,0.53,rs56406633,COL4A1
4,13,110148971,G,C,,neutral,-0.155078,0.73,neutral,6.45,...,0.95264,0.91,deleterious,1.19368630042138,0.74,neutral,0.44,0.53,rs568536001,COL4A1


## Save data to CSV file

In [7]:
# Destination for the prepared variant effect data.
variant_effects_path = os.path.join(
    constants.HOME_PATH, "Data", "Processed", "Variant_effects.csv"
)

# Renumber the rows from zero before writing. The index is written out as the
# first (unnamed) column of the CSV file, as in pandas' default behaviour.
prepared_variant_effects = pdtsnp2_vep_data.reset_index(drop=True)
prepared_variant_effects.to_csv(variant_effects_path)