In [1]:
# Setup
import pandas as pd

# Load the variant table that the later dataset cells inspect and filter.
csv_df = pd.read_csv(filepath_or_buffer="dpyd_variants.csv")

# CLIN_SIG values that the filtering cell matches with .isin().
pathogenic_signatures = [
    "pathogenic",
    "drug_response",
    "Likely_pathogenic",
    "Pathogenic/Likely_pathogenic",
]

In [2]:
# Series Creation
# Gene symbols, in the order they will appear in the Series.
genes = [
    "CHD5",
    "CLIC4",
    "CRYZ",
    "DVL1",
    "ENO1",
    "GBP1",
    "RHOC",
]

# No index is passed, so pandas assigns the default integer labels 0..6.
gene_series = pd.Series(data=genes)

In [3]:
# Series Access
# Positional (.iloc) lookups. sep="\n\n" leaves exactly one blank line between
# consecutive results, matching a print() call after each of them.
print(
    gene_series.iloc[:2],   # first two entries
    gene_series.iloc[1:4],  # positions 1, 2 and 3
    gene_series.iloc[-2:],  # last two entries
    gene_series.iloc[-3],   # single value: third entry from the end
    sep="\n\n",
)

0     CHD5
1    CLIC4
dtype: object

1    CLIC4
2     CRYZ
3     DVL1
dtype: object

5    GBP1
6    RHOC
dtype: object

ENO1


In [4]:
# Dataframe Creation
# The same four-row table is built twice to show two equivalent input shapes.

# Option 1: a list of dicts -- one dict per row, keys become column names.
variant_dicts = [
    {
        "Variant ID": "1-97544568-CTT-C",
        "LOF": "LC",
        "AC": 1,
        "AN": 251168,
    },
    {
        "Variant ID": "1-97544576-C-A",
        "LOF": "LC",
        "AC": 5,
        "AN": 251214,
    },
    {
        "Variant ID": "1-97544578-T-TA",
        "LOF": "LC",
        "AC": 2,
        "AN": 251218,
    },
    {
        "Variant ID": "1-97544673-CA-C",
        "LOF": "HC",
        "AC": 1,
        "AN": 251346,
    },
]
variants = pd.DataFrame(data=variant_dicts)

# Option 2: a dict of lists -- one list per column, keys become column names.
# This rebinds `variants`, replacing the frame built from the list of dicts.
dict_variants = {
    "Variant ID": ["1-97544568-CTT-C", "1-97544576-C-A", "1-97544578-T-TA", "1-97544673-CA-C"],
    "LOF": ["LC", "LC", "LC", "HC"],
    "AC": [1, 5, 2, 1],
    "AN": [251168, 251214, 251218, 251346],
}
variants = pd.DataFrame(data=dict_variants)

In [10]:
# Dataframe Access
# Move "Variant ID" from a regular column to the row index so that rows can be
# looked up by ID with .loc. The guard keeps this cell re-runnable: once the
# column has been moved, calling set_index("Variant ID") again would raise a
# KeyError because the column no longer exists.
# (variants.set_index("Variant ID", inplace=True) is the in-place spelling.)
if variants.index.name != "Variant ID":
    variants = variants.set_index("Variant ID")

print(variants[["AC", "AN"]])  # select columns by label
print()
print(variants.iloc[1:3])  # rows at positions 1 and 2
print()
print(variants.loc["1-97544568-CTT-C"])  # a single row, by index label
print()
print(variants.iloc[:2][["AC", "AN"]])  # first two rows, then two columns


                  AC      AN
Variant ID                  
1-97544568-CTT-C   1  251168
1-97544576-C-A     5  251214
1-97544578-T-TA    2  251218
1-97544673-CA-C    1  251346

                LOF  AC      AN
Variant ID                     
1-97544576-C-A   LC   5  251214
1-97544578-T-TA  LC   2  251218

LOF        LC
AC          1
AN     251168
Name: 1-97544568-CTT-C, dtype: object

                  AC      AN
Variant ID                  
1-97544568-CTT-C   1  251168
1-97544576-C-A     5  251214


In [11]:
# Dataframe Modification
def to_percentage(proportion):
    """Convert a proportion (a fraction of 1) into a percentage.

    Args:
        proportion: A number, or any object that supports multiplication by
            an int, expressed as a fraction of 1 (0.25 means 25%).

    Returns:
        The input multiplied by 100.
    """
    # A percentage is "per hundred"; the previous factor of 1000000 produced
    # parts-per-million, which contradicted the function's name.
    return proportion * 100


# New column from a plain list: values are assigned to the rows by position.
variants["CLIN_SIG"] = ["None", "None", "Uncertain", "None"]

# New column derived from existing ones: AC divided by AN, then rescaled
# element by element with to_percentage.
variants["AF"] = variants["AC"].div(variants["AN"]).apply(to_percentage)

# Reorder the rows in place, ascending by the new AF column.
variants.sort_values("AF", inplace=True)

In [14]:
# Understanding a Dataset
# A bare expression is only rendered when it is the last line of a cell, so
# the describe() summary has to be printed explicitly to be visible here.
print(csv_df.describe())  # original note: 1 (meaning not evident from the code -- confirm)
print()
print(csv_df["LOF"].unique())  # nan, LC, HC, OS
print(csv_df.shape)  # (rows, columns); the frame has 26 columns

[nan 'LC' 'HC' 'OS']
(76376, 26)


In [19]:
# Dataframe Filtering
# AF is the ratio AC / AN; the column is added to csv_df itself.
csv_df["AF"] = csv_df["AC"].div(csv_df["AN"])

# Boolean mask and subset for rows whose AF is above 0.01.
common = csv_df["AF"].gt(0.01)
common_df = csv_df.loc[common]

# NOTE(review): recent pandas versions list the text "None" among read_csv's
# default missing-value markers (see na_values / keep_default_na). If that
# applies here, FILTER holds NaN instead of "None" and this mask matches no
# rows -- confirm against the loaded data.
passes_filter = csv_df["FILTER"].eq("None")

# A row is flagged when any one of the three conditions holds.
# NOTE(review): if read_csv parsed INESSS as booleans, comparing against the
# string "True" never matches -- confirm the column's dtype.
flagged = (
    csv_df["CLIN_SIG"].isin(pathogenic_signatures)
    | csv_df["LOF"].eq("HC")
    | csv_df["INESSS"].eq("True")
)

problematic = passes_filter & flagged
problematic_df = csv_df.loc[problematic]

In [27]:
# Final Exercise
# Start from a fresh read of the CSV; this rebinds `variants`.
variants = pd.read_csv("dpyd_variants.csv")

# Step 1: keep only rows whose FILTER column equals the string "None".
# NOTE(review): recent pandas versions treat the text "None" as missing by
# default in read_csv; if that applies, this keeps no rows -- confirm.
variants = variants[variants["FILTER"] == "None"]

pathogenic_signatures = ["pathogenic", "drug_response", "Likely_pathogenic", "Pathogenic/Likely_pathogenic"]
lofs = ["HC", "LC"]

# Step 2: keep rows that satisfy at least one of the three criteria.
# NOTE(review): if INESSS was parsed as booleans, the comparison with the
# string "True" never matches -- confirm the column's dtype.
pathogenicity = (
    (variants["INESSS"] == "True")
    | variants["CLIN_SIG"].isin(pathogenic_signatures)
    | variants["LOF"].isin(lofs)
)
# .copy() makes the filtered result an independent frame. Without it, the
# column assignments below write to a slice of the original frame and pandas
# emits SettingWithCopyWarning.
variants = variants[pathogenicity].copy()

# Step 3: overall ratio AC / AN, then the same ratio for each population
# suffix using the matching AC_<suffix> / AN_<suffix> columns.
variants["AF"] = variants["AC"] / variants["AN"]

populations = ["eas", "afr", "amr", "asj", "sas", "nfe", "fin"]
for population in populations:
    variants["AF_" + population] = variants["AC_" + population] / variants["AN_" + population]

# Step 4: write the result. `index` is left at its default, so the row labels
# are written as the first column of the file.
variants.to_csv("filtered_variants.csv")