In [None]:
def get_panel_df(sheet, path="/Users/martinszyska/Dropbox/Icke/Work/Topics/CONAN/LungCancerPanels.xlsx"):
    """
    Load one sheet of the panel workbook and parse it into Gene/ID/Exon/Intron.

    Parameters
    ----------
    sheet : str
        Name of the worksheet to read (passed to ``pd.read_excel`` as ``sheet_name``).
    path : str, optional
        Location of the workbook. Defaults to the path this notebook was
        originally written against, so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``Gene``, ``ID``, ``Exon``, ``Intron`` (in that order), one row
        per row of the sheet. ``Intron`` is ``""`` where the sheet names no
        intron; ``Exon`` is left as NaN where the sheet names no exon.
    """
    raw = pd.read_excel(path, sheet_name=sheet)

    # "Gen Transkript" holds "<GENE> <NM_id>[.version]"; the version suffix is not captured
    gene_id = raw["Gen Transkript"].str.strip(" ").str.extract(r"(?P<Gene>[A-Z0-9]+) +(?P<ID>NM_[0-9]+)")

    # "Genbereiche" holds free text such as "Exon 18-21 Intron 4": take the
    # non-letter run following each keyword and drop blanks and "+" signs.
    regions = raw["Genbereiche"]
    exon = regions.str.extract(r"Exon ?(?P<Exon>[^A-Za-z]+)", expand=False).str.replace("[ +]", "", regex=True)
    intron = (
        regions.str.extract(r"Intron ?(?P<Intron>[^A-Za-z]+)", expand=False)
        .str.replace("[ +]", "", regex=True)
        .fillna("")
    )

    # Build the result explicitly rather than enlarging the loaded frame via .loc
    return pd.DataFrame({"Gene": gene_id["Gene"], "ID": gene_id["ID"], "Exon": exon, "Intron": intron})

In [None]:
# Parse the nNGM panel sheet and preview its first rows
nngn = get_panel_df(sheet="nNGM")
nngn.head(3)

In [None]:
# Parse the Oncomine Focus panel sheet and preview its first rows
oncomine = get_panel_df(sheet="Oncomine Focus")
oncomine.head(3)

# GET COSMIC DATA into the mix

### first get the RefSeq data

In [None]:
ens = pd.read_csv("/Users/martinszyska/Dropbox/Icke/Work/10x/ref/RefSeq_Exons.csv", sep="\t")
# remove the versions (raw string: "\." is an invalid escape in a normal string literal)
for col in ["ID", "ENSID"]:
    ens[col] = ens[col].str.replace(r"\.[0-9]+$", "", regex=True)
# Replace the column outright: the values change from numbers to "Exon<n>" labels,
# and an in-place `.loc[:, col] =` write would have to change the column dtype.
ens["Exon"] = "Exon" + ens["Exon"].astype(str)
ens

In [None]:
# Only columns 0-5 and 8 are used downstream, so select them while parsing
# instead of loading the whole table and slicing afterwards.
cosmic = pd.read_csv(
    "/Users/martinszyska/Dropbox/Icke/Work/static/annotation/annovar/humandb/hg38_cosmic94.txt",
    sep="\t",
    usecols=[0, 1, 2, 3, 4, 5, 8],
).rename({"#Chr": "Chr"}, axis=1)
cosmic[:3]

In [None]:
def expand_range(string):
    """
    Expand a "start-end" token into the list of integers it spans (inclusive).

    A token without "-" is returned unchanged as a one-element list of str.
    """
    first, *rest = string.split('-')
    if not rest:
        # no dash: hand the token back as-is
        return [first]

    lower, upper = int(first), int(rest[0])
    return [*range(lower, upper + 1)]


def get_rows(row):
    """
    Expand one panel row into one output row per listed exon / intron.

    ``row["Exon"]`` and ``row["Intron"]`` are comma-separated lists whose
    items are single numbers or "start-end" ranges (expanded through
    ``expand_range``). Each item becomes a label such as "Exon19" or
    "Intron4" in the ``Exon`` column of the result.

    Empty items (an empty spec, or the tail of a trailing comma like
    "18-21,") and non-string specs such as NaN are skipped, so no bare
    "Exon"/"Intron" labels are produced and a missing spec does not raise.

    Returns
    -------
    pandas.DataFrame
        Columns ``Gene``, ``ID``, ``Exon``; possibly empty.
    """
    labels = []
    for t in ["Exon", "Intron"]:
        spec = row[t]
        if not isinstance(spec, str):
            # NaN / missing spec: nothing to expand for this region type
            continue
        for item in spec.split(","):
            if item == "":
                continue
            labels.extend(f"{t}{x}" for x in expand_range(item))

    df = pd.DataFrame({"Exon": labels})
    # add data
    for col in ["Gene", "ID"]:
        df[col] = row[col]
    return df.loc[:, ["Gene", "ID", "Exon"]]


def expand_df(df, ens_df):
    """
    Expand every panel row via ``get_rows`` and annotate the result with ``ens_df``.

    "*" characters are removed from the ``Exon`` labels before a left merge
    with ``ens_df`` on the columns the two frames share, so regions without
    a match are kept with missing values.
    """
    per_row = [get_rows(panel_row) for _, panel_row in df.iterrows()]
    expanded = pd.concat(per_row)
    expanded = expanded.assign(Exon=expanded["Exon"].str.replace("*", "", regex=False))
    return expanded.merge(ens_df, how="left")

In [None]:
# One row per nNGM exon/intron, annotated with the RefSeq exon table
nngn_df = expand_df(df=nngn, ens_df=ens)
nngn_df

In [None]:
onco_df = expand_df(oncomine, ens)
# Rows whose Chr is missing (NaN != NaN). Only a cell's last expression is
# rendered, so this check has to be displayed explicitly to be visible
# (`display` is the built-in provided by the IPython kernel).
display(onco_df.query("Chr != Chr"))
onco_df

### merge with the relevant cosmic mutations

In [None]:
def cosmic_exon(row, cosmic_df=None):
    """
    Return the COSMIC entries that lie completely inside one exon/intron.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must provide ``Chr``, ``Start``, ``End``, ``ID``, ``Gene`` and ``Exon``.
    cosmic_df : pandas.DataFrame, optional
        Table with ``Chr``, ``Start`` and ``End`` columns to search. Defaults
        to the notebook-level ``cosmic`` frame.

    Returns
    -------
    pandas.DataFrame
        A new frame (the searched table is not modified) holding the rows
        with the same ``Chr`` and ``Start >= row.Start`` and
        ``End <= row.End``, with ``ID``, ``Gene`` and ``Exon`` set from ``row``.
        Empty when ``row['Chr']`` is NaN, since NaN never compares equal.
    """
    if cosmic_df is None:
        cosmic_df = cosmic

    chrom, start, end = row['Chr'], row['Start'], row['End']
    inside = (cosmic_df["Chr"] == chrom) & (cosmic_df["Start"] >= start) & (cosmic_df["End"] <= end)
    # .assign works on a copy, avoiding writes into a filtered slice
    return cosmic_df.loc[inside].assign(ID=row["ID"], Gene=row["Gene"], Exon=row["Exon"])

### merge all exons for cosmic file

In [None]:
# Pool both panels, keep one row per (ID, Exon) pair and order by gene
all_df = (
    pd.concat([onco_df, nngn_df])
    .drop_duplicates(subset=["ID", "Exon"])
    .sort_values(by=['Gene', 'Exon', 'ID'])
)
all_df

In [None]:
# Spot check: COSMIC entries falling into the first Oncomine region
cosmic_exon(onco_df.iloc[0])

In [None]:
# Collect the COSMIC entries of every panel region
cos_dfs = [cosmic_exon(row) for _, row in all_df.iterrows()]
# ignore_index: each piece carries row labels of the `cosmic` table, so an entry
# that falls into more than one region would otherwise appear under a duplicated
# label, and label-based alignment later on would mix those rows up.
cos_df = pd.concat(cos_dfs, ignore_index=True).sort_values(['Gene', 'Start', 'End'])
cos_df

### calculate the clinscore

In [None]:
from yaml import CLoader as Loader, load

def load_scores(clinscore_file):
    '''
    Read a clinscore YAML file and return its "type" and "location" entries.

    Returns a tuple ``(type, location)`` taken from the top-level keys of
    the parsed document.
    '''

    with open(clinscore_file, "r") as stream:
        scores = load(stream, Loader=Loader)

    type_scores = scores['type']
    location_scores = scores['location']
    return type_scores, location_scores




def get_cosmic_score(df, clinscore_file):
    '''
    Compute the clinscore of every row from a clinscore YAML file.

    ``df['cosmic94_type']`` is scanned for entries of the form
    ``<count>x(<type>)`` or ``<count>x(<type>@<location>)``. Each entry
    contributes ``(1 + type_score + location_score) * count``, where unknown
    types/locations score 0; the contributions of a row are summed.

    The result is written to ``df['cosmic_score']`` as integers (0 for rows
    without any matching entry) and the same, modified ``df`` is returned.

    Scores are matched to rows by position, so rows sharing an index label
    are scored independently instead of being summed together.
    '''

    type_score, loc_score = load_scores(clinscore_file)

    def cosmic_score(row):
        """
        score of a single extracted entry
        """
        cos_score = 1 + type_score.get(row["types"], 0) + loc_score.get(row["location"], 0)

        return cos_score * int(row["count"])

    cosmic90_pattern = (r"(?P<count>[0-9]+)x\((?P<types>[^0-9@)]+)(?:@(?P<location>[^0-9@)]+))?\)")

    # Work on a positional index: the caller's labels may not be unique.
    annotations = (
        df['cosmic94_type']
        .reset_index(drop=True)
        .str.replace("_(sclerosing_haemangioma)", "", regex=False)
    )
    matches = annotations.str.extractall(cosmic90_pattern)

    if matches.empty:
        # nothing matched anywhere (or df is empty)
        df["cosmic_score"] = 0
        return df

    per_entry = matches.apply(cosmic_score, axis=1)
    # level 0 of the extractall index is the row position; rows without any
    # entry are missing from the groupby result and get 0.
    per_row = per_entry.groupby(level=0).sum().reindex(range(len(df)), fill_value=0)
    df["cosmic_score"] = per_row.fillna(0).astype(int).to_numpy()
    return df

In [None]:
clinscore_file = "/Users/martinszyska/Dropbox/Icke/Work/Topics/CONAN/clinscoreLung2.yaml"

cosmic_all = get_cosmic_score(cos_df, clinscore_file)
# Replace the column instead of writing through `.loc[:, ...]`: an in-place write
# keeps the existing (float) dtype, so the cast to int would have no effect.
cosmic_all["cosmic_score"] = cosmic_all['cosmic_score'].fillna(0).astype(int)
cosmic_all

In [None]:
# Entries whose clinscore exceeds 10000
cosmic_all.query(expr='cosmic_score > 10000')

In [None]:
# Write the scored COSMIC entries of all panels as a tab-separated file
cosmic_all.to_csv(
    "/Users/martinszyska/Dropbox/Icke/Work/Topics/CONAN/cosmic_AllPanels2.csv",
    index=False,
    sep="\t",
)

In [None]:
# Export only the entries with a clinscore above 10000, indexed by Gene / ID / Exon
(
    cosmic_all
    .query('cosmic_score > 10000')
    .set_index(keys=["Gene", "ID", "Exon"])
    .to_excel("/Users/martinszyska/Dropbox/Icke/Work/Topics/CONAN/cosmic_AllPanels_select.xlsx")
)

### calculate the sum per exon/ID

In [None]:
# Total clinscore per (Gene, Exon, ID), ordered by gene, transcript and exon
cosmic_exons = (
    cosmic_all
    .groupby(by=["Gene", "Exon", "ID"])['cosmic_score']
    .sum()
    .reset_index()
    .sort_values(by=['Gene', "ID", "Exon"])
)
cosmic_exons

In [None]:
# First three per-exon totals
cosmic_exons.head(3)

### merge with Oncomine

In [None]:
# First three expanded Oncomine regions
onco_df.head(3)

In [None]:
# Flag every scored exon that is covered by the Oncomine panel.
# The panel keys are de-duplicated so the left merge cannot multiply rows.
cosmic_merge = cosmic_exons.merge(
    onco_df.loc[:, ["ID", "Exon"]].drop_duplicates(),
    how="left",
    on=["ID", "Exon"],
    indicator="Oncomine",
)
# Replace the categorical merge indicator with a 0/1 flag. Plain column assignment
# swaps the column; writing ints through `.loc` would target the categorical in place.
cosmic_merge["Oncomine"] = (cosmic_merge["Oncomine"] == "both").astype(int)
cosmic_merge

### merge with nNGM

In [None]:
# First three expanded nNGM regions
nngn_df.head(3)

In [None]:
# Flag every scored exon that is covered by the nNGM panel.
# Flag columns from an earlier run of this cell are dropped first, so re-running
# it does not create duplicate "nNGM"/"both" columns; the panel keys are
# de-duplicated so the left merge cannot multiply rows.
cosmic_merge = cosmic_merge.drop(columns=["nNGM", "both"], errors="ignore").merge(
    nngn_df.loc[:, ["ID", "Exon"]].drop_duplicates(),
    how="left",
    on=["ID", "Exon"],
    indicator="nNGM",
)
# Plain column assignment replaces the categorical indicator with a 0/1 flag
cosmic_merge["nNGM"] = (cosmic_merge["nNGM"] == "both").astype(int)
# 1 only where the exon is on both panels
cosmic_merge["both"] = (cosmic_merge["nNGM"] + cosmic_merge["Oncomine"] == 2).astype(int)
cosmic_merge

In [None]:
# Export the per-exon scores with their panel flags, ordered by gene, transcript and exon
(
    cosmic_merge
    .sort_values(by=["Gene", "ID", "Exon"])
    .to_excel(
        "/Users/martinszyska/Dropbox/Icke/Work/Topics/CONAN/cosmic_AllPanels_exons.xlsx",
        index=False,
    )
)