In [31]:
import os
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import nbformat

from utils import *
from models.baseline import *
sns.set()

In [32]:
def get_gc_count_from_ohe(train_df):
    """
    Compute the GC percentage of the positive and the negative sequences.

    Features and labels are obtained from ``get_X_y(train_df)``. Rows whose
    label equals 1 form the positive class, rows whose label equals 0 the
    negative class.

    Returns
    -------
    tuple
        ``(pos_gc, neg_gc)`` -- for each class, the share (in percent) of the
        summed one-hot channels 1 and 2 among all four channels.
    """
    features, labels = get_X_y(train_df)

    def gc_percent(label):
        # Flatten every selected sequence to rows of 4 one-hot channels and
        # total each channel over all nucleotides of that class.
        channel_totals = features[labels == label].reshape(-1, 4).sum(axis=0)
        # Channels 1 and 2 are treated as C and G
        # (assumes the encoding order is A, C, G, T -- TODO confirm in get_X_y).
        gc_total = sum(channel_totals[1:3])
        return (gc_total * 100) / sum(channel_totals)

    return gc_percent(1), gc_percent(0)

In [33]:
def process_protein(protein_path):
    """
    Fit the baseline on every training-set variant found for one protein.

    Parameters
    ----------
    protein_path : str or pathlib.Path
        Directory containing the training variants as ``<variant>.tsv.gz``
        files. The evaluation set is read from ``test/original.tsv.gz`` in the
        ``test`` directory that sits next to ``protein_path``.

    Returns
    -------
    list of tuple
        One ``(df_type, test_auc_score, baseline, delta_gc)`` entry per
        training file, in directory-listing order. ``df_type`` is the file name
        without ``.tsv.gz``; ``delta_gc`` is the absolute difference between the
        GC percentages of the positive and negative training sequences. An
        empty list is returned when the directory holds no ``.tsv.gz`` file.

    Raises
    ------
    FileNotFoundError
        If ``protein_path`` or the sibling test file does not exist.
    """
    suffix = ".tsv.gz"
    train_dir = Path(protein_path)

    train_files = [name for name in os.listdir(train_dir) if name.endswith(suffix)]
    if not train_files:
        return []

    # Resolve the sibling directory structurally instead of slicing six
    # characters off the string, which only worked for paths ending in
    # exactly "/train" (no trailing slash, not a bare relative "train").
    test_path = train_dir.parent / "test" / "original.tsv.gz"
    # The test set is the same for every variant, so decompress it only once.
    test_df = pd.read_csv(test_path, delimiter="\t", header=0, index_col=0, compression='gzip')

    protein_specs = []
    for file in train_files:
        df_type = file[:-len(suffix)]
        train_df = pd.read_csv(train_dir / file, delimiter="\t", header=0, index_col=0, compression='gzip')

        pos_gc, neg_gc = get_gc_count_from_ohe(train_df)
        delta_gc = abs(pos_gc - neg_gc)

        # Hand over a copy so that each variant sees a pristine test frame,
        # exactly as when the file was re-read on every iteration.
        _, test_auc_score, baseline = baseline_sklearn(train_df, test_df.copy())
        protein_specs.append((df_type, test_auc_score, baseline, delta_gc))

    return protein_specs


def logreg_feature_importance(baseline, df_type, test_auc_score, name, delta_gc):
    """
    Show a bar chart of the model coefficients, grouped by sequence position.

    The flattened ``baseline.coef_`` is read as consecutive groups of four
    values, one per nucleotide in the order A, C, G, T, so feature ``i``
    belongs to position ``i // 4``. Each position is drawn as a group of four
    bars coloured by nucleotide.

    Parameters
    ----------
    baseline : fitted estimator exposing ``coef_``
        ``coef_`` is flattened with ``ravel()``; presumably a single
        coefficient row (binary classifier) -- verify for other models.
    df_type : str
        Training-set variant shown in the title.
    test_auc_score : float
        Test AUC shown in the title (two decimals).
    name : str
        Protein name shown in the title.
    delta_gc : float
        GC-content difference shown in the title (two decimals).

    Raises
    ------
    ValueError
        If the number of coefficients is not a multiple of four, because the
        values then cannot be assigned to nucleotides.
    """
    nucleotides = ["A", "C", "G", "T"]
    importances = baseline.coef_.ravel()

    n_features = len(importances)
    if n_features % len(nucleotides) != 0:
        raise ValueError(
            f"expected a multiple of {len(nucleotides)} coefficients "
            f"(one per nucleotide and position), got {n_features}"
        )

    # Feature i encodes nucleotide i % 4 at position i // 4, so the x axis
    # shows real sequence positions rather than the flattened feature index.
    positions = np.arange(n_features) // len(nucleotides)
    nucleotide_labels = nucleotides * (n_features // len(nucleotides))

    # "group" places the four nucleotide bars of a position side by side;
    # the default would stack them on top of each other.
    fig = px.bar(x=positions, y=importances, color=nucleotide_labels, barmode="group")

    fig.update_xaxes(title_text='Sequence position')
    fig.update_yaxes(title_text='Nucleotide importance')

    fig.update_layout(
        autosize=True,
        title = f"{name} {df_type}, TestAUC: {test_auc_score:.2f}, DeltaGC: {delta_gc:.2f}",
        font_family="Times New Roman",
        font_size=18,
        width=900,
        height=500,
        showlegend=True)

    fig.show()

In [34]:
# PUM2 (rbp31 data): fit the baseline on every training variant, print the
# raw results, then draw one coefficient plot per variant.
PUM2_RBP31 = Path("/home/mrkvrbl/Diplomka/Data/rbp31/PARCLIP_PUM2/train")
PUM2_specs = process_protein(PUM2_RBP31)
print(PUM2_specs)

for entry in PUM2_specs:
    df_type, AUC, baseline, delta_gc = entry
    logreg_feature_importance(
        baseline=baseline,
        df_type=df_type,
        test_auc_score=AUC,
        name="PUM2",
        delta_gc=delta_gc,
    )

[('sameGC', 0.669932125, LogisticRegression(max_iter=200), 0.002846534653468069), ('original', 0.8745143750000001, LogisticRegression(max_iter=200), 16.43516914191419), ('shuffled', 0.8580318124999999, LogisticRegression(max_iter=200), 16.43516914191419)]


In [42]:
# PUM2 (rbp24 data): same analysis as for the rbp31 copy of this protein.
# NOTE: this rebinds PUM2_specs, replacing the rbp31 results stored under
# the same name by the rbp31 PUM2 cell.
PUM2_RBP24 = Path("/home/mrkvrbl/Diplomka/Data/rbp24/processed/PARCLIP_PUM2/train")
PUM2_specs = process_protein(PUM2_RBP24)
print(PUM2_specs)

for entry in PUM2_specs:
    df_type, AUC, baseline, delta_gc = entry
    logreg_feature_importance(
        baseline=baseline,
        df_type=df_type,
        test_auc_score=AUC,
        name="PUM2",
        delta_gc=delta_gc,
    )

[('sameGC', 0.7588519999999999, LogisticRegression(max_iter=200), 0.04547088487767681), ('original', 0.866864, LogisticRegression(max_iter=200), 12.371583768440253), ('shuffled', 0.8204840000000001, LogisticRegression(max_iter=200), 12.371583768440253)]


In [35]:
# ELAVL1 (rbp31 data): fit the baseline on every training variant, print the
# raw results, then draw one coefficient plot per variant.
ELAVL1_RBP31 = Path("/home/mrkvrbl/Diplomka/Data/rbp31/PARCLIP_ELAVL1/train")
ELAVL1_specs = process_protein(ELAVL1_RBP31)
print(ELAVL1_specs)

for entry in ELAVL1_specs:
    df_type, AUC, baseline, delta_gc = entry
    logreg_feature_importance(
        baseline=baseline,
        df_type=df_type,
        test_auc_score=AUC,
        name="ELAVL1",
        delta_gc=delta_gc,
    )

[('sameGC', 0.6482538125, LogisticRegression(max_iter=200), 0.05301155115511591), ('original', 0.8604459999999999, LogisticRegression(max_iter=200), 14.052268976897693), ('shuffled', 0.8424433125, LogisticRegression(max_iter=200), 14.052268976897693)]


In [36]:
# QKI (rbp31 data): fit the baseline on every training variant, print the
# raw results, then draw one coefficient plot per variant.
QKI_RBP31 = Path("/home/mrkvrbl/Diplomka/Data/rbp31/PARCLIP_QKI/train")
QKI_specs = process_protein(QKI_RBP31)
print(QKI_specs)

for entry in QKI_specs:
    df_type, AUC, baseline, delta_gc = entry
    logreg_feature_importance(
        baseline=baseline,
        df_type=df_type,
        test_auc_score=AUC,
        name="QKI",
        delta_gc=delta_gc,
    )

[('sameGC', 0.6306451875000001, LogisticRegression(max_iter=200), 0.05441419141914139), ('original', 0.8137301250000001, LogisticRegression(max_iter=200), 13.09719471947195), ('shuffled', 0.7975592499999999, LogisticRegression(max_iter=200), 13.09719471947195)]


In [37]:
# MOV10 (rbp24 data): fit the baseline on every training variant, print the
# raw results, then draw one coefficient plot per variant.
MOV10_RBP24 = Path("/home/mrkvrbl/Diplomka/Data/rbp24/processed/PARCLIP_MOV10_Sievers/train")
MOV10_specs = process_protein(MOV10_RBP24)
print(MOV10_specs)

for entry in MOV10_specs:
    df_type, AUC, baseline, delta_gc = entry
    logreg_feature_importance(
        baseline=baseline,
        df_type=df_type,
        test_auc_score=AUC,
        name="MOV10",
        delta_gc=delta_gc,
    )

[('sameGC', 0.615812, LogisticRegression(max_iter=200), 0.053384281788268595), ('original', 0.7191400000000001, LogisticRegression(max_iter=200), 7.945780153019726), ('shuffled', 0.70424, LogisticRegression(max_iter=200), 7.945780153019726)]


In [38]:
# HNRNPC (rbp24 data): fit the baseline on every training variant, print the
# raw results, then draw one coefficient plot per variant.
HNRNPC_RBP24 = Path("/home/mrkvrbl/Diplomka/Data/rbp24/processed/ICLIP_HNRNPC/train")
HNRNPC_specs = process_protein(HNRNPC_RBP24)
print(HNRNPC_specs)

for entry in HNRNPC_specs:
    df_type, AUC, baseline, delta_gc = entry
    logreg_feature_importance(
        baseline=baseline,
        df_type=df_type,
        test_auc_score=AUC,
        name="HNRNPC",
        delta_gc=delta_gc,
    )

[('sameGC', 0.904088, LogisticRegression(max_iter=200), 0.00257139767415282), ('original', 0.9069959999999999, LogisticRegression(max_iter=200), 5.5240465521614155), ('shuffled', 0.6423719999999999, LogisticRegression(max_iter=200), 5.5240465521614155)]


In [39]:
# elF4AIII (rbp31 data): fit the baseline on every training variant, print
# the raw results, then draw one coefficient plot per variant.
elF4AIII_RBP31 = Path("/home/mrkvrbl/Diplomka/Data/rbp31/CLIPSEQ_elF4AIII_2/train")
elF4AIII_specs = process_protein(elF4AIII_RBP31)
print(elF4AIII_specs)

for entry in elF4AIII_specs:
    df_type, AUC, baseline, delta_gc = entry
    logreg_feature_importance(
        baseline=baseline,
        df_type=df_type,
        test_auc_score=AUC,
        name="elF4AIII",
        delta_gc=delta_gc,
    )

[('sameGC', 0.5478076249999999, LogisticRegression(max_iter=200), 0.0428107378258602), ('original', 0.54969775, LogisticRegression(max_iter=200), 0.38764039937017003), ('shuffled', 0.5058195625, LogisticRegression(max_iter=200), 0.38764039937017003)]


In [40]:
# Scratch check: number of characters in one hard-coded nucleotide string
# (the recorded output is 128). NOTE(review): the origin of this sequence is
# not shown here -- presumably an example from one of the data sets; confirm
# or drop the cell.
len("CAAGCAGGTGGGGACTCCCTCCCCGTGAATGGAGGCCACCCTTCCATCCATGGCATACGTGAACAAGAAACTGTTTTCACCTTTACGGATAGTAGCGTCAGCAGTAGTGTCTGAGACTCCATCATTTC")

128