In [1]:
# import packages
import pandas as pd
import numpy as np
import json 
from collections import Counter
import matplotlib.pyplot as plt
import os

# initial model 
import xgboost as xgb
from xgboost import XGBClassifier 
from math import ceil

from sklearn import preprocessing



---

In [2]:
# load prev model
model = XGBClassifier()
model.load_model('./output/xgb.model')

In [3]:
folder_names = []
for entry_name in os.listdir('./data/m6anet'):
    entry_path = os.path.join('./data/m6anet', entry_name)
    if os.path.isdir(entry_path):
        folder_names.append(entry_name)

In [4]:
for fname in folder_names:
    f1 = "./data/m6anet/{fname}/data.json".format(fname=fname)

    print("[INFO] Preprocessing data {fname}...".format(fname=fname))
    temp_lst = list()
    with open(f1, 'rb') as f:
        for line in f:
            temp1 = json.loads(line)
            for gene_id, reads in temp1.items():
                for read_pos, trans_vals in reads.items():
                    for trans_id, vals in trans_vals.items():
                        for val in vals:
                            row1 = (gene_id, read_pos, trans_id, val)
                            temp_lst.append(row1)

    df = pd.DataFrame(temp_lst, columns=['transcript_id', 'transcript_position', 'nucleotides', 'val'])
    temp = pd.DataFrame(df.val.tolist(), index=df.index,  columns=['0', '1', '2', '3', '4', '5', '6', '7', '8'])

    df = pd.concat([df, temp], axis=1)
    df.drop(columns=['val'], inplace=True)
    df = df.astype({'transcript_position': 'int'})

    # drop duplicates
    df.drop_duplicates(keep='first', inplace=True, ignore_index=True)

    # groupby and agg using mean 
    df = df.groupby(by=['transcript_id', 'transcript_position', 'nucleotides']).mean(numeric_only=True)
    df.reset_index(inplace=True)

    # save original `transcript_position`
    df['transcript_position0'] = df['transcript_position']
    df = df.astype({'transcript_position0': 'str'})

    # scale numerical variables
    num_var = df.select_dtypes(include=np.number).columns
    scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
    df[num_var]= scaler.fit_transform(df[num_var])

    # seperate `nucleotides`
    tmp = df['nucleotides'].str.split(pat="", expand=True)
    tmp.drop(columns=[tmp.columns[0], tmp.columns[-1]], inplace=True)
    tmp = tmp.add_prefix("p")
    tmp = pd.get_dummies(tmp)
    df = pd.concat([df, tmp], axis=1)

    # prepare features
    Xte = df[['transcript_position', '0', '1', '2', '3', '4', '5', '6', '7', '8'] + list(tmp.columns)]
    
    print("[INFO] Running inference...")
    yhat_probs = model.predict_proba(Xte, iteration_range=(0, model.best_iteration + 1))
    yhat = model.predict(Xte, iteration_range=(0, model.best_iteration + 1))
    yhat1_probs = yhat_probs[:, 1]
    df['score'] = yhat1_probs
    df['pred_label'] = yhat 

    print("[INFO] Saving predictions...")
    # df_res = df[['transcript_id', 'transcript_position0', 'score']].copy()
    # df_res.rename(columns={"transcript_position0": "transcript_position"}, inplace=True)
    df_res = df.copy()
    df_res.to_csv("./output/teamgenono_{fname}_1.csv".format(fname=fname), index=False)
    

[INFO] Preprocessing data SGNex_A549_directRNA_replicate5_run1...
[INFO] Running inference...
[INFO] Saving predictions...
[INFO] Preprocessing data SGNex_A549_directRNA_replicate6_run1...
[INFO] Running inference...
[INFO] Saving predictions...
[INFO] Preprocessing data SGNex_Hct116_directRNA_replicate3_run1...
[INFO] Running inference...
[INFO] Saving predictions...
[INFO] Preprocessing data SGNex_Hct116_directRNA_replicate3_run4...
[INFO] Running inference...
[INFO] Saving predictions...
[INFO] Preprocessing data SGNex_Hct116_directRNA_replicate4_run3...
[INFO] Running inference...
[INFO] Saving predictions...
[INFO] Preprocessing data SGNex_HepG2_directRNA_replicate5_run2...
[INFO] Running inference...
[INFO] Saving predictions...
[INFO] Preprocessing data SGNex_HepG2_directRNA_replicate6_run1...
[INFO] Running inference...
[INFO] Saving predictions...
[INFO] Preprocessing data SGNex_K562_directRNA_replicate4_run1...
[INFO] Running inference...
[INFO] Saving predictions...
[INFO] P

--- 
EDA on results of M6anet cell lines

In [5]:
file_names = []
for entry_name in os.listdir('./output'):
    entry_path = os.path.join('./output', entry_name)
    if (os.path.isfile(entry_path)):
        if os.path.splitext(entry_path)[1] == '.csv':
            file_names.append(entry_path)

file_names

['./output\\teamgenono_SGNex_A549_directRNA_replicate5_run1_1.csv',
 './output\\teamgenono_SGNex_A549_directRNA_replicate6_run1_1.csv',
 './output\\teamgenono_SGNex_Hct116_directRNA_replicate3_run1_1.csv',
 './output\\teamgenono_SGNex_Hct116_directRNA_replicate3_run4_1.csv',
 './output\\teamgenono_SGNex_Hct116_directRNA_replicate4_run3_1.csv',
 './output\\teamgenono_SGNex_HepG2_directRNA_replicate5_run2_1.csv',
 './output\\teamgenono_SGNex_HepG2_directRNA_replicate6_run1_1.csv',
 './output\\teamgenono_SGNex_K562_directRNA_replicate4_run1_1.csv',
 './output\\teamgenono_SGNex_K562_directRNA_replicate5_run1_1.csv',
 './output\\teamgenono_SGNex_K562_directRNA_replicate6_run1_1.csv',
 './output\\teamgenono_SGNex_MCF7_directRNA_replicate3_run1_1.csv',
 './output\\teamgenono_SGNex_MCF7_directRNA_replicate4_run1_1.csv']

In [10]:
df1 = pd.read_csv(file_names[3])

df1.drop(columns = ['transcript_position'])
df1.rename(columns={"transcript_position0": "transcript_position"}, inplace=True)


df1.pred_label.value_counts()

0    742503
1    560011
Name: pred_label, dtype: int64

In [11]:
df1.columns

Index(['transcript_id', 'transcript_position', 'nucleotides', '0', '1', '2',
       '3', '4', '5', '6', '7', '8', 'transcript_position', 'p1_A', 'p1_C',
       'p1_G', 'p1_T', 'p2_A', 'p2_G', 'p2_T', 'p3_A', 'p3_G', 'p4_A', 'p5_C',
       'p6_A', 'p6_C', 'p6_T', 'p7_A', 'p7_C', 'p7_G', 'p7_T', 'score',
       'pred_label'],
      dtype='object')

In [None]:
df1.filter()