In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import random

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

In [2]:
# Read the three HDF tables, in the same order as before, and bind them to the
# names used by the rest of the notebook (df_rna, df_dnase, df_gene_info).
df_rna, df_dnase, df_gene_info = (
    pd.read_hdf(hdf_path)
    for hdf_path in ("rna_scaled.hdf", "dnase_scaled.hdf", "df_gene_info.hdf")
)

In [3]:
# find_gene_list accepts a DNase position string ("chr<id>-<start>-<end>"), the df_gene_info data frame, and a distance,
# and returns the index labels (gene names) of the genes whose start lies within that distance of the DNase interval
def find_gene_list(dnase_position, df_gene_info, distance):
    """Return the index labels of genes that start near a DNase interval.

    Parameters
    ----------
    dnase_position : str
        Interval written as "<chromosome>-<start>-<end>", e.g. "chr1-100-200".
    df_gene_info : pandas.DataFrame
        Gene table with a "chr" column (compared against the chromosome name
        with its "chr" prefix removed) and a "start" column.
    distance : int
        Amount by which the interval is widened on each side.

    Returns
    -------
    pandas.Index
        Index labels of the rows on the same chromosome whose "start" lies in
        [start - distance, end + distance], both ends inclusive.
    """
    fields = dnase_position.split("-")
    # The gene table stores chromosome ids without the "chr" prefix.
    chromosome = fields[0].replace("chr", "")
    window_lo = int(fields[1]) - distance
    window_hi = int(fields[2]) + distance

    # One combined mask instead of three successive filters.
    on_chromosome = df_gene_info["chr"] == chromosome
    in_window = df_gene_info["start"].between(window_lo, window_hi)
    return df_gene_info[on_chromosome & in_window].index

In [4]:
#take the first 500 columns (DNase sites) as samples
# .iloc selects by position explicitly: the column labels are position strings, so the
# label-style lookup df_dnase[list(range(500))] raises KeyError on current pandas.
# .copy() makes `samples` independent of df_dnase, so edits to it cannot reach the source table.
samples = df_dnase.iloc[:, :500].copy()
samples

position,chr1-100009480-100010443,chr1-100014419-100015173,chr1-100023083-100024439,chr1-10002354-10004062,chr1-100035927-100036679,chr1-100045144-100046572,chr1-100056080-100057113,chr1-100064269-100067014,chr1-10010120-10011675,chr1-100109783-100114190,...,chr1-110518335-110520863,chr1-110521191-110523948,chr1-110525979-110530178,chr1-110536565-110536851,chr1-110538791-110540771,chr1-110546188-110547474,chr1-110553934-110555240,chr1-110572176-110573476,chr1-110573682-110574283,chr1-110576423-110578345
tissue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
H1_BMP4_Derived_Mesendoderm_Cultured_Cells,0.0,0.0,0.0,0.873887,0.0,0.0,0.0,0.606913,0.715858,0.656221,...,0.0,0.462965,0.787131,0.0,0.0,0.890835,0.572028,0.968003,0.595887,0.74914
Penis_Foreskin_Keratinocyte_Primary_Cells_skin02,1.0,0.0,0.672303,0.765626,0.0,0.888493,0.750566,0.719029,0.662114,0.755773,...,0.764963,0.586306,0.63039,0.0,0.0,0.917945,0.58795,0.675156,0.583696,0.767934
H1_BMP4_Derived_Trophoblast_Cultured_Cells,0.0,0.0,0.0,0.62236,0.0,0.444284,0.0,0.627059,0.47004,0.595829,...,0.605684,0.742073,0.711262,0.0,0.775274,0.810631,0.79938,0.757491,0.0,0.564568
Breast_vHMEC,0.838002,0.0,0.697993,0.769016,0.0,1.0,0.963854,0.607513,0.533778,1.0,...,0.779467,0.703276,0.737806,0.51746,0.0,0.967712,0.743348,0.602562,0.0,0.768179
H1_Derived_Mesenchymal_Stem_Cells,0.680418,0.52214,0.519866,0.81377,0.0,0.0,0.598469,0.971144,0.740258,0.553962,...,0.621695,0.766537,0.799247,0.0,0.787353,0.924783,0.805039,0.919566,0.590259,0.902224
Fetal_Intestine_Small,0.0,0.0,0.0,0.768473,0.0,0.0,0.0,0.0,0.657892,0.0,...,0.0,0.464682,0.868285,0.0,0.0,0.667278,0.68192,0.710761,0.0,0.720148
Fetal_Intestine_Large,0.0,0.0,0.0,0.786875,0.0,0.0,0.0,0.0,0.450724,0.0,...,0.0,0.41564,0.911298,0.0,0.0,0.840901,0.509309,0.685165,0.0,0.710948
H1_Derived_Neuronal_Progenitor_Cultured_Cells,0.0,0.0,0.0,0.846682,0.0,0.0,0.0,0.548052,0.701233,0.0,...,0.0,0.0,0.897637,0.0,0.0,0.577373,0.0,0.827973,0.0,0.762374
Psoas_Muscle,0.0,0.0,0.0,0.869609,0.748278,0.0,0.659742,0.464115,0.563889,0.963133,...,0.960624,0.772163,0.810181,0.0,0.412506,0.8588,0.467869,0.590649,0.0,0.799403
Gastric,0.0,0.705144,0.0,0.697079,0.0,0.0,0.0,0.0,0.638925,0.750677,...,0.0,0.754702,0.767555,0.75284,0.0,0.84302,0.542439,0.679703,0.0,0.677673


## MLP (multi-layer perceptron) classification method

In [20]:
#train one MLP classifier per sampled DNase site and record its held-out accuracy
accuracy_list = []
for site in list(samples.columns):
    #genes whose start lies within a distance of 1000000 of the site are the predictors
    RNA_list = list(find_gene_list(site, df_gene_info, 1000000))
    #define input dataframe and output series
    rna_input = df_rna[RNA_list]
    #binarize into a NEW series (non-zero signal -> 1.0, zero -> 0.0) instead of assigning
    #through samples[site], which could silently overwrite the scaled values kept in samples
    output = (samples[site] != 0).astype(float)
    #randomly split into training (80%) and testing (20%) dataset
    #NOTE: the split is unseeded, so the accuracies change from run to run
    train_x, test_x = train_test_split(rna_input, test_size=0.2)
    train_y = output.loc[train_x.index]
    test_y = output.loc[test_x.index]
    #train the model using MLPC method
    MLPC = MLPClassifier(solver='lbfgs', alpha=1e-5,
                         hidden_layer_sizes=(5, 2), random_state=1)
    MLPC.fit(train_x, train_y)
    #calculate the accuracy
    pred_y = MLPC.predict(test_x)
    #compare position by position: test_y is indexed by row labels, so integer lookups such as
    #test_y[0] depend on a deprecated positional fallback
    correct = int((pred_y == test_y.to_numpy()).sum())
    #stored as a "<percent>%" string because the averaging cell parses exactly this format
    accuracy = repr((correct / float(len(test_x))) * 100) + "%"
    accuracy_list.append(accuracy)

In [7]:
# show the per-site test accuracies (percentage strings) collected by the loop above
accuracy_list

['50.0%',
 '75.0%',
 '25.0%',
 '100.0%',
 '75.0%',
 '75.0%',
 '75.0%',
 '50.0%',
 '100.0%',
 '100.0%',
 '100.0%',
 '25.0%',
 '75.0%',
 '25.0%',
 '75.0%',
 '50.0%',
 '25.0%',
 '75.0%',
 '100.0%',
 '75.0%',
 '25.0%',
 '75.0%',
 '100.0%',
 '25.0%',
 '50.0%',
 '50.0%',
 '100.0%',
 '25.0%',
 '100.0%',
 '50.0%',
 '100.0%',
 '25.0%',
 '100.0%',
 '25.0%',
 '50.0%',
 '50.0%',
 '75.0%',
 '50.0%',
 '100.0%',
 '50.0%',
 '75.0%',
 '75.0%',
 '100.0%',
 '75.0%',
 '100.0%',
 '25.0%',
 '75.0%',
 '100.0%',
 '100.0%',
 '50.0%',
 '100.0%',
 '100.0%',
 '50.0%',
 '50.0%',
 '75.0%',
 '25.0%',
 '50.0%',
 '75.0%',
 '50.0%',
 '100.0%',
 '50.0%',
 '75.0%',
 '75.0%',
 '100.0%',
 '75.0%',
 '50.0%',
 '50.0%',
 '75.0%',
 '50.0%',
 '25.0%',
 '100.0%',
 '25.0%',
 '50.0%',
 '75.0%',
 '75.0%',
 '75.0%',
 '50.0%',
 '100.0%',
 '50.0%',
 '75.0%',
 '100.0%',
 '50.0%',
 '100.0%',
 '0.0%',
 '25.0%',
 '75.0%',
 '25.0%',
 '50.0%',
 '25.0%',
 '25.0%',
 '75.0%',
 '75.0%',
 '50.0%',
 '25.0%',
 '50.0%',
 '75.0%',
 '75.0%',
 '100.0%

In [23]:
#calculate the average accuracy for different classification methods
#accumulate under a name that does not shadow the built-in sum() for the rest of the session
total_accuracy = 0.0
for accuracy_str in accuracy_list:
    #entries look like '75.0%'; keep only the number in front of the percent sign
    total_accuracy += float(accuracy_str.split('%')[0])
avg_accuracy = repr(total_accuracy/len(accuracy_list))+"%"
avg_accuracy

'68.5%'

## Decision Trees classification method

In [22]:
#Decision Trees
from sklearn import tree

In [24]:
#train one decision tree per sampled DNase site and record its held-out accuracy
accuracy_list = []
for site in list(samples.columns):
    #genes whose start lies within a distance of 1000000 of the site are the predictors
    RNA_list = list(find_gene_list(site, df_gene_info, 1000000))
    #define input dataframe and output series
    rna_input = df_rna[RNA_list]
    #binarize into a NEW series (non-zero signal -> 1.0, zero -> 0.0) instead of assigning
    #through samples[site], which could silently overwrite the scaled values kept in samples
    output = (samples[site] != 0).astype(float)
    #randomly split into training (80%) and testing (20%) dataset
    #NOTE: neither the split nor the tree is seeded, so the accuracies change from run to run
    train_x, test_x = train_test_split(rna_input, test_size=0.2)
    train_y = output.loc[train_x.index]
    test_y = output.loc[test_x.index]
    #train the model using Decision Tree Classifier method
    DTC = tree.DecisionTreeClassifier()
    DTC.fit(train_x, train_y)
    #calculate the accuracy
    pred_y = DTC.predict(test_x)
    #compare position by position: test_y is indexed by row labels, so integer lookups such as
    #test_y[0] depend on a deprecated positional fallback
    correct = int((pred_y == test_y.to_numpy()).sum())
    #stored as a "<percent>%" string because the averaging cell parses exactly this format
    accuracy = repr((correct / float(len(test_x))) * 100) + "%"
    accuracy_list.append(accuracy)

In [25]:
#calculate the average accuracy for different classification methods
#accumulate under a name that does not shadow the built-in sum() for the rest of the session
total_accuracy = 0.0
for accuracy_str in accuracy_list:
    #entries look like '75.0%'; keep only the number in front of the percent sign
    total_accuracy += float(accuracy_str.split('%')[0])
avg_accuracy = repr(total_accuracy/len(accuracy_list))+"%"
avg_accuracy

'66.6%'