In [1]:
import os
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

#Visualization Tools

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# AI Workflow Module
from model_classifier import ModelClassifier, CustomClassifier

# Models
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Use for saving model 
import joblib

# Environment Parameters

plt.rcParams['figure.figsize'] = [15, 8]
plt.rcParams['figure.dpi'] = 100 
plt.style.use('ggplot')

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
# Get PPI genes
ppi_features_df = pd.read_csv('../data/skcm_ppi_betweenness_centrality.csv', index_col=0)

top10_ppi_features = ppi_features_df.betweenness_centrality.head(10).index.values
top20_ppi_features = ppi_features_df.betweenness_centrality.head(20).index.values


In [3]:
# Get RF genes
rf_features_df = pd.read_csv('../data/Melanoma_RF_weights_all_genomic_data.csv',index_col=0)

top10_rf_features = rf_features_df.weights.head(10).index.values
top20_rf_features = rf_features_df.weights.head(20).index.values
top30_rf_features = rf_features_df.weights.head(30).index.values

In [12]:
genes = set(top10_ppi_features) | set(top10_rf_features) | set(top20_rf_features)

In [13]:
genes = list(genes)

In [14]:
len(genes)

26

In [15]:
genes

['S100A7A',
 'RPN2',
 'KRT6B',
 'SERPINB4',
 'IL20RB',
 'AFAP1-AS1',
 'WFDC5',
 'KRTDAP',
 'PVRL4',
 'GNG2',
 'KRT14',
 'KRT17',
 'PAX1',
 'GSR',
 'C7',
 'ZSWIM7',
 'ZNF653',
 'FKBP1B',
 'PRG2',
 'CLEC2A',
 'RPS28',
 'PC',
 'S100A7',
 'GRIK5',
 'DMBT1',
 'TSHR']

In [8]:
(set(genes) & set(top20_rf_features)) - (set(genes) & set(top10_ppi_features))

{'AFAP1-AS1',
 'CLEC2A',
 'DMBT1',
 'FKBP1B',
 'IL20RB',
 'KRT14',
 'KRT17',
 'KRT6B',
 'KRTDAP',
 'PAX1',
 'PRG2',
 'PVRL4',
 'S100A7A',
 'WFDC5',
 'ZNF653',
 'ZSWIM7'}

In [11]:
set(ppi_features_df.betweenness_centrality.head(10).index.values) - set(top30_rf_features) 

{'GNG2', 'GSR', 'PC', 'RPN2', 'RPS28', 'TSHR'}