In [1]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.stats import zscore

In [2]:
# Relative paths to the two input matrices. Despite the `_folder` suffix,
# each variable holds the path of a single feather file, which is what
# `load_feather` is called with below.
archs4_folder = "../data/processed/ARCHS4/human_matrix_v9_filtered_n50000x962_v2.f"
l1000_folder = "../data/processed/L1000/L1000_filtered_GSE92742_Broad_LINCS_Level3_INF_mlr12k_n50000x962.f"

In [3]:
def load_feather(filename):
    """Read a feather file into a DataFrame indexed by its first column.

    The first column of the stored table becomes the row index, and the
    remaining columns are returned sorted by column label.

    Parameters
    ----------
    filename
        Path of the feather file, passed straight to ``pd.read_feather``.

    Returns
    -------
    pandas.DataFrame
        The table indexed by its first column, with columns in sorted order.
    """
    table = pd.read_feather(filename)
    index_column = table.columns[0]
    # Sorting the column axis gives frames loaded from different files the
    # same column order whenever they share the same column labels.
    return table.set_index(index_column).sort_index(axis=1)

In [4]:
# Load the ARCHS4 matrix, z-score each column independently (axis=0), and
# drop every row that contains a NaN afterwards.
archs4 = (
    load_feather(archs4_folder)
    .apply(zscore, axis=0)
    .dropna()
)

In [5]:
# Load the L1000 matrix, z-score each column independently (axis=0), and
# drop every row that contains a NaN afterwards.
l1000 = (
    load_feather(l1000_folder)
    .apply(zscore, axis=0)
    .dropna()
)

In [6]:
# Stack the two z-scored frames row-wise into a single frame; rows keep
# their original index labels.
data = pd.concat([archs4, l1000], axis=0)

In [7]:
# Shuffle all rows (frac=1.0 keeps every row). The seed is fixed so that a
# fresh "Restart & Run All" produces the same row order every time.
data = data.sample(frac=1.0, random_state=0)


In [8]:
# Display the combined, shuffled frame (pandas truncates the rendering).
data

Unnamed: 0,AARS,ABCB6,ABCC5,ABCF1,ABCF3,ABHD4,ABHD6,ABL1,ACAA1,ACAT2,...,ZMIZ1,ZMYM2,ZNF131,ZNF274,ZNF318,ZNF395,ZNF451,ZNF586,ZNF589,ZW10
CPC008_A375_6H_X1_F1B3_DUO52HI53LO:H17,-0.358790,-0.626178,-0.738544,0.773538,-1.024874,-1.018788,0.898734,0.082818,-1.217774,-0.467397,...,-0.016432,-0.237037,-0.020143,0.881456,-0.573254,-1.041896,-0.742194,-2.112831,0.384645,-0.846711
T1D.KD001_JURKAT_120H_X1_B10_DUO52HI53LO:B22,-1.403949,-1.466247,-0.909086,1.790770,-1.051098,-0.784776,1.181466,0.044839,-2.110222,1.881759,...,-0.036054,-1.237452,-0.016997,-0.932600,-0.413618,0.733563,-0.280130,-0.253415,3.189750,-0.027601
GSM2304937,1.483362,1.894040,0.694559,0.880314,0.576766,0.498788,0.415034,1.182024,0.087221,0.633973,...,-0.156906,-0.244588,0.295824,-0.140279,0.698450,0.947065,-0.105663,-0.377851,0.759849,0.666147
GSM4408987,0.537118,-0.341909,0.490139,0.253987,0.163466,-0.808488,-0.817045,0.372562,-0.635761,-0.415704,...,0.245924,1.284388,-0.292599,-0.305151,1.018099,-0.759002,0.779595,1.275807,0.373402,0.741605
GSM2265719,-1.478540,-0.539104,0.197705,-1.232146,-0.875859,-1.513142,0.723539,-1.272064,-1.242418,-0.306064,...,-0.182044,0.280336,1.015451,-0.785176,0.134675,-0.750953,2.249211,0.972416,-0.027056,1.124622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
KDB005_HCC515_96H_X3_B6_DUO52HI53LO:M01,0.807889,-0.290514,-0.013510,0.390941,0.611670,-0.790281,-0.166825,-0.254860,-0.444631,0.283527,...,-0.446013,-1.346514,-1.357751,0.162656,0.489104,0.901850,0.311351,-1.017345,-0.166115,-0.911441
GSM4103441,-1.657700,-1.048160,1.587296,-0.283071,-0.800525,-0.115351,-0.063229,-1.111236,-0.014098,-2.292202,...,0.447524,1.867779,1.073860,1.138972,0.461792,-0.913317,1.721876,0.650826,0.760996,-0.792830
LJP002_MCF7_24H_X3_F2B5_DUO52HI53LO:M05,0.814079,-0.231900,1.014575,0.264531,2.174256,1.321232,-0.454527,0.767652,0.800079,0.052894,...,0.568111,2.007714,-0.035327,0.823463,0.449821,0.276466,-0.078466,-0.771344,-0.714528,-0.528523
GSM4432278,0.612681,0.497314,0.143333,0.301858,-0.631947,0.908582,-0.629556,0.322752,-0.138247,1.189047,...,-0.081977,0.649568,0.694886,0.768956,0.100094,0.386716,-0.433980,0.490714,1.606894,-0.180315


In [9]:
# Random forest used to tell the two datasets apart. n_jobs=-1 trains on all
# available cores; the fixed random_state makes the fitted trees, and hence
# the feature importances read from this model later, reproducible.
clf = RandomForestClassifier(n_jobs=-1, random_state=0)

In [10]:
def _dataset_labels(sample_ids):
    """Return one dataset label per sample id.

    Ids that start with "GSM" are labelled "ARCHS4"; every other id is
    labelled "L1000".
    """
    return ["ARCHS4" if sample_id.startswith("GSM") else "L1000" for sample_id in sample_ids]


# Hold out 10% of the rows for evaluation. The seed is fixed so the same rows
# land in the train/test partitions on every run.
train_data, test_data = train_test_split(data, test_size=0.1, random_state=0)

# The class label is derived purely from the row index of each partition.
train_label = _dataset_labels(train_data.index)
test_label = _dataset_labels(test_data.index)

In [11]:
# Fit the classifier on the training partition; the labels are the dataset
# names derived from the row index.
clf.fit(train_data, train_label)

RandomForestClassifier(n_jobs=-1)

In [12]:
# Mean accuracy on the held-out partition.
clf.score(test_data, test_label)

1.0

In [13]:
# Map each column label of `data` to the importance the fitted forest
# assigned to that feature (both sequences are in the same column order).
feature_importance = {
    column: importance
    for column, importance in zip(data.columns, clf.feature_importances_)
}

In [14]:
# Write one "<name>\t<1 + importance>" line per feature, in sorted name order,
# while accumulating the total importance.
# NOTE: `sum` shadows the builtin of the same name. The name is kept because a
# later cell displays it; avoid calling the builtin `sum()` after this cell.
sum = 0
with open("../data/processed/gene_weights.txt", "w") as f:
    for gene in sorted(feature_importance):
        importance = feature_importance[gene]
        sum += importance
        # One write per record. No explicit flush is needed because leaving
        # the `with` block closes the file, which flushes buffered data.
        f.write(gene + "\t" + str(1 + importance) + "\n")

In [81]:
# Show the total of the importances accumulated while writing the file.
sum

1.0