In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
X = pd.read_csv("gene_data.csv", index_col = 0)
y = pd.read_csv("gene_labels.csv", index_col = 0)
X = X.dropna(axis='columns')

In [3]:
X = X.drop(X.var()[X.var() < 0.001].index.values, axis=1)

In [4]:
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)
pt = PowerTransformer(method = "yeo-johnson")
X = pd.DataFrame(pt.fit_transform(X), columns = X.columns)

In [5]:
X.head()

Unnamed: 0,gene_0,gene_1,gene_2,gene_3,gene_4,gene_6,gene_7,gene_8,gene_9,gene_10,...,gene_20521,gene_20522,gene_20523,gene_20524,gene_20525,gene_20526,gene_20527,gene_20528,gene_20529,gene_20530
0,-0.213757,-0.871757,0.148834,-2.373083,1.203419,-0.228846,0.420773,-0.142766,-0.070844,0.46009,...,-1.259121,-0.9217,-0.884748,1.080779,-1.151176,0.36758,-0.911099,-1.224431,-0.02042,-0.373363
1,-0.213757,-1.797344,-1.40386,1.30077,-0.343099,-0.547056,-1.125183,-0.142766,-0.070844,-0.949409,...,-1.601084,-2.431478,-0.841433,0.513606,-2.348604,1.422809,0.585872,-0.304032,-1.872835,-0.373363
2,-0.213757,0.341956,1.16202,0.363023,0.1548,-0.409336,0.135235,-0.142766,-0.070844,-0.949409,...,-1.045556,-1.062246,2.384282,0.068805,0.275788,-3.747893,-0.707128,0.897737,-2.139582,-0.373363
3,-0.213757,0.487482,1.336004,0.012614,0.773066,0.378046,0.095305,-0.142766,-0.070844,-0.949409,...,0.146152,0.054274,0.193363,2.248039,-0.310709,-0.541223,-0.846719,0.213301,-1.128307,-0.373363
4,-0.213757,-0.39798,-0.267585,-0.191536,-0.108244,-0.765225,-0.080037,-0.142766,-0.070844,-0.949409,...,0.029564,0.21714,0.833886,1.056639,0.149942,0.24417,-1.50557,-0.186879,-0.110257,-0.373363


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 101)

In [7]:
Logit_ridge = LogisticRegressionCV(cv=10, scoring='accuracy', n_jobs=-1, multi_class = "multinomial", max_iter = 100, penalty = "l2")

In [8]:
Logit_ridge.fit(X_train, np.ravel(y_train))
y_pred = Logit_ridge.predict(X_test)

In [9]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=y.Class.unique()))

[[59  0  0  0  0]
 [ 0 16  0  0  0]
 [ 0  0 32  0  0]
 [ 0  0  0 23  0]
 [ 0  0  0  0 31]]
              precision    recall  f1-score   support

        PRAD       1.00      1.00      1.00        59
        LUAD       1.00      1.00      1.00        16
        BRCA       1.00      1.00      1.00        32
        KIRC       1.00      1.00      1.00        23
        COAD       1.00      1.00      1.00        31

    accuracy                           1.00       161
   macro avg       1.00      1.00      1.00       161
weighted avg       1.00      1.00      1.00       161



In [10]:
Logit_ridge.C_

array([21.5443469, 21.5443469, 21.5443469, 21.5443469, 21.5443469])

In [11]:
Logit_lasso = LogisticRegressionCV(cv=10, scoring='accuracy', n_jobs=-1, multi_class = "multinomial",solver = "saga", max_iter = 100, penalty = "l1")

In [12]:
Logit_lasso.fit(X_train, np.ravel(y_train))
y_pred = Logit_lasso.predict(X_test)



In [13]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=y.Class.unique()))

[[59  0  0  0  0]
 [ 0 16  0  0  0]
 [ 0  0 32  0  0]
 [ 0  0  0 23  0]
 [ 0  0  0  0 31]]
              precision    recall  f1-score   support

        PRAD       1.00      1.00      1.00        59
        LUAD       1.00      1.00      1.00        16
        BRCA       1.00      1.00      1.00        32
        KIRC       1.00      1.00      1.00        23
        COAD       1.00      1.00      1.00        31

    accuracy                           1.00       161
   macro avg       1.00      1.00      1.00       161
weighted avg       1.00      1.00      1.00       161



In [14]:
Logit_lasso.C_

array([0.04641589, 0.04641589, 0.04641589, 0.04641589, 0.04641589])

In [15]:
coef = pd.DataFrame(data = Logit_lasso.coef_, columns = X.columns, index = y.Class.unique())

In [16]:
coef = coef.loc[:, (coef != 0).any(axis=0)]

In [17]:
coef.iloc[[0]].iloc[:, np.argsort(-np.abs(coef.values[0]))[:20]]

Unnamed: 0,gene_7964,gene_15589,gene_17801,gene_18746,gene_6876,gene_6611,gene_14092,gene_357,gene_9652,gene_6748,gene_5578,gene_10731,gene_6530,gene_2910,gene_17905,gene_17904,gene_2747,gene_5407,gene_17993,gene_17916
PRAD,-0.119404,0.119164,-0.109951,0.086577,0.079084,-0.077042,0.072783,0.072482,0.067552,-0.06527,0.063029,-0.061582,0.061098,0.060942,0.053719,0.051754,0.051754,0.046624,0.045912,0.041146


In [18]:
coef.iloc[[1]].iloc[:, np.argsort(-np.abs(coef.values[1]))[:20]]

Unnamed: 0,gene_4805,gene_4804,gene_14845,gene_11652,gene_3813,gene_3532,gene_14821,gene_2037,gene_7747,gene_10412,gene_7010,gene_2575,gene_7554,gene_3523,gene_11222,gene_5829,gene_10460,gene_20208,gene_6487,gene_10098
LUAD,0.050317,0.048228,0.046956,0.041404,0.040591,0.039138,0.033147,0.028416,0.02795,0.027122,0.02576,0.024696,0.023992,0.023712,0.023455,0.019512,0.018354,-0.017526,0.017498,0.017084


In [19]:
coef.iloc[[2]].iloc[:, np.argsort(-np.abs(coef.values[2]))[:20]]

Unnamed: 0,gene_18214,gene_14114,gene_16131,gene_220,gene_16173,gene_4429,gene_450,gene_219,gene_16130,gene_16105,gene_16259,gene_16156,gene_13818,gene_16132,gene_18,gene_11566,gene_9979,gene_7116,gene_16246,gene_6760
BRCA,0.075516,0.075337,0.074638,0.069651,0.065259,0.052744,0.04643,0.044649,0.037189,0.034318,0.033696,0.028804,0.028328,0.026259,0.02072,0.020261,0.020062,0.019782,0.01874,0.018557


In [20]:
coef.iloc[[3]].iloc[:, np.argsort(-np.abs(coef.values[3]))[:20]]

Unnamed: 0,gene_15899,gene_15895,gene_13639,gene_15898,gene_15591,gene_15896,gene_11352,gene_11903,gene_7058,gene_15161,gene_3695,gene_15900,gene_19648,gene_19542,gene_8013,gene_15894,gene_9713,gene_17174,gene_15577,gene_11550
KIRC,0.130446,0.10961,0.083694,0.073549,0.066675,0.064456,0.049966,0.044358,0.04386,0.039772,0.039593,0.039049,0.033988,0.033963,-0.031214,0.030296,0.029975,0.028819,0.02686,0.026166


In [21]:
coef.iloc[[4]].iloc[:, np.argsort(-np.abs(coef.values[4]))[:20]]

Unnamed: 0,gene_6937,gene_12881,gene_12847,gene_18135,gene_15302,gene_13976,gene_12848,gene_11026,gene_3737,gene_9626,gene_12995,gene_9175,gene_12069,gene_13809,gene_13076,gene_15800,gene_17751,gene_9176,gene_11910,gene_14798
COAD,0.077472,0.060174,0.050396,0.047285,0.038767,0.037772,0.036033,0.034852,0.032742,0.030163,0.029425,0.029247,0.028844,0.02814,0.027216,0.024612,0.023882,0.023005,0.022546,0.022158


Above is the list of the top 20 genes for each type of tumor expressed. Genes like gene_7964 has a high negative value for PRAD tumor so it inhibits that tumor while gene_15899 has a high positive value for KIRC which means it benefits that type of tumor.