## Importing Libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import plotly.express as px
import seaborn as sns
import gc
import time
from sklearn.metrics import roc_auc_score
from mpl_toolkits.mplot3d import Axes3D

## Data Loading

In [3]:
start_time = time.time()
meta_data = pd.read_csv('/content/drive/MyDrive/Multimodal_Single_Cell analysis/metadata.csv', index_col=0)
end_time = time.time()
print(end_time - start_time)
meta_data.head()

0.9138031005859375


Unnamed: 0_level_0,day,donor,cell_type,technology
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c2150f55becb,2,27678,HSC,citeseq
65b7edf8a4da,2,27678,HSC,citeseq
c1b26cb1057b,2,27678,EryP,citeseq
917168fa6f83,2,27678,NeuP,citeseq
2b29feeca86d,2,27678,EryP,citeseq


In [4]:
meta_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 281528 entries, c2150f55becb to b847ba21f59f
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   day         281528 non-null  int64 
 1   donor       281528 non-null  int64 
 2   cell_type   281528 non-null  object
 3   technology  281528 non-null  object
dtypes: int64(2), object(2)
memory usage: 10.7+ MB


In [5]:
start_time = time.time()
train = pd.read_hdf('/content/drive/MyDrive/Multimodal_Single_Cell analysis/train_multi_targets.h5', stop=33500)
end_time = time.time()
print(end_time - start_time)

27.039104461669922


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 33500 entries, 56390cf1b95e to 0f1b411a93f8
Columns: 23418 entries, ENSG00000121410 to ENSG00000074755
dtypes: float32(23418)
memory usage: 2.9+ GB


In [7]:
train = pd.merge(meta_data, train, how='inner', on=['cell_id'])
train.head()

Unnamed: 0_level_0,day,donor,cell_type,technology,ENSG00000121410,ENSG00000268895,ENSG00000175899,ENSG00000245105,ENSG00000166535,ENSG00000256661,...,ENSG00000086827,ENSG00000174442,ENSG00000122952,ENSG00000198205,ENSG00000198455,ENSG00000070476,ENSG00000203995,ENSG00000162378,ENSG00000159840,ENSG00000074755
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
56390cf1b95e,2,32606,NeuP,multiome,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.893861,0.0,0.0,0.0,0.0,5.583255,0.0,4.893861
fc0c60183c33,2,32606,HSC,multiome,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9b4a87e22ad0,2,32606,MasP,multiome,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.107832,0.0,0.0,0.0,0.0,0.0,0.0,5.107832
81cccad8cd81,2,32606,HSC,multiome,0.0,4.507936,0.0,0.0,0.0,0.0,...,0.0,5.195558,4.507936,0.0,0.0,0.0,0.0,0.0,0.0,5.195558
15cb3d85c232,2,32606,MkP,multiome,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5.531572,0.0,0.0,4.842377,0.0


In [8]:
columns_to_drop = ['day', 'donor', 'technology', 'cell_type']
X = train.drop(columns=columns_to_drop, axis=1)
X.head()

Unnamed: 0_level_0,ENSG00000121410,ENSG00000268895,ENSG00000175899,ENSG00000245105,ENSG00000166535,ENSG00000256661,ENSG00000184389,ENSG00000128274,ENSG00000094914,ENSG00000081760,...,ENSG00000086827,ENSG00000174442,ENSG00000122952,ENSG00000198205,ENSG00000198455,ENSG00000070476,ENSG00000203995,ENSG00000162378,ENSG00000159840,ENSG00000074755
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
56390cf1b95e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.893861,0.0,0.0,0.0,0.0,5.583255,0.0,4.893861
fc0c60183c33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9b4a87e22ad0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.107832,0.0,0.0,0.0,0.0,0.0,0.0,5.107832
81cccad8cd81,0.0,4.507936,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.195558,4.507936,0.0,0.0,0.0,0.0,0.0,0.0,5.195558
15cb3d85c232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5.531572,0.0,0.0,4.842377,0.0


In [9]:
y = pd.DataFrame()
y["cell_type"] = train["cell_type"]
y.head()

Unnamed: 0_level_0,cell_type
cell_id,Unnamed: 1_level_1
56390cf1b95e,NeuP
fc0c60183c33,HSC
9b4a87e22ad0,MasP
81cccad8cd81,HSC
15cb3d85c232,MkP


## Data Preprocessing

In [10]:
def label_processing(y):
  le = LabelEncoder()
  y = le.fit_transform(y.iloc[:,0])
  y = pd.DataFrame(y)
  return y.values.ravel()

In [11]:
scaler = StandardScaler()
pca = PCA()
normalized_data = scaler.fit_transform(X)

In [12]:
def pca_retain(X, variance_retained, pca):
    pca_df = pca.fit_transform(normalized_data)
    cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
    num_components = np.argmax(cumulative_variance_ratio >= variance_retained) + 1
    return (pca_df, num_components)

In [13]:
def perform_pca_test(data, variance_retained, pca):
    normalized_data = scaler.transform(X)
    pca_df = pca.transform(normalized_data)
    cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
    num_components = np.argmax(cumulative_variance_ratio >= variance_retained) + 1
    return (pca_df, num_components)

In [14]:
X1 = X.copy()

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, shuffle=True, random_state=0)

In [16]:
p1 = pca_retain(X_train, 0.95, pca)

In [17]:
p1

(array([[-1.17878711e+00,  5.04373264e+00, -7.90772533e+00, ...,
         -5.57377966e-09,  1.74165145e-08, -9.21673626e-10],
        [-3.90119767e+00,  3.69203644e+01,  5.80516696e-01, ...,
         -4.59340619e-08,  1.38346827e-08, -7.41839523e-09],
        [-8.61908150e+00,  9.20693398e+00, -5.34479570e+00, ...,
          3.64984878e-08,  9.65715952e-09, -2.53974863e-09],
        ...,
        [-3.16886597e+01, -5.97044373e+00,  1.69243374e+01, ...,
         -6.10136941e-10, -1.82935675e-08, -7.68141106e-09],
        [ 3.10927629e-01, -5.07368660e+00,  1.34723005e+01, ...,
         -3.12937765e-08, -2.53301575e-08,  4.92942975e-09],
        [-3.05528336e+01, -1.33207226e+01,  4.95425749e+00, ...,
         -7.98290933e-10,  3.03029637e-08, -1.03852434e-10]], dtype=float32),
 15562)

In [18]:
p1[0].shape

(33500, 23418)

In [19]:
p2 = perform_pca_test(X_test, 0.95, pca)

In [20]:
p2

(array([[-1.1788106e+00,  5.0437369e+00, -7.9076881e+00, ...,
          4.2135346e-07,  2.0758495e-07, -6.5928980e-08],
        [-3.9016786e+00,  3.6920269e+01,  5.8039653e-01, ...,
          3.5383820e-07, -1.1006994e-07,  1.3029312e-07],
        [-8.6191444e+00,  9.2070255e+00, -5.3449097e+00, ...,
         -2.9799833e-07,  2.5854354e-07, -1.2909206e-07],
        ...,
        [-3.1688713e+01, -5.9704709e+00,  1.6924349e+01, ...,
         -2.0928732e-07, -8.6052358e-07, -8.9369735e-07],
        [ 3.1092373e-01, -5.0736761e+00,  1.3472313e+01, ...,
         -7.6663508e-07,  1.1864896e-06,  3.7962597e-08],
        [-3.0552797e+01, -1.3320746e+01,  4.9542484e+00, ...,
         -9.8154749e-07,  4.2786175e-07,  2.4744017e-07]], dtype=float32),
 15562)