# Importing Libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import plotly.express as px
import seaborn as sns
import gc
import time
from sklearn.metrics import roc_auc_score
from mpl_toolkits.mplot3d import Axes3D

# Data Loading

In [3]:
start_time = time.time()
meta_data = pd.read_csv('/content/drive/MyDrive/Multimodal_Single_Cell analysis/metadata.csv', index_col=0)
end_time = time.time()
print(end_time - start_time)
meta_data.head()

0.9408843517303467


Unnamed: 0_level_0,day,donor,cell_type,technology
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c2150f55becb,2,27678,HSC,citeseq
65b7edf8a4da,2,27678,HSC,citeseq
c1b26cb1057b,2,27678,EryP,citeseq
917168fa6f83,2,27678,NeuP,citeseq
2b29feeca86d,2,27678,EryP,citeseq


In [4]:
meta_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 281528 entries, c2150f55becb to b847ba21f59f
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   day         281528 non-null  int64 
 1   donor       281528 non-null  int64 
 2   cell_type   281528 non-null  object
 3   technology  281528 non-null  object
dtypes: int64(2), object(2)
memory usage: 10.7+ MB


In [5]:
start_time = time.time()
train = pd.read_hdf('/content/drive/MyDrive/Multimodal_Single_Cell analysis/train_cite_inputs.h5', stop=49000)
end_time = time.time()
print(end_time - start_time)

25.050884246826172


In [6]:
train.info()


<class 'pandas.core.frame.DataFrame'>
Index: 49000 entries, 45006fe3e4c8 to fa6480f51b74
Columns: 22050 entries, ENSG00000121410_A1BG to ENSG00000074755_ZZEF1
dtypes: float32(22050)
memory usage: 4.0+ GB


In [7]:
train = pd.merge(meta_data, train, how='inner', on=['cell_id'])
train.head()

Unnamed: 0_level_0,day,donor,cell_type,technology,ENSG00000121410_A1BG,ENSG00000268895_A1BG-AS1,ENSG00000175899_A2M,ENSG00000245105_A2M-AS1,ENSG00000166535_A2ML1,ENSG00000128274_A4GALT,...,ENSG00000153975_ZUP1,ENSG00000086827_ZW10,ENSG00000174442_ZWILCH,ENSG00000122952_ZWINT,ENSG00000198205_ZXDA,ENSG00000198455_ZXDB,ENSG00000070476_ZXDC,ENSG00000162378_ZYG11B,ENSG00000159840_ZYX,ENSG00000074755_ZZEF1
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45006fe3e4c8,2,32606,HSC,citeseq,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.090185,0.0
d02759a80ba2,2,32606,HSC,citeseq,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.039545,0.0,0.0,0.0,0.0,0.0,0.0
c016c6b0efa5,2,32606,EryP,citeseq,0.0,0.0,0.0,0.0,0.0,3.847321,...,0.0,0.0,3.847321,4.529743,0.0,0.0,0.0,3.847321,3.847321,0.0
ba7f733a4f75,2,32606,NeuP,citeseq,0.0,0.0,0.0,0.0,0.0,0.0,...,3.436846,0.0,4.11378,5.020215,0.0,0.0,0.0,3.436846,4.11378,0.0
fbcf2443ffb2,2,32606,EryP,citeseq,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.196826,4.196826,4.196826,0.0,0.0,3.51861,4.196826,3.51861,0.0


In [8]:
columns_to_drop = ['day', 'donor', 'technology', 'cell_type']
X = train.drop(columns=columns_to_drop, axis=1)
X.head()

Unnamed: 0_level_0,ENSG00000121410_A1BG,ENSG00000268895_A1BG-AS1,ENSG00000175899_A2M,ENSG00000245105_A2M-AS1,ENSG00000166535_A2ML1,ENSG00000128274_A4GALT,ENSG00000094914_AAAS,ENSG00000081760_AACS,ENSG00000109576_AADAT,ENSG00000103591_AAGAB,...,ENSG00000153975_ZUP1,ENSG00000086827_ZW10,ENSG00000174442_ZWILCH,ENSG00000122952_ZWINT,ENSG00000198205_ZXDA,ENSG00000198455_ZXDB,ENSG00000070476_ZXDC,ENSG00000162378_ZYG11B,ENSG00000159840_ZYX,ENSG00000074755_ZZEF1
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45006fe3e4c8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.090185,0.0
d02759a80ba2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.039545,0.0,0.0,0.0,0.0,0.0,0.0
c016c6b0efa5,0.0,0.0,0.0,0.0,0.0,3.847321,0.0,3.847321,3.847321,0.0,...,0.0,0.0,3.847321,4.529743,0.0,0.0,0.0,3.847321,3.847321,0.0
ba7f733a4f75,0.0,0.0,0.0,0.0,0.0,0.0,3.436846,3.436846,0.0,0.0,...,3.436846,0.0,4.11378,5.020215,0.0,0.0,0.0,3.436846,4.11378,0.0
fbcf2443ffb2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.196826,0.0,0.0,...,0.0,4.196826,4.196826,4.196826,0.0,0.0,3.51861,4.196826,3.51861,0.0


In [9]:
y = pd.DataFrame()
y["cell_type"] = train["cell_type"]
y.head()

Unnamed: 0_level_0,cell_type
cell_id,Unnamed: 1_level_1
45006fe3e4c8,HSC
d02759a80ba2,HSC
c016c6b0efa5,EryP
ba7f733a4f75,NeuP
fbcf2443ffb2,EryP


# Data Preprocessing

In [10]:
def label_processing(y):
  le = LabelEncoder()
  y = le.fit_transform(y.iloc[:,0])
  y = pd.DataFrame(y)
  return y.values.ravel()

In [11]:
scaler = StandardScaler()
pca = PCA()
normalized_data = scaler.fit_transform(X)

In [12]:
def pca_retain(X, variance_retained, pca):
    pca_df = pca.fit_transform(normalized_data)
    cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
    num_components = np.argmax(cumulative_variance_ratio >= variance_retained) + 1
    return (pca_df, num_components)

In [13]:
def perform_pca_test(data, variance_retained, pca):
    normalized_data = scaler.transform(X)
    pca_df = pca.transform(normalized_data)
    cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
    num_components = np.argmax(cumulative_variance_ratio >= variance_retained) + 1
    return (pca_df, num_components)

In [14]:
X1 = X.copy()

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, shuffle=True, random_state=0)

In [16]:
p1 = pca_retain(X_train, 0.95, pca)

In [17]:
p1

(array([[ 1.6624281e+01, -1.9964724e+01, -5.8061595e+00, ...,
          3.9083545e-08, -1.6265403e-08, -7.2928472e-08],
        [ 2.0844492e+01, -2.0761698e+01, -7.0638394e+00, ...,
          2.1156371e-08, -6.4261684e-08,  3.6559360e-09],
        [ 3.0793233e+00,  1.4615282e-01,  1.7954473e+01, ...,
         -2.5720645e-08,  2.7647379e-08, -3.4389721e-08],
        ...,
        [ 3.7259876e+01, -1.7086647e+01, -4.9387240e+00, ...,
          3.9095614e-08, -5.9651484e-09, -2.5063818e-09],
        [-2.5999821e+01, -6.4424901e+00,  1.0782765e+01, ...,
         -2.7497091e-09,  6.4090557e-08, -3.5175745e-08],
        [-4.1816244e+00, -1.2184581e+01, -9.6341105e+00, ...,
         -2.6041256e-08,  3.1438475e-08,  2.0665443e-08]], dtype=float32),
 16777)

In [18]:
p2 = perform_pca_test(X_test, 0.95, pca)

In [19]:
p2

(array([[ 1.6624207e+01, -1.9964739e+01, -5.8061275e+00, ...,
          1.6851153e-07, -1.8046632e-06, -1.0098590e-06],
        [ 2.0844580e+01, -2.0761656e+01, -7.0637894e+00, ...,
         -3.6714380e-07, -7.6403342e-07,  1.6718519e-07],
        [ 3.0792797e+00,  1.4614485e-01,  1.7954491e+01, ...,
         -2.5670538e-06, -2.2559962e-06, -2.1176004e-06],
        ...,
        [ 3.7259899e+01, -1.7086658e+01, -4.9387259e+00, ...,
         -2.6527894e-06, -9.7992461e-07, -1.5029714e-06],
        [-2.5999823e+01, -6.4424906e+00,  1.0782777e+01, ...,
          1.0852625e-06,  2.6111325e-06, -7.3725687e-08],
        [-4.1816111e+00, -1.2184568e+01, -9.6341219e+00, ...,
          1.2977411e-06,  2.0456050e-06,  1.7293756e-06]], dtype=float32),
 16777)