In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:

# Load the npy file
embeddings = np.load('Merged_data_01B-resolution_singlecell_cell_embedding_f1_resolution_no_read_depth_enh.npy')

# Access the data
embeddings.shape # (n_cells, n_features)

(25066, 3072)

In [52]:

cell_metadata = pd.read_csv('../patient_metadata.csv')
cell_metadata.head()

Unnamed: 0,cell_id,nCount_RNA,nFeature_RNA,patient_id,percent_mito,Response_3m,sample_source
0,Good_Li_2023ac01_AAACGGGAGATGTGTA-1,652,433,ac01,3.93586,1,Deng
1,Good_Li_2023ac01_AAAGATGCAGCCTTGG-1,18106,3790,ac01,2.465065,1,Deng
2,Good_Li_2023ac01_AAAGTAGTCATGGTCA-1,1145,688,ac01,3.858785,1,Deng
3,Good_Li_2023ac01_AAATGCCAGTACCGGA-1,3430,1413,ac01,2.677339,1,Deng
4,Good_Li_2023ac01_AAATGCCGTTCAGACT-1,924,554,ac01,1.731161,1,Deng


In [53]:
embeddings = pd.DataFrame(embeddings, index=cell_metadata.index)


In [56]:
cell_metadata.Response_3m.value_counts() # 0: 'NR', 1: 'OR'


Response_3m
1    12220
0    11906
Name: count, dtype: int64

In [55]:
# remove Sheih dataset from the metadata
cell_metadata = cell_metadata[cell_metadata['sample_source'] != 'Sheih']
embeddings = embeddings.iloc[cell_metadata.index]

In [57]:
# method 1: randomly spliting
def train_valid_split_by_patients(label_df, seed_num, split_ratio):
    patient_list = label_df["patient_id"].unique()
    np.random.seed(seed_num)
    reference_patients = np.random.choice(patient_list, size=int(len(patient_list) * split_ratio), replace=False)
    train_data = label_df[label_df['patient_id'].isin(reference_patients)]   
    valid_data = label_df[~label_df['patient_id'].isin(reference_patients)]  
    return train_data, valid_data

In [58]:
def DataPreparation(embeddings, cell_metadata, split_ratio=0.8, seed_num=42, label_encoder=None):
    le = LabelEncoder()
    train_data, valid_data = train_valid_split_by_patients(cell_metadata, split_ratio=split_ratio, seed_num=seed_num)
    train_patients = train_data.patient_id
    valid_patients = valid_data.patient_id
    X_df = pd.DataFrame(embeddings, index=cell_metadata.index) # create a df for embeddings and add index
    X_train = X_df.iloc[train_data.index] # subset the embedding according to the train and valid data
    X_valid = X_df.iloc[valid_data.index] 

    y_train = le.fit_transform(train_data.Response_3m)
    y_valid = le.fit_transform(valid_data.Response_3m)

    return X_train, X_valid, y_train, y_valid, train_patients, valid_patients

In [59]:
X_train, X_valid, y_train, y_valid, train_patients, test_patients = DataPreparation(embeddings, cell_metadata, split_ratio=0.8, seed_num=6)

In [60]:
test_patients.unique()

array(['ac02', 'ac03', 'ac07', 'ac10', 'ac12', 'ac17', 'ac27', 'ac28',
       'ac34', 'ac38', '11', '16', '22', '25', '26', '27'], dtype=object)

In [61]:
train_patients.unique()

array(['ac01', 'ac04', 'ac05', 'ac08', 'ac09', 'ac11', 'ac13', 'ac14',
       'ac15', 'ac16', 'ac18', 'ac19', 'ac20', 'ac21', 'ac22', 'ac23',
       'ac24', 'ac25', 'ac26', 'ac29', 'ac30', 'ac32', 'ac33', 'ac37',
       'ac39', 'Pt010', 'Pt011', 'Pt015', 'Pt016', 'Pt025', 'Pt031',
       'Pt237', 'Pt245', 'Pt253', 'Pt263', 'Pt276', 'Pt282', 'Pt375',
       '01', '02', '03', '04', '05', '07', '08', '09', '10', '12', '13',
       '14', '15', '17', '18', '19', '20', '21', '23', '24', '28', '29',
       '30', '31', '32'], dtype=object)

In [7]:
# encode the labels
# y = cell_metadata.Response_3m
# le = LabelEncoder()
# y_encoded = le.fit_transform(y)

In [62]:
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)

In [63]:
# evaluate the model split by patients
y_pred = classifier.predict(X_valid)
print(f"Accuracy score: {accuracy_score(y_valid, y_pred)}")
print(classification_report(y_valid, y_pred))

Accuracy score: 0.41608040201005025
              precision    recall  f1-score   support

           0       0.37      0.38      0.38      2285
           1       0.46      0.44      0.45      2690

    accuracy                           0.42      4975
   macro avg       0.41      0.41      0.41      4975
weighted avg       0.42      0.42      0.42      4975



In [10]:
# evaluate the model split by cell
y_pred = classifier.predict(X_test)
print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

Accuracy score: 0.7905863581970483
              precision    recall  f1-score   support

           0       0.79      0.76      0.78      2400
           1       0.79      0.82      0.80      2614

    accuracy                           0.79      5014
   macro avg       0.79      0.79      0.79      5014
weighted avg       0.79      0.79      0.79      5014



In [18]:
# evaluate the model

y_pred = classifier.predict(X_test)
# confusion matrix
print(confusion_matrix(y_test, y_pred))

[[1822  578]
 [ 472 2142]]
