In [1]:
# Import packages
import numpy as np
import pandas as pd
import scanpy as sc
import os as os
import gc
from sklearn.preprocessing import MinMaxScaler

# Generating tensors for test

In [75]:
# Partition the current working directory listing: entries whose name contains
# '_bulk.csv' are kept in `file_list`; every other entry goes to `non_samples`.
file_list = os.listdir()
non_samples = [name for name in file_list if '_bulk.csv' not in name]
file_list = [name for name in file_list if '_bulk.csv' in name]

In [78]:
# Load the annotated aggregate dataset (.h5ad) with scanpy.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere unless the file exists at this exact location.
aggregate = sc.read('/home/aimmunelab/lab_members/Harris/COVID_Project/Datasets_selected/aggregate_annotated.h5ad')

In [79]:
# Inspect the per-cell `batch` label (a categorical column of `aggregate.obs`).
aggregate.obs['batch']

AAACCTGAGCTAGGCA-1-0-0-0    Healthy 0
AAACCTGAGCTCCTTC-1-0-0-0    Healthy 0
AAACCTGAGGGCATGT-1-0-0-0    Healthy 0
AAACCTGAGGGTCGAT-1-0-0-0    Healthy 0
AAACCTGAGTACGACG-1-0-0-0    Healthy 0
                              ...    
GAATCCGCCCTG-6-2-2          Severe 13
CAGCGTGTGGCA-6-2-2          Severe 13
GCAGCTTCCAGA-6-2-2          Severe 13
TCGCAGCGCTCA-6-2-2          Severe 13
GCATTGGATTTG-6-2-2          Severe 13
Name: batch, Length: 259339, dtype: category
Categories (45, object): ['Healthy 0', 'Healthy 1', 'Healthy 2', 'Healthy 3', ..., 'Severe 9', 'Severe 10', 'Severe 11', 'Severe 13']

In [80]:
# Cluster labels whose proportions are reported, in output order. They are
# compared against `adata.obs['leiden']` inside CalculateProportion, which
# also captures this list as its default `clusters` argument.
clusters = ['CD4+ T cells', 'Monocytes', 'NK cells', 'B cells', 'CD8+ T cells','Platelets', 'DC cells']

In [82]:
def CalculateProportion(adata,clusters=clusters):
    """Return the fraction of cells in `adata` that carry each cluster label.

    Parameters
    ----------
    adata
        Object supporting ``len()`` (number of cells) and exposing an ``obs``
        table with a ``'leiden'`` column holding one cluster label per cell.
    clusters
        Sequence of cluster labels to report, in output order. The default is
        the module-level ``clusters`` list as it was bound when this function
        was defined; re-run this cell after changing that list.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``clusters`` with columns ``'Cluster name'`` and
        ``'Proportion'`` (matching cells divided by ``len(adata)``). Labels
        that do not occur get a proportion of 0.0.

    Raises
    ------
    ZeroDivisionError
        If ``adata`` contains no cells and ``clusters`` is non-empty.
    """
    n_cells = len(adata)
    labels = adata.obs['leiden']
    # Count matches directly on the label column instead of slicing the whole
    # object once per cluster, and build the array in one go rather than
    # re-copying it with np.append on every iteration. The count is converted
    # to a Python int so an empty `adata` still raises ZeroDivisionError
    # instead of silently producing NaN via NumPy division.
    proportion_ls = np.array(
        [int((labels == name).sum()) / n_cells for name in clusters],
        dtype=float,
    )
    proportion_df = pd.DataFrame({'Cluster name':clusters,'Proportion':proportion_ls})
    return proportion_df

In [83]:
# For every batch (sample): read its bulk counts, log-transform and min-max
# scale them into a feature vector, and compute the per-cluster cell
# proportions of that batch as the target vector.
Xs_physical, ys_physical = [], []
scaler = MinMaxScaler()
for batch in aggregate.obs['batch'].cat.categories:
    sample = pd.read_csv(batch+'_bulk.csv', index_col='Unnamed: 0')

    # log(count + 1) as a single column, because the scaler expects 2-D input.
    log_counts = np.log(sample['Counts'].values+1).reshape(-1,1)
    # The scaler is re-fitted on each sample, so scaling is per sample.
    Xs_physical.append(scaler.fit_transform(log_counts).flatten())

    batch_cells = aggregate[aggregate.obs['batch']==batch]
    ys_physical.append(CalculateProportion(batch_cells)['Proportion'].values)

In [84]:
# Persist features and targets to the working directory. np.save appends the
# '.npy' extension, which is why the next cell loads 'Xs_physical.npy' and
# 'ys_physical.npy'.
# NOTE(review): both are Python lists of arrays; this presumably relies on all
# samples having the same length so they stack into a 2-D array - confirm.
np.save('Xs_physical', Xs_physical )
np.save('ys_physical', ys_physical)

In [85]:
# Reload the arrays written by the previous cell.
Xs = np.load('Xs_physical.npy')
ys = np.load('ys_physical.npy')

In [None]:
# List the batch categories; the loops in this notebook iterate over these.
aggregate.obs['batch'].cat.categories

# Creating CIBERSORTx input file

In [86]:
# Start the CIBERSORTx input table with one row per gene in `aggregate`;
# one column per batch is added in the next cell.
csinput = pd.DataFrame({'Gene name':aggregate.var_names})

In [87]:
# Add one column per batch holding that sample's bulk counts.
# NOTE(review): assigning a Series aligns on index labels, not on position.
# This assumes the 'Unnamed: 0' index of each *_bulk.csv matches csinput's
# default 0..n-1 index and that rows are in `aggregate.var_names` order;
# otherwise values would be misaligned or NaN - confirm against the CSV files.
for batch in aggregate.obs['batch'].cat.categories:
    sample = pd.read_csv(batch+'_bulk.csv', index_col='Unnamed: 0')
    csinput[batch] = sample['Counts']

In [88]:
# Write the table as comma-separated text. The row index is written too,
# since `index` is left at its default.
csinput.to_csv('csinput.csv')

In [89]:
# Preview the table with repeated gene names removed (first occurrence kept).
# NOTE(review): the result is only displayed, not assigned, so `csinput` and
# the files written from it still contain any duplicate gene names.
csinput.drop_duplicates('Gene name')

Unnamed: 0,Gene name,Healthy 0,Healthy 1,Healthy 2,Healthy 3,Healthy 4,Healthy 5,Healthy 6,Healthy 7,Healthy 8,...,Severe 2,Severe 3,Severe 4,Severe 5,Severe 6,Severe 8,Severe 9,Severe 10,Severe 11,Severe 13
0,B9D2,870.956900,479.212280,375.919460,322.339020,1060.115600,830.179400,1391.936600,1402.102200,1351.404900,...,942.786560,1245.169400,1538.445400,2589.838100,1305.933300,145.248050,365.226140,560.518600,818.448360,295.262400
1,JAG2,26.732224,38.045940,57.152786,26.860876,115.020740,6.994103,21.340445,16.096037,71.978430,...,13.075047,6.079997,1.584786,9.521463,129.846730,167.842930,125.011310,210.621570,77.485740,24.086374
2,DNASE1,547.457950,214.269120,172.736450,169.100000,728.115000,989.992100,307.095950,752.451000,819.551100,...,846.918700,427.077100,524.610900,1160.954300,1849.774300,501.463320,1044.607800,1569.110600,3163.216300,726.067440
3,TRMU,1745.638100,754.944950,711.156250,749.324700,2067.019300,1594.928100,936.032200,1638.733800,2144.557900,...,1412.561600,491.825530,294.459350,931.492900,1416.563100,654.631400,1055.183700,1369.009300,1816.007000,357.593380
4,RRP9,1336.932300,395.145300,355.701750,394.111450,1175.714200,1135.620000,588.719060,1262.295300,1314.043800,...,1274.152000,374.227600,257.379580,424.106380,591.926700,213.004790,303.131400,465.784600,598.302730,100.944180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17201,FOXF1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,6.053269,55.147060,106.586530,53.347794,12.770911,3.667851
17202,AC092316.1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,13.286356,32.467533,0.000000,6.407103,12.072504,3.956207
17203,FOXP1-AS1,0.000000,4.932404,2.994012,0.000000,7.383367,1.215362,20.663567,1.344448,37.671436,...,5.707495,0.000000,0.000000,20.079092,45.861570,52.016838,77.099270,52.781906,67.932620,13.598816
17204,ANKRD61,1.126380,0.000000,0.000000,0.000000,0.000000,0.000000,2.496256,0.000000,0.000000,...,0.000000,0.000000,0.000000,2.981515,48.745117,0.000000,54.600574,6.884032,35.784786,1.397819


In [49]:
# Spot-check the gene names in rows 280 through 290.
csinput['Gene name'][280:291]

280     NCOR1
281     RAB31
282     PSMC2
283      RPS6
284      CLUH
285     CCND2
286     RPL14
287    RPS27L
288     GAPDH
289     RPS20
290     SEPT8
Name: Gene name, dtype: object

In [90]:
# Write the tab-separated version without the row index, so 'Gene name' is
# the first column of the file.
csinput.to_csv('csinput.txt', sep='\t',index=None)

In [93]:
# Read the file back and show its header to verify the column layout.
pd.read_table('csinput.txt').columns

Index(['Gene name', 'Healthy 0', 'Healthy 1', 'Healthy 2', 'Healthy 3',
       'Healthy 4', 'Healthy 5', 'Healthy 6', 'Healthy 7', 'Healthy 8',
       'Healthy 9', 'Healthy 10', 'Healthy 11', 'Healthy 12', 'Healthy 14',
       'Healthy 15', 'Healthy 17', 'Healthy 18', 'Healthy 20', 'Healthy 21',
       'Healthy 23', 'Healthy 24', 'Healthy 25', 'Healthy 26', 'Moderate 0',
       'Moderate 1', 'Moderate 2', 'Moderate 3', 'Moderate 4', 'Moderate 5',
       'Moderate 8', 'Moderate 11', 'Moderate 13', 'Moderate 14', 'Severe 0',
       'Severe 1', 'Severe 2', 'Severe 3', 'Severe 4', 'Severe 5', 'Severe 6',
       'Severe 8', 'Severe 9', 'Severe 10', 'Severe 11', 'Severe 13'],
      dtype='object')

# Design pairwise (one-vs-one) comparisons for DE analysis

In [94]:
# Column selections for the three pairwise comparisons. For each column of
# csinput (in order) the two condition tags are tested in turn, and the
# column is taken once for every tag its name contains.
hm = [col for col in csinput.columns for tag in ('Healthy', 'Moderate') if tag in col]
hs = [col for col in csinput.columns for tag in ('Healthy', 'Severe') if tag in col]
ms = [col for col in csinput.columns for tag in ('Moderate', 'Severe') if tag in col]

# Each design table is the gene-name column followed by the selected samples.
hm_design, hs_design, ms_design = (
    csinput[['Gene name'] + picked] for picked in (hm, hs, ms)
)

In [95]:
# Save the three pairwise design tables as CSV in the working directory.
# The row index is written as well, since `index` is left at its default.
hm_design.to_csv('hm_design.csv')
hs_design.to_csv('hs_design.csv')
ms_design.to_csv('ms_design.csv')