In [144]:
import pandas as pd
import scanpy as sc, scprep
import numpy as np
from scipy.sparse import csr_matrix

In [4]:
# Directories used throughout the notebook: downloaded inputs and written outputs
path_ds, path_out = 'raw/', 'processed/'

In [1]:
# Download the GSE154126 files from the NCBI GEO FTP server into the current
# working directory: two series-matrix files (one per platform, GPL11154 and
# GPL16791) and two supplementary expression tables (cpms and reads).
! wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE154nnn/GSE154126/matrix/GSE154126-GPL11154_series_matrix.txt.gz
! wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE154nnn/GSE154126/matrix/GSE154126-GPL16791_series_matrix.txt.gz
! wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE154nnn/GSE154126/suppl/GSE154126_tbx-v-m-b-norm-ntv-cpms.aug.tsv.gz
! wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE154nnn/GSE154126/suppl/GSE154126_tbx-v-m-b-norm-ntv-reads.aug.tsv.gz

--2024-03-05 18:25:36--  https://ftp.ncbi.nlm.nih.gov/geo/series/GSE154nnn/GSE154126/matrix/GSE154126-GPL11154_series_matrix.txt.gz
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.13, 130.14.250.12, 2607:f220:41e:250::10, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.13|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9284 (9.1K) [application/x-gzip]
Saving to: ‘GSE154126-GPL11154_series_matrix.txt.gz’


2024-03-05 18:25:36 (99.3 MB/s) - ‘GSE154126-GPL11154_series_matrix.txt.gz’ saved [9284/9284]

--2024-03-05 18:25:37--  https://ftp.ncbi.nlm.nih.gov/geo/series/GSE154nnn/GSE154126/matrix/GSE154126-GPL16791_series_matrix.txt.gz
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.13, 130.14.250.12, 2607:f220:41e:250::10, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.13|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 31874 (31K) [application/x-gzip]


In [2]:
! mv GSE154126-GPL11154_series_matrix.txt.gz raw
! mv GSE154126-GPL16791_series_matrix.txt.gz raw
! mv GSE154126_tbx-v-m-b-norm-ntv-cpms.aug.tsv.gz raw
! mv GSE154126_tbx-v-m-b-norm-ntv-reads.aug.tsv.gz raw

In [6]:
# Load the two tab-separated expression tables (cpms -> x_norm, reads -> x).
# The first 6 lines of each file are skipped here; the same files are read again
# with nrows=6 further down to pick up the per-cell metadata rows.
x_norm = pd.read_csv(
    path_ds + 'GSE154126_tbx-v-m-b-norm-ntv-cpms.aug.tsv.gz',
    sep='\t', index_col=0, skiprows=6,
)
x = pd.read_csv(
    path_ds + 'GSE154126_tbx-v-m-b-norm-ntv-reads.aug.tsv.gz',
    sep='\t', index_col=0, skiprows=6,
)

In [7]:
# Strip the 'cell.' prefix from the column labels of the reads table
# (x_norm keeps its original labels; the prefix is removed on the fly where needed)
x = x.rename(columns=lambda label: label.replace('cell.', ''))
# Keep only rows whose gene name (the index) is not missing
x = x.loc[x.index.notna()]
x_norm = x_norm.loc[x_norm.index.notna()]

In [102]:
# The expression files also carry per-cell metadata in their first rows:
# read the first 6 data rows of each file and confirm both copies agree.
obs_x1 = pd.read_csv(
    path_ds + 'GSE154126_tbx-v-m-b-norm-ntv-cpms.aug.tsv.gz',
    sep='\t', index_col=0, nrows=6,
)
obs_x2 = pd.read_csv(
    path_ds + 'GSE154126_tbx-v-m-b-norm-ntv-reads.aug.tsv.gz',
    sep='\t', index_col=0, nrows=6,
)
print('Both obs datasets in X files are the same:', (obs_x1 == obs_x2).all().all())

Both obs datasets in X files are the same: True


In [103]:
# Sample metadata from the GPL11154 series matrix; the first 37 lines are skipped
# (presumably the series-level header block - TODO confirm against the file).
obs1 = pd.read_csv(
    path_ds + 'GSE154126-GPL11154_series_matrix.txt.gz',
    sep='\t', skiprows=37, index_col=0,
)
# Drop by label: removes every row that shares the label of the fifth row from the end
obs1 = obs1.drop(index=obs1.index[[-5]])

In [104]:
# Sample metadata from the GPL16791 series matrix; the first 37 lines are skipped
# (presumably the series-level header block - TODO confirm against the file).
obs2 = pd.read_csv(
    path_ds + 'GSE154126-GPL16791_series_matrix.txt.gz',
    sep='\t', skiprows=37, index_col=0,
)
# Drop by label: removes every row that shares the label of the fifth row from the end
obs2 = obs2.drop(index=obs2.index[[-5]])

In [105]:
# Put the samples of both platforms side by side (samples are columns at this point)
obs = pd.concat([obs1, obs2], axis='columns')
print(obs.shape)

(44, 1263)


In [106]:
# Column labels carry extra donor info after a ':'; keep only the part before it
# so that the labels can be matched against the expression-table cell names.
obs.columns = obs.columns.str.split(':').str[0]
# Keep just the GEO accession row, then flip so that cells become rows
obs = obs.loc[['!Sample_geo_accession']].T
obs.columns = ['geo_accession']

In [107]:
# Concat both obs datasets
# Format col names
# Keep the current column labels as an extra 'source_id' row before they are replaced
obs_x1.loc['source_id',:]=obs_x1.columns
# Relabel the columns with the cell ids from the 'gene|cell_id' row, minus the 'cell.' prefix
obs_x1.columns=obs_x1.loc['gene|cell_id',:].str.replace('cell.','',regex=False)
# Transpose so that cells are rows.
# NOTE(review): rename() without axis= targets the index (cell ids here), so this
# mapping looks like a no-op; the 'source_id' column is renamed to 'donor' in the next cell.
obs_x1=obs_x1.T.rename({'source_id':'donor'})
# Column-wise join of the GEO accession table with the metadata from the expression file
obs=pd.concat([obs,obs_x1],axis=1)

In [108]:
# Parse obs
# Keep only the per-cell columns (the other ones are taken from the donor table)
# and give them their final names.
obs = (
    obs[['geo_accession', 'condition_health_status', 'CellType', 'source_id']]
    .rename(columns={
        'condition_health_status': 'disease',
        'CellType': 'cell_type_original',
        'source_id': 'donor',
    })
)
# Harmonise the spelling of the control label
obs['disease'] = obs['disease'].replace({'Control': 'control'})

In [109]:
# Parse donors
# The donor id is the part of the source id before the first '.'.
# The vectorised .str accessor is used instead of apply(lambda ...) so that a
# missing source id stays NaN rather than raising AttributeError on a float.
obs['donor']=obs['donor'].str.split('.').str[0]

In [134]:
# Donor-level metadata from the Excel supplement, indexed by its first column;
# the last row of the sheet is discarded.
donor_df = pd.read_excel(path_ds + '1-s2.0-S2212877820301319-mmc1.xlsx', index_col=0).iloc[:-1]

In [135]:
# Parse donor df
# Age group is derived from Condition, with T2D donors relabelled as 'Adult'
donor_df['age_group'] = donor_df['Condition'].str.replace('T2D', 'Adult', regex=False)
donor_df.drop('Condition',axis=1,inplace=True) # Drop as better in other table
# 'Condition' has just been dropped, so it is not part of the rename mapping
donor_df.rename(
    {'Age':'age','Gender':'sex','Ethnicity':'ethnicity','BMI':'BMI'},
    axis=1,inplace=True)
# Insert a space between the numeric part and the one-letter unit (e.g. '18d' -> '18 d')
donor_df['age']=donor_df['age'].apply(lambda x: x[:-1]+' '+x[-1])
# regex=False: '.' must be replaced literally. Relying on the default `regex`
# emits a FutureWarning, and as a regular expression '.' would match every character.
# NOTE(review): values are lower-cased before the final replace, so an upper-case
# 'NA' cannot match here; the non-breaking-space NA variants are replaced in a later cell.
donor_df['ethnicity']=donor_df['ethnicity'
                              ].str.lower().str.replace('.','_',regex=False).replace({'NA':np.nan})
# Display only: the result of this replace is not assigned back to donor_df
donor_df.replace('NA',np.nan)

  donor_df['ethnicity']=donor_df['ethnicity'


Unnamed: 0_level_0,age,sex,ethnicity,BMI,age_group
Donor ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ICRH85,18 d,female,hispanic,19.5,Newborn
ACJW006,10 m,male,african_american,19.5,Toddler
HPAP011,3 y,male,african_american,17.6,Toddler
ICRH76,2 y,male,european_american,13.6,Toddler *
ICRH80,19 m,female,european_american,18.0,Toddler *
ICRH97,4 y,male,european_american,15.1,Toddler
ADEY348,11 y,male,european_american,17.8,Adolescent
ADFY260,12 y,female,african_american,23.26,Adolescent
AAJF122,52 y,male,asian_american,29.1,Adult *
ABAF490,39 y,female,european_american,45.2,Adult *


In [136]:
# Missing values are encoded as 'NA'/'na' preceded by a non-breaking space; turn them into NaN
donor_df = donor_df.replace(['\xa0NA', '\xa0na'], np.nan)

In [137]:
# Add donor to obs
# Every donor-level column is looked up per cell through the cell's donor id
for donor_col, donor_values in donor_df.items():
    obs[donor_col] = obs['donor'].map(donor_values.to_dict())

In [138]:
# Harmonised cell type labels; labels that are not listed are kept unchanged
obs['cell_type'] = obs['cell_type_original'].replace(
    {'duct': 'ductal', 'pp': 'gamma', 'masked': 'dropped'}
)
obs['cell_type'].value_counts(dropna=False)

dropped       657
alpha         234
beta          182
ductal        100
mesenchyme     44
acinar         30
gamma          19
delta          10
Name: cell_type, dtype: int64

In [140]:
# Check that x and x_norm match
# Both the gene index and the cell columns must agree (x_norm still carries the
# 'cell.' prefix on its columns). The whole conjunction is negated: the previous
# `not A and B` parsed as `(not A) and B`, so a column mismatch alone was never reported.
# Index.equals also copes with differing lengths instead of failing on the comparison.
if not (
    x.index.equals(x_norm.index)
    and x.columns.equals(pd.Index([c.replace('cell.','') for c in x_norm.columns]))
):
    raise ValueError('x and x_norm not matching')

In [145]:
# Assemble the AnnData object: cells as rows, genes as columns.
# X holds the reads table; the cpms table is stored as the 'normalised_original' layer.
# obs is reindexed to the cell order of x, so cells without metadata get NaN rows.
adata = sc.AnnData(
    X=csr_matrix(x.T),
    obs=obs.reindex(x.columns),
    var=pd.DataFrame(index=x.index),
    layers={'normalised_original': csr_matrix(x_norm.T)},
)

  adata=sc.AnnData(X=csr_matrix(x.T),obs=obs.reindex(x.T.index),


In [146]:
# Save original X
# Stored as a copy so the 'raw' layer is not affected when X is normalised in place below
adata.layers['raw']=adata.X.copy()

In [147]:
# Sqrt normalise
# Both calls modify adata.X in place; the unnormalised matrix stays available in layers['raw'].
# NOTE(review): no target_sum is passed to normalize_total, so scanpy's default
# scaling applies - confirm this is the intended library-size target.
sc.pp.normalize_total(adata)
sc.pp.sqrt(adata)

In [150]:
# Keep only beta cells from control and T2D donors.
# Slicing an AnnData returns a view; .copy() makes the subset an independent object
# so that writing it does not have to implicitly turn the view into a real AnnData
# (anndata warns when that happens).
adata_subset = adata[
    adata.obs['disease'].isin(['control','T2D']) & (adata.obs['cell_type'] == 'beta')
].copy()

In [152]:
adata_subset.write('processed/GSE154126_adata.h5ad')

  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
