In [23]:
import os

import pandas as pd
import scanpy as sc, scprep
from scipy.sparse import csr_matrix

In [78]:
# Input directory (raw GEO downloads) and output directory (processed AnnData).
# Both stay plain strings with a trailing slash because other cells build file
# paths by string concatenation (path_ds + filename, path_out + filename).
path_ds = 'raw/'
path_out = 'processed/'

# Create both directories up front so that reading from path_ds and writing the
# .h5ad into path_out do not depend on directories that happen to exist already.
os.makedirs(path_ds, exist_ok=True)
os.makedirs(path_out, exist_ok=True)

# GSE83139

In [None]:
# Download the normalised expression table and the two GEO series-matrix files,
# decompress them, and move the results into raw/.
# The raw/ directory is created first: `mv <file> raw` would otherwise rename
# the file to "raw" instead of moving it into a directory.
!mkdir -p raw

# The same NCBI paths are requested over https rather than ftp.
!wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE83nnn/GSE83139/suppl/GSE83139_tbx-v-f-norm-ntv-cpms.csv.gz
!wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE83nnn/GSE83139/matrix/GSE83139-GPL11154_series_matrix.txt.gz
!wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE83nnn/GSE83139/matrix/GSE83139-GPL16791_series_matrix.txt.gz

!gunzip GSE83139_tbx-v-f-norm-ntv-cpms.csv.gz
!gunzip GSE83139-GPL11154_series_matrix.txt.gz
!gunzip GSE83139-GPL16791_series_matrix.txt.gz

# The trailing slash makes mv fail loudly if raw/ is not a directory.
!mv GSE83139_tbx-v-f-norm-ntv-cpms.csv raw/
!mv GSE83139-GPL11154_series_matrix.txt raw/
!mv GSE83139-GPL16791_series_matrix.txt raw/

In [1]:
# Download only the normalised expression table, decompress it, and move it
# into raw/. The series-matrix metadata files read elsewhere in this notebook
# are not fetched by this cell.
# raw/ is created first: `mv <file> raw` would otherwise rename the file to
# "raw" when the directory does not exist yet.
! mkdir -p raw
! wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE83nnn/GSE83139/suppl/GSE83139_tbx-v-f-norm-ntv-cpms.csv.gz
! gunzip GSE83139_tbx-v-f-norm-ntv-cpms.csv.gz
! mv GSE83139_tbx-v-f-norm-ntv-cpms.csv raw/

--2024-03-05 14:26:05--  https://ftp.ncbi.nlm.nih.gov/geo/series/GSE83nnn/GSE83139/suppl/GSE83139_tbx-v-f-norm-ntv-cpms.csv.gz
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.11, 130.14.250.10, 2607:f220:41e:250::7, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.11|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 48224222 (46M) [application/x-gzip]
Saving to: ‘GSE83139_tbx-v-f-norm-ntv-cpms.csv.gz’


2024-03-05 14:26:07 (30.2 MB/s) - ‘GSE83139_tbx-v-f-norm-ntv-cpms.csv.gz’ saved [48224222/48224222]



In [61]:
# Load the normalised expression table. Despite the .csv extension the file is
# parsed as tab-delimited text.
x = pd.read_csv(path_ds + 'GSE83139_tbx-v-f-norm-ntv-cpms.csv', sep='\t')

In [62]:
# Index the table by gene and keep only the expression columns, i.e. those
# whose name contains 'reads.'; all other gene-annotation columns are dropped.
x = x.set_index('gene').filter(like='reads.', axis='columns')
print(x.shape)

(19950, 635)


In [63]:
# Sample metadata.
# GEO provides two series-matrix files for this series (GPL16791 and GPL11154),
# each containing part of the samples, so both are read and joined column-wise.
# skiprows=36 skips the leading lines of each file -- presumably the
# series-level header block; verify if the files ever change.
obs1, obs2 = (
    pd.read_table(path_ds + matrix_file, skiprows=36, index_col=0)
    for matrix_file in (
        'GSE83139-GPL16791_series_matrix.txt',
        'GSE83139-GPL11154_series_matrix.txt',
    )
)
obs = pd.concat([obs1, obs2], axis=1)
print(obs.shape)

(47, 635)


In [64]:
# Keep only the metadata rows that are used later on.
# Note: the label '!Sample_characteristics_ch1' matches several rows, so this
# selection returns more rows than the four labels listed here.
obs = obs.loc[[
    '!Sample_characteristics_ch1',
    '!Sample_geo_accession',
    '!Sample_source_name_ch1',
    '!Sample_organism_ch1',
]]

In [65]:
# Edit row names.
# The rows are labelled positionally; this assumes the '!Sample_characteristics_ch1'
# rows come in the order tissue, age group, disease, cell type -- TODO confirm
# against the series-matrix files.
obs.index = [
    'tissue', 'age_group', 'disease', 'cell_type',
    'geo_accession', 'organ', 'organism',
]
# The age-group row is not used, so it is discarded.
obs = obs.drop(index='age_group')

In [66]:
# Edit values.
# Entries in these rows are split on ': ' and only the part after the first
# separator is kept.
for row in ('tissue', 'disease', 'cell_type'):
    obs.loc[row, :] = obs.loc[row].map(lambda entry: entry.split(': ')[1])
# Every sample gets the same organism label.
obs.loc['organism'] = 'human'

In [67]:
# Add donor info: the donor is the first whitespace-separated token of each
# column name.
obs.loc['donor', :] = [
    col_name.split(maxsplit=1)[0] for col_name in obs.columns
]

In [68]:
# Rename some donors in obs: the two 'HP-15085-01T2D::...' labels are mapped to
# 'HP-15085' (fresh) and 'HP-15085: cultured' (8dcult).
obs.loc['donor'] = obs.loc['donor'].replace(
    to_replace={
        'HP-15085-01T2D::8dcult': 'HP-15085: cultured',
        'HP-15085-01T2D::fresh': 'HP-15085',
    }
)

In [69]:
# Rename obs columns to match x: keep the second whitespace-separated token of
# each column name (the first one is the donor) and prefix it with 'reads.'.
cell_ids = [c.split()[1] for c in obs.columns]

# Removing the donor part must still leave every column name unique; otherwise
# two samples would collapse onto the same name. The check is printed as before
# and now also enforced, so a collision stops the notebook here instead of
# surfacing later as a confusing reindexing problem.
cols_unique = len(set(cell_ids)) == obs.shape[1]
print('Cols unique across donors:', cols_unique)
assert cols_unique, 'Column names are not unique once the donor prefix is removed'

obs.columns = ['reads.' + cell_id for cell_id in cell_ids]

Cols unique across donors: True


In [70]:
# Rename cell types, keeping the un-renamed labels in 'cell_type_original'.
# The backup row is only created when it does not exist yet: re-running this
# cell would otherwise overwrite it with the already-renamed labels.
if 'cell_type_original' not in obs.index:
    obs.loc['cell_type_original', :] = obs.loc['cell_type', :]
obs.loc['cell_type', :] = obs.loc['cell_type', :].replace(
    {'duct': 'ductal', 'pp': 'gamma'})
# Display the resulting set of cell type labels.
obs.loc['cell_type', :].unique().tolist()

['dropped',
 'alpha',
 'ductal',
 'beta',
 'gamma',
 'delta',
 'acinar',
 'mesenchyme']

In [71]:
# Build the AnnData object with cells as rows and genes as columns.
# x holds genes in rows and cells in columns, so it is transposed for X; the
# metadata is transposed as well and reordered to follow the cell order of x.
adata = sc.AnnData(
    X=csr_matrix(x.T),
    obs=obs.T.reindex(x.columns),
    var=pd.DataFrame(index=x.index),
)

  adata=sc.AnnData(X=csr_matrix(x.T),obs=obs.T.reindex(x.T.index),


In [72]:
# Save original X: keep an independent copy of the matrix in a layer.
adata.layers['normalised_original'] = adata.X.copy()

In [73]:
# Sqrt normalize (to match our analysis).
# sc.pp.sqrt transforms adata.X in place, so running this cell twice would
# apply the square root twice. X is therefore first reset from the
# 'normalised_original' layer, which makes the cell safe to re-run.
adata.X = adata.layers['normalised_original'].copy()
sc.pp.sqrt(adata)

In [77]:
# Keep only beta cells from control and T2D donors.
# Indexing an AnnData returns a view of the parent object; .copy() turns it into
# an independent object so that the modification below and the later write do
# not operate on (and implicitly convert) a view.
adata_subset = adata[
    adata.obs['disease'].isin(['control', 'T2D'])
    & (adata.obs['cell_type'] == 'beta')
].copy()
# Make sure the variable (gene) index consists of strings.
adata_subset.var.index = adata_subset.var.index.astype(str)

In [79]:
# Write the beta-cell subset to an .h5ad file in the output directory.
adata_subset.write_h5ad(path_out + 'GSE83139_adata.h5ad')

  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c


In [85]:
# Number of cells per (disease, cell_type) combination in the saved subset.
adata_subset.obs.value_counts(subset=['disease', 'cell_type'])

disease  cell_type
control  beta         67
T2D      beta         38
dtype: int64