In [3]:
import scanpy as sc, scprep
from scipy.sparse import csr_matrix
import pandas as pd
import numpy as np

In [5]:
# Input directory: the downloaded GEO files are read from here.
path_ds = 'raw/'
# Output directory: the processed AnnData file is written here.
path_out = 'processed/'

In [2]:
!wget ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE86nnn/GSE86469/matrix/GSE86469_series_matrix.txt.gz
!wget ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE86nnn/GSE86469/suppl/GSE86469_GEO.islet.single.cell.processed.data.RSEM.raw.expected.counts.csv.gz

!mv GSE86469_series_matrix.txt.gz raw
!mv GSE86469_GEO.islet.single.cell.processed.data.RSEM.raw.expected.counts.csv.gz raw

--2024-03-05 21:27:35--  ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE86nnn/GSE86469/matrix/GSE86469_series_matrix.txt.gz
           => ‘GSE86469_series_matrix.txt.gz’
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.12, 130.14.250.11, 2607:f220:41e:250::12, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.12|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /geo/series/GSE86nnn/GSE86469/matrix ... done.
==> SIZE GSE86469_series_matrix.txt.gz ... 38497
==> PASV ... done.    ==> RETR GSE86469_series_matrix.txt.gz ... done.
Length: 38497 (38K) (unauthoritative)


2024-03-05 21:27:35 (503 KB/s) - ‘GSE86469_series_matrix.txt.gz’ saved [38497]

--2024-03-05 21:27:35--  ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE86nnn/GSE86469/suppl/GSE86469_GEO.islet.single.cell.processed.data.RSEM.raw.expected.counts.csv.gz
           => ‘GSE86469_GEO.islet.single.cell.processed.data.RSEM.r

In [43]:
# Read the GEO series matrix as a tab-separated table, skipping its first
# 38 lines and using the first remaining column as the row index.
obs = pd.read_table(
    path_ds + 'GSE86469_series_matrix.txt.gz',
    skiprows=38,
    index_col=0,
)

In [44]:
# Parse obs
# Flip the table so that each sample is a row, then keep three field labels.
# A label that occurs more than once in the table comes back as several
# columns (ten columns in total are renamed in the next step).
sample_fields = [
    '!Sample_geo_accession',
    '!Sample_organism_ch1',
    '!Sample_characteristics_ch1',
]
obs = obs.transpose()[sample_fields]

In [45]:
# Give the ten selected columns readable names; the two placeholders called
# 'DROP' mark columns that are discarded immediately afterwards.
obs.columns = [
    'geo_accession', 'DROP', 'cell_type_original', 'DROP',
    'sex', 'disease', 'age', 'ethnicity', 'BMI', 'donor',
]
obs = obs.drop(columns='DROP')

In [46]:
# Strip the "<field>: " prefix from every characteristic value.
value_prefixes = {
    'cell_type_original': 'cell type: ',
    'sex': 'Sex: ',
    'disease': 'disease: ',
    'age': 'age: ',
    'ethnicity': 'race: ',
    'BMI': 'bmi: ',
    'donor': 'islet unos id: ',
}
for column, prefix in value_prefixes.items():
    obs[column] = obs[column].str.replace(prefix, '')

# Harmonise the cleaned values.
obs['sex'] = obs['sex'].str.lower()
# Disease labels that are not keys of this mapping become NaN.
obs['disease'] = obs['disease'].map(
    {'Type 2 Diabetic': 'T2D', 'Non-Diabetic': 'control'}
)
# Age is kept as text with a ' y' suffix appended.
obs['age'] = obs['age'].apply(lambda x: str(x) + ' y')
# Lower-case and use underscores instead of spaces.
obs['ethnicity'] = obs['ethnicity'].str.lower().str.replace(' ', '_')

In [47]:
# Derive a harmonised cell-type label from the original annotation:
# lower-case everything, then rename two labels.
cell_type_renames = {'none/other': 'dropped', 'gamma/pp': 'gamma'}
obs['cell_type'] = obs['cell_type_original'].str.lower().replace(cell_type_renames)

# Show how many cells carry each label (missing values included).
obs['cell_type'].value_counts(dropna=False)

beta        264
alpha       239
ductal       28
delta        25
acinar       24
dropped      21
stellate     19
gamma        18
Name: cell_type, dtype: int64

In [48]:
# Add donor info
# NOTE(review): no cell visible in this notebook fetches this spreadsheet;
# presumably it has to be placed in the input directory by hand - confirm.
donor_df = pd.read_excel(
    path_ds + 'GSE86469_Supplemental_Table_S1.xlsx',
    skiprows=2,   # skip the two leading rows of the sheet
    index_col=1,  # second column becomes the index (later matched against obs['donor'])
)

  warn(msg)


In [49]:
# Parse donor_df
# Keep the three donor-level columns that are added to obs and give them
# short names. Note that 'ethnicity' already exists in obs, so the donor
# table's value replaces it when the columns are mapped onto obs.
# .copy() makes the selection an independent frame, so the column assignments
# below modify donor_df itself and do not trigger chained-assignment warnings.
donor_df = (
    donor_df[['Race', 'On Diabetes Medication?', 'HbA1c']]
    .copy()
    .rename(columns={'Race': 'ethnicity', 'On Diabetes Medication?': 'medication'})
)
# Expand the race codes; codes that are not keys of this mapping become NaN.
donor_df['ethnicity'] = donor_df['ethnicity'].map(
    {'W': 'white', 'AA': 'african_american', 'H': 'hispanic'}
)
donor_df['medication'] = donor_df['medication'].str.lower()

In [50]:
# Add donor info to obs: look up each cell's donor id in the donor table and
# copy over every donor-level column (donors missing from the table give NaN).
for donor_col, donor_values in donor_df.items():
    obs[donor_col] = obs['donor'].map(donor_values.to_dict())

In [51]:
# Load the comma-separated RSEM expected-count table with its first column as
# the index, then transpose it so that x.columns holds the original row labels.
x = pd.read_csv(
    path_ds + 'GSE86469_GEO.islet.single.cell.processed.data.RSEM.raw.expected.counts.csv.gz',
    index_col=0,
).T

In [52]:
# Assemble the AnnData object: sparse counts as X, the parsed sample metadata
# as obs, and an otherwise empty var table indexed by the columns of x.
# NOTE(review): csr_matrix(x) carries no row labels, so obs rows are presumably
# paired with X rows by position - confirm obs and x list cells in the same order.
counts_sparse = csr_matrix(x)
var_table = pd.DataFrame(index=x.columns)
adata = sc.AnnData(X=counts_sparse, obs=obs, var=var_table)

  adata=sc.AnnData(X=csr_matrix(x),obs=obs,


In [53]:
# Load the header-less, tab-separated conversion table; the first column
# becomes the index and the second column keeps the integer label 1.
# NOTE(review): no cell visible in this notebook creates this file - confirm its source.
genes = pd.read_table(
    path_ds + 'ensembl_convert.txt',
    header=None,
    index_col=0,
)

In [54]:
# Display the conversion table (index: first file column, column 1: second file column).
genes

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
ENSG00000229483,LINC00362
ENSG00000232849,LINC00363
ENSG00000229558,SACS-AS1
ENSG00000232977,LINC00327
ENSG00000227893,LINC00352
...,...
ENSG00000232746,LINC02022
ENSG00000150867,PIP4K2A
ENSG00000255021,AC093496.1
ENSG00000251576,LINC01267


In [55]:
# Edit var
# Keep the current gene identifiers in an 'EID' column before replacing them.
adata.var['EID'] = adata.var_names
# Look up column 1 of the conversion table for every column label of x and
# use the result as the new gene names.
# NOTE(review): nothing here checks that the looked-up names are unique or
# non-missing - confirm if downstream code relies on that.
gene_names = genes[1].loc[x.columns]
adata.var_names = gene_names

In [56]:
# Save original X: keep an independent copy of the matrix in the 'raw' layer
# before X is transformed.
adata.layers['raw']=adata.X.copy()

In [57]:
# Sqrt normalise
# Both calls act on adata directly (their return values are not used):
# first scale each cell's total counts with scanpy's default settings,
# then take the square root of X.
sc.pp.normalize_total(adata)
sc.pp.sqrt(adata)

In [61]:
# Keep only beta cells whose donor is labelled control or T2D.
has_known_disease = adata.obs['disease'].isin(['control', 'T2D'])
is_beta = adata.obs['cell_type'] == 'beta'
# Indexing an AnnData returns a view of the parent object. Take an explicit
# copy so the index assignment below edits an independent object instead of
# forcing an implicit view-to-copy conversion (with a warning).
adata_subset = adata[has_known_disease & is_beta].copy()
# Cast gene names to str (presumably so that any non-string names, e.g.
# missing symbols, do not break writing the file - confirm).
adata_subset.var.index = adata_subset.var.index.astype(str)

In [66]:
# Persist the beta-cell subset as an .h5ad file in the output directory.
output_file = path_out + 'GSE86469_adata.h5ad'
adata_subset.write(output_file)

In [67]:
# Sanity check: number of cells per (disease, cell_type) combination in the subset.
adata_subset.obs[['disease', 'cell_type']].value_counts()

disease  cell_type
control  beta         168
T2D      beta          96
dtype: int64