# Processing of Kim et al dataset
Pre-processing of treatment-naive epithelial tumor cells from [Kim et al 2020 Nature Communications].

In [None]:
import os, sys, gc, scanpy
import numpy as np
import pandas as pd
import seaborn as sns
from anndata import AnnData
import matplotlib.pyplot as plt
%config IPCompleter.use_jedi = False

# Folder receiving every figure saved by this notebook. It is created up
# front so that the plt.savefig calls do not fail on a fresh checkout.
figure_folder = './figures/Kim_et_al_2020'
os.makedirs(figure_folder, exist_ok=True)

## Data loading

In [None]:
# Location of the raw GEO download (GSE131907)
data_folder = "../data/Kim/raw/"
# Excel sheet with one row per sample
summary_file = "GSE131907_Lung_Cancer_Feature_Summary.xlsx"
# Tab-separated table with one row per cell
annot_file = "GSE131907_Lung_Cancer_cell_annotation.txt"

In [None]:
# Sample summary sheet: the column names are read from the third row of the
# sheet (header=2) and the last row of the table is discarded.
summary_path = '%s/%s'%(data_folder, summary_file)
sample_summary_df = pd.read_excel(summary_path, header=2, index_col=0).iloc[:-1]

# Cell annotation table (tab-separated).
annot_path = '%s/%s'%(data_folder, annot_file)
annot_df = pd.read_csv(annot_path, sep='\t')

## Cell types

In [None]:
# Number of annotated cells per cell type, sorted from rarest to most frequent.
cell_type_counts = annot_df.groupby('Cell_type').count().sort_values('Index', ascending=True)

cell_type_counts.reset_index().plot.bar(x='Cell_type', y='Index', figsize=(6,4))
plt.title('Number of cell per cell type', fontsize=20, color='black')
plt.xlabel('')
plt.ylabel('')
plt.xticks(fontsize=15, color='black', rotation=90)
plt.yticks(fontsize=15, color='black')
plt.legend([])
plt.tight_layout()
plt.savefig('%s/cell_type_decomposition.png'%(figure_folder), dpi=300)

# The ascending-count ordering of the cell types is reused by a later plot.
cell_type_order = cell_type_counts.index.to_numpy().astype(str)
del cell_type_counts

## Sample_Origin

In [None]:
# Number of annotated cells per sample origin, sorted from rarest to most frequent.
origin_counts = annot_df.groupby('Sample_Origin').count().sort_values('Index', ascending=True)

origin_counts.reset_index().plot.bar(x='Sample_Origin', y='Index', figsize=(6,4))
plt.title('Number of cell per origin', fontsize=20, color='black')
plt.xlabel('')
plt.ylabel('')
plt.xticks(fontsize=15, color='black', rotation=90)
plt.yticks(fontsize=15, color='black')
plt.legend([])
plt.tight_layout()
plt.savefig('%s/sample_origin_decomposition.png'%(figure_folder), dpi=300)

del origin_counts

## General statistics

In [None]:
# Headline numbers for the annotation table (one row per cell).
n_cells = annot_df.shape[0]
n_samples = np.unique(annot_df['Sample']).shape[0]
n_epithelial = annot_df[annot_df['Cell_type'] == 'Epithelial cells'].shape[0]

print('%s distinct cells'%(n_cells))
# The 'Sample' column is what gets counted, so the message reports samples.
# NOTE(review): a patient may contribute several samples - confirm against
# the sample summary sheet if a patient count is needed.
print('%s different samples'%(n_samples))
print('%s epithelial cells'%(n_epithelial))

## Origin and cell type

In [None]:
# Cell counts for every (cell type, sample origin) pair, in long format.
pair_counts = (
    annot_df
    .groupby(['Cell_type', 'Sample_Origin'])
    .count()
    .sort_values('Index', ascending=True)
    .reset_index()
)

# Wide format: one row per cell type, one column per origin. Each pair occurs
# once in pair_counts, so the pivot simply carries the 'Barcode' count over.
origin_by_type = pair_counts[['Cell_type', 'Sample_Origin', 'Barcode']].pivot_table(
    index='Cell_type', columns='Sample_Origin', values='Barcode'
)
# Missing pairs count as zero; rows follow the reversed cell_type_order.
origin_by_type = origin_by_type.fillna(0).loc[cell_type_order[::-1]]

origin_by_type.plot(kind='bar', stacked=True, figsize=(10,6))
plt.xlabel('')
plt.ylabel('Number of cells', fontsize=20, color='black')
plt.xticks(fontsize=20, color='black')
plt.yticks(fontsize=15, color='black')
plt.legend(fontsize=15, ncol=2)
plt.tight_layout()
plt.savefig('%s/cell_type_origin_breakdown.png'%(figure_folder), dpi=300)

del pair_counts, origin_by_type

## Save UMI matrix by pickle
Here we slice the large UMI matrix into chunks and pickle them to facilitate downstream tasks.
### Chunk and pickle

In [None]:
# Raw UMI count matrix (tab-separated text), located in data_folder
data_file = "GSE131907_Lung_Cancer_raw_UMI_matrix.txt"
# Scratch folder holding the pickled chunks of that matrix
intermediate_folder = "../data/Kim/tmp/"

In [None]:
# Stream the raw UMI matrix in blocks of `chunksize` rows and store each block
# as a gzip-compressed pickle named chunk_<i>.pkl in intermediate_folder.
chunksize = 5000

# Make sure the destination exists before the first chunk is written.
os.makedirs(intermediate_folder, exist_ok=True)

with pd.read_csv('%s/%s'%(data_folder,data_file), chunksize=chunksize, sep='\t') as reader:
    for i, chunk in enumerate(reader):
        print('ITER %s'%(i), flush=True)
        chunk.to_pickle('%s/chunk_%s.pkl'%(intermediate_folder, i),
                        compression='gzip')

### Load pickled data

In [None]:
# Reload the chunk_<i>.pkl files written by the chunking cell and stitch them
# back into a single matrix.
# NOTE: pickles execute code on load; intermediate_folder must only contain
# files produced by this notebook.

# Only the chunk files are read, in numeric order, so that stray files in the
# folder are ignored and the row order does not depend on os.listdir.
chunk_files = [
    f for f in os.listdir(intermediate_folder)
    if f.startswith('chunk_') and f.endswith('.pkl')
]
chunk_files.sort(key=lambda f: int(f[len('chunk_'):-len('.pkl')]))

data_df = []
for f in chunk_files:
    print('START %s'%(f))
    chunk = pd.read_pickle(
        '%s/%s'%(intermediate_folder, f),
        compression='gzip'
    )
    # The 'Index' column becomes the row index of each chunk.
    data_df.append(chunk.set_index('Index'))

print('CONCAT')
data_df = pd.concat(data_df, axis=0)
gc.collect()

## Tumor cell specific
### Restrict data

In [None]:
# Identifiers of the cells annotated as epithelial.
is_epithelial = annot_df['Cell_type'] == 'Epithelial cells'
ct_annot_df = annot_df[is_epithelial]
ct_samples = ct_annot_df['Index'].values.astype(str)

print('SAMPLE FILTERING', flush=True)
# Select those cells among the columns of data_df, then transpose so that
# each row is a cell.
tumor_df = data_df[ct_samples].T

print('GENE FILTERING', flush=True)
# Drop the columns whose total over the selected cells is zero.
gene_totals = tumor_df.sum(axis=0)
non_zero_genes = tumor_df.columns[(gene_totals > 0).values]
tumor_df = tumor_df[non_zero_genes]

### Gene filtering

### Protein coding

In [None]:
# Gene lookup table (tab-separated), reduced to symbol, chromosome and status.
gene_lookup_df = pd.read_csv('../data/genes/pybiomart_gene_status.csv', sep='\t', index_col=0)
gene_lookup_df = gene_lookup_df.loc[:, ['Hugo', 'chromosome_name', 'status']].drop_duplicates()

# Rows whose status is 'protein_coding'.
is_protein_coding = gene_lookup_df['status'] == 'protein_coding'
protein_coding_df = gene_lookup_df[is_protein_coding]
print('%s protein coding genes from pybiomart'%(protein_coding_df.shape[0]))

### Non-mitochondrial

In [None]:
# Chromosome labels treated as non-mitochondrial: '1'..'22', 'X' and 'Y'.
chromosome = np.array([str(c) for c in range(1, 23)] + ['X', 'Y'])

on_listed_chromosome = gene_lookup_df['chromosome_name'].isin(chromosome)
non_mitochondrial_df = gene_lookup_df[on_listed_chromosome]
mitochondrial_df = gene_lookup_df[gene_lookup_df['chromosome_name'] == 'MT']

# Symbols that are both protein coding and on one of the listed chromosomes
# (np.intersect1d already returns sorted unique values).
relevant_genes = np.intersect1d(
    non_mitochondrial_df['Hugo'].values,
    protein_coding_df['Hugo'].values,
).astype(str)

### Ribosomal

In [None]:
# Ribosomal gene list: comma-separated file whose first line is skipped.
ribosomal_genes_df = pd.read_csv('../data/genes/ribosomal_genes.csv', sep=',', index_col=0, skiprows=1)

# Gene symbols as a plain string array.
ribosomal_genes = ribosomal_genes_df['Gene'].to_numpy().astype(str)

### Filtering

In [None]:
# Keep only the genes of tumor_df that are listed as protein coding.
common_genes = np.intersect1d(tumor_df.columns, protein_coding_df['Hugo'].values)
# The message names the dataset processed in this notebook (Kim et al).
print('%s genes in Kim et al, %s of which are selected'%(tumor_df.shape[1], common_genes.shape[0]))

tumor_df = tumor_df[common_genes]

## Scanpy filtering

In [None]:
# Destination folder for the processed outputs written at the end
scanpy_data_folder = "../data/Kim/processed/"

In [None]:
# Wrap the filtered count table (rows: cells, columns: genes) in an AnnData
tumor_data_an = AnnData(tumor_df)
print('Initial shape: %s samples x %s genes'%tumor_data_an.shape)

In [None]:
# Save the protein-coding count table, restricted to the genes located on
# chromosomes 1-22, X or Y, as a gzip-compressed pickle
kept_genes = np.intersect1d(non_mitochondrial_df['Hugo'].values, tumor_df.columns)
tumor_df[kept_genes].to_pickle('../data/Kim/processed/lung_protein_coding.pkl', compression='gzip')

### QC analysis

In [None]:
# QC tables are returned rather than stored on tumor_data_an; the first
# element is the table used for the plot that follows.
qc_metrics = scanpy.pp.calculate_qc_metrics(tumor_data_an, inplace=False)

In [None]:
# Hexbin joint plot of two columns of the first QC table: log1p total counts
# against log1p number of genes by counts.
# x and y are passed by keyword: recent seaborn releases no longer accept
# them positionally.
ax = sns.jointplot(
        x="log1p_total_counts", y="log1p_n_genes_by_counts",
        data=qc_metrics[0], kind="hex"
    )

# Same label styling on both axes of the joint panel.
for joint_axis in (ax.ax_joint.xaxis, ax.ax_joint.yaxis):
    joint_axis.label.set_size(20)
    joint_axis.label.set_color('black')

plt.tight_layout()
plt.savefig('%s/QC_plot.png'%(figure_folder), dpi=300)

### Filter cells
https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.filter_cells.html

In [None]:
# Minimum number of genes passed to the cell filter.
min_genes = 200
n_cells_before = tumor_df.shape[0]

# tumor_data_an is filtered by the call itself; its shape is read back below.
filter_cells = scanpy.pp.filter_cells(tumor_data_an, min_genes=min_genes)

print('Going from %s cells to %s cells'%(n_cells_before, tumor_data_an.shape[0]))

### Filter genes
https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.filter_genes.html

In [None]:
# Minimum number of cells passed to the gene filter.
min_cells = 3
n_genes_before = tumor_df.shape[1]

# tumor_data_an is filtered by the call itself; its shape is read back below.
filter_genes = scanpy.pp.filter_genes(tumor_data_an, min_cells=min_cells)

print('Going from %s genes to %s genes'%(n_genes_before, tumor_data_an.shape[1]))

### Mitochondrial percentage

In [None]:
# Count table with one row per gene, plus a boolean column flagging the genes
# found in the mitochondrial lookup table.
MT_prop_df = tumor_data_an.to_df().T
MT_genes = np.intersect1d(mitochondrial_df['Hugo'].values, MT_prop_df.index)
MT_prop_df['IS_MT'] = MT_prop_df.index.isin(MT_genes)

# Sum the counts of each group per cell (columns False / True after the
# transpose), then scale every cell's row so that it sums to one.
MT_prop_df = MT_prop_df.groupby('IS_MT').sum().T
MT_prop_df = MT_prop_df.div(MT_prop_df.sum(axis=1), axis=0)

In [None]:
# Distribution of the mitochondrial proportion across cells.
mt_fraction = MT_prop_df[True]
mt_fraction_sorted = mt_fraction.sort_values()

plt.figure(figsize=(4,6))
sns.violinplot(y=mt_fraction, orient='v', alpha=0.7)
# Individual points are drawn only for the 5000 lowest and 5000 highest values.
sns.swarmplot(y=mt_fraction_sorted.head(5000).values, color='black', size=2)
sns.swarmplot(y=mt_fraction_sorted.tail(5000).values, color='black', size=2)

plt.title('MT proportion', fontsize=20, color='black')
plt.ylabel('MT counts / all counts per cell', fontsize=20, color='black')
plt.yticks(fontsize=15, color='black')
plt.tight_layout()
plt.savefig('%s/MT_proportion.png'%(figure_folder), dpi=300)

### Ribosomal percentage

In [None]:
# Count table with one row per gene, plus a boolean column flagging the genes
# found in the ribosomal gene list.
ribo_prop_df = tumor_data_an.to_df().T
ribo_genes = np.intersect1d(ribosomal_genes, ribo_prop_df.index)
ribo_prop_df['IS_RIBO'] = ribo_prop_df.index.isin(ribo_genes)

# Sum the counts of each group per cell (columns False / True after the
# transpose), then scale every cell's row so that it sums to one.
ribo_prop_df = ribo_prop_df.groupby('IS_RIBO').sum().T
ribo_prop_df = ribo_prop_df.div(ribo_prop_df.sum(axis=1), axis=0)

In [None]:
# Distribution of the ribosomal proportion across cells.
ribo_fraction = ribo_prop_df[True]
ribo_fraction_sorted = ribo_fraction.sort_values()

plt.figure(figsize=(4.5,6))
sns.violinplot(y=ribo_fraction, orient='v', alpha=0.7)
# Individual points are drawn only for the 5000 lowest and 5000 highest values.
sns.swarmplot(y=ribo_fraction_sorted.head(5000).values, color='black', size=2)
sns.swarmplot(y=ribo_fraction_sorted.tail(5000).values, color='black', size=2)

plt.title('Ribosomal gene proportion', fontsize=20, color='black')
plt.ylabel('Ribosomal counts / all counts \n (per cell)', fontsize=20, color='black')
plt.yticks(fontsize=15, color='black')
plt.tight_layout()
plt.savefig('%s/Ribo_proportion.png'%(figure_folder), dpi=300)

In [None]:
# Cells are kept when their ribosomal proportion lies strictly between
# these two bounds.
ribo_filtering_params = {'min': 0.05, 'max': 0.6}

ribo_fraction = ribo_prop_df[True]
below_max = ribo_fraction < ribo_filtering_params['max']
above_min = ribo_fraction > ribo_filtering_params['min']
ribosomal_filtered_samples = ribo_prop_df[below_max & above_min].index

n_removed = ribo_prop_df.shape[0] - ribosomal_filtered_samples.shape[0]
print('%s cells filtered'%(n_removed))

# Subset the AnnData to the retained cells.
tumor_data_an = tumor_data_an[ribosomal_filtered_samples]

### Restriction to protein coding

In [None]:
# Genes of the AnnData that are listed as protein coding.
data_pc_genes = np.intersect1d(tumor_data_an.var.index, protein_coding_df['Hugo'])
# Subsetting an AnnData yields a view; .copy() makes it a standalone object so
# that the highly-variable-gene step, which annotates .var, does not have to
# modify a view.
tumor_data_an = tumor_data_an[:,data_pc_genes].copy()
print('%s PC genes'%(data_pc_genes.shape[0]))

### Highly variable genes
https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.highly_variable_genes.html

In [None]:
# Number of highly variable genes to retain.
n_top_genes = 3000

# Annotates tumor_data_an.var; the 'highly_variable' and
# 'highly_variable_rank' columns are read in the next cell.
scanpy.pp.highly_variable_genes(
    tumor_data_an,
    flavor='seurat_v3',
    n_top_genes=n_top_genes,
)

In [None]:
# Names of the genes flagged as highly variable, ordered by their rank.
hvg_table = tumor_data_an.var.loc[tumor_data_an.var['highly_variable']]
high_var_genes = hvg_table.sort_values('highly_variable_rank').index.to_numpy().astype(str)

In [None]:
# Overlap of the highly variable genes with each gene category.
n_hvg_protein_coding = np.intersect1d(high_var_genes, protein_coding_df['Hugo'].values).shape[0]
n_hvg_mt = np.intersect1d(high_var_genes, mitochondrial_df['Hugo'].values).shape[0]
n_hvg_ribosomal = np.intersect1d(high_var_genes, ribosomal_genes).shape[0]

print('%s highly variable genes'%(high_var_genes.shape[0]))
print('%s are protein coding'%(n_hvg_protein_coding))
print('%s are MT'%(n_hvg_mt))
print('%s are ribosomal'%(n_hvg_ribosomal))

## Manual checking to remove outliers

In [None]:
# Restrict the AnnData to the highly variable genes, in the order of
# high_var_genes. Subsetting returns a view of tumor_data_an, not a copy.
filtered_tumor_data_an = tumor_data_an[:,high_var_genes]

### Gene-level: number of cells expressing a gene

In [None]:
# For every retained gene, the fraction of cells with a non-zero value.
n_cells_total = filtered_tumor_data_an.shape[0]
nonzero_fraction = (filtered_tumor_data_an.to_df() != 0).sum(axis=0) / n_cells_total

# One narrow panel (A, violin) next to a wide panel (B, sorted curve).
fig = plt.figure(constrained_layout=True, figsize=(10,5))
axes = fig.subplot_mosaic(
    """
    ABBB
    """
)

sns.violinplot(y=nonzero_fraction, orient='v', ax=axes['A'])
axes['B'].plot(nonzero_fraction.sort_values().values, linewidth=3)

axes['A'].set_ylabel('Proportion of non zero per gene', fontsize=20, color='black')
axes['B'].set_xlabel('Gene rank', fontsize=20, color='black')
# Both panels share the same y range and tick styling.
for panel in ('A', 'B'):
    axes[panel].set_ylim(-0.05, 1.05)
    axes[panel].tick_params(axis='both', which='major', labelsize=15)

plt.tight_layout()
plt.savefig('%s/gene_dropout_rank.png'%(figure_folder), dpi=300)
plt.show()

del nonzero_fraction

### Sample-level: library size

In [None]:
# Library size of a cell: sum of its values over the retained genes.
library_size_df = filtered_tumor_data_an.to_df().sum(axis=1)

# One narrow panel (A, violin) next to a wide panel (B, sorted curve).
fig = plt.figure(constrained_layout=True, figsize=(10,5))
axes = fig.subplot_mosaic(
    """
    ABBB
    """
)

sns.violinplot(y=library_size_df, orient='v', ax=axes['A'])
axes['B'].plot(library_size_df.sort_values().values, linewidth=3)

axes['A'].set_ylabel('Library size per single cell', fontsize=20, color='black')
axes['B'].set_xlabel('Cell rank', fontsize=20, color='black')
for panel in ('A', 'B'):
    axes[panel].tick_params(axis='both', which='major', labelsize=15)

plt.tight_layout()
plt.savefig('%s/library_size.png'%(figure_folder), dpi=300)
plt.show()

In [None]:
# Cells are kept when their library size lies strictly between these bounds.
threshold_library_size = {'min_library_size': 200, 'max_library_size':15000}

above_min = library_size_df > threshold_library_size['min_library_size']
below_max = library_size_df < threshold_library_size['max_library_size']
selected_cells = above_min & below_max

n_selected = np.sum(selected_cells)
n_total = filtered_tumor_data_an.shape[0]
print('%s cells selected out of %s: %s %%'%(n_selected, n_total, n_selected / n_total * 100))

# Boolean mask over the cells of the AnnData.
filtered_tumor_data_an = filtered_tumor_data_an[selected_cells]

### Total expression per gene

In [None]:
# Total expression of each gene: sum of its values over the retained cells.
gene_total_exp_df = np.sum(filtered_tumor_data_an.to_df(), axis=0)

# One narrow panel (A, violin) next to a wide panel (B, sorted curve).
axes = plt.figure(constrained_layout=True, figsize=(10,5)).subplot_mosaic(
    """
    ABBB
    """
)
sns.violinplot(y=gene_total_exp_df, orient='v', ax=axes['A'])
# The plotted quantity is a per-gene sum, not a proportion of non-zero
# values, so the axis label says so.
axes['A'].set_ylabel('Total expression per gene', fontsize=20, color='black')
axes['A'].tick_params(axis='both', which='major', labelsize=15)

axes['B'].plot(gene_total_exp_df.sort_values().values, linewidth=3, marker='+')
axes['B'].tick_params(axis='both', which='major', labelsize=15)
axes['B'].set_xlabel('Gene rank', fontsize=20, color='black')

plt.tight_layout()
plt.savefig('%s/gene_total_exp.png'%(figure_folder), dpi=300, facecolor='white')
plt.show()

In [None]:
# Rank the genes by total expression once; both the 20 genes plotted below
# and the 50 genes written to disk come from the top of this ranking. The
# CSV must use the sorted series: the tail of the unsorted one is just the
# last 50 genes in column order.
sorted_gene_totals = gene_total_exp_df.sort_values()
top_exp_genes = sorted_gene_totals.tail(20).index
sorted_gene_totals.tail(50).to_csv('%s/exp_genes.csv'%(figure_folder))

# One density plot per top gene, saved under the gene's name.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases -
# consider migrating to histplot/displot once the figure style is agreed.
for g in top_exp_genes:
    sns.distplot(filtered_tumor_data_an[:,g].X)
    plt.title('Expression of %s'%(g), fontsize=20)
    plt.ylabel('Density', fontsize=20, color='black')
    plt.xlabel('Expression', fontsize=20, color='black')
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    plt.tight_layout()
    plt.savefig('%s/exp_genes_%s_after_sample_filter.png'%(figure_folder, g),
                dpi=300, facecolor='white')
    plt.show()

### Save

In [None]:
# Filtering parameters used above, gathered under their output names.
filtering_params = {
    'min_cells': min_cells,
    'min_genes': min_genes,
    'n_top_genes': n_top_genes,
    'min_library_size': threshold_library_size['min_library_size'],
    'max_library_size': threshold_library_size['max_library_size'],
    'min_ribosomal_filtering': ribo_filtering_params['min'],
    'max_ribosomal_filtering': ribo_filtering_params['max'],
}

# Single-row frame, transposed so that each parameter sits on its own row.
save_df = pd.DataFrame({name: [value] for name, value in filtering_params.items()}).T

In [None]:
# Write the filtered data in several formats, plus the filtering parameters.

# Make sure the destination folder exists before anything is written.
os.makedirs(scanpy_data_folder, exist_ok=True)

# The statements below edit .var and .obs in place. If the object is still a
# view of another AnnData, turn it into a standalone copy first so that the
# edits apply to data it owns.
if filtered_tumor_data_an.is_view:
    filtered_tumor_data_an = filtered_tumor_data_an.copy()

print('Save AnnData as h5ad')
# Missing values in .var are replaced by -1 and 'n_genes' is stored as text
# before writing. NOTE(review): presumably done to keep the h5ad writer
# happy - confirm before changing.
filtered_tumor_data_an.var.fillna(-1, inplace=True)
filtered_tumor_data_an.obs['n_genes'] = filtered_tumor_data_an.obs['n_genes'].astype(str)
filtered_tumor_data_an.write('%s/lung_data.h5ad'%(scanpy_data_folder))

print('Save AnnData as csv')
filtered_tumor_data_an.write_csvs('%s/lung_data'%(scanpy_data_folder))

# Build the dense expression table once and reuse it for both exports.
expression_df = filtered_tumor_data_an.to_df()

print('Save AnnData as pickled DataFrame')
expression_df.to_pickle('%s/lung_data.pkl'%(scanpy_data_folder), compression='gzip')

print('Save AnnData as csv DataFrame')
expression_df.to_csv('%s/lung_data.csv'%(scanpy_data_folder))

print('Save parameters')
save_df.to_csv('%s/filtering_params.csv'%(scanpy_data_folder))