# Processing of Kinker et al dataset
Pre-processing of treatment-naive epithelial tumor cells from Kinker et al. (2020, Nature Genetics).

In [None]:
import os, sys
import numpy as np
import pandas as pd
import seaborn as sns
import gc
import scanpy
from anndata import AnnData
# import gzip
import matplotlib.pyplot as plt
%config IPCompleter.use_jedi = False

# Folder receiving every figure saved by this notebook. It is created up
# front because plt.savefig does not create missing directories.
figure_folder = './figures/Kinker_et_al_2020'
os.makedirs(figure_folder, exist_ok=True)

## Analysis

In [None]:
# Directory holding the raw input files (cell metadata and UMI counts) read below.
data_folder = '../data/Kinker/raw/'

In [None]:
# Load the per-cell annotation table. The file is read with two header rows;
# only the first header level is kept as column names.
annot_file = 'Metadata.txt'
annot_path = '%s/%s'%(data_folder, annot_file)
annot_df = pd.read_csv(annot_path, sep='\t', header=[0, 1])
annot_df.columns = annot_df.columns.get_level_values(0)

In [None]:
# Summary counts of the annotation table: overall, then restricted to the
# rows annotated as 'Lung Cancer'. The last two messages name lung cancer
# explicitly because that is the filter actually applied.
lung_annot_df = annot_df[annot_df['Cancer_type'] == 'Lung Cancer']

print('%s single cells'%(annot_df.shape[0]))
print('%s unique cell lines'%(annot_df['Cell_line'].nunique()))
print('%s lung cancer single cells'%(lung_annot_df.shape[0]))
print('%s unique lung cancer cell lines'%(lung_annot_df['Cell_line'].nunique()))

### Plot by cell-type

In [None]:
# Number of annotated cells per cancer type, sorted from the least to the
# most represented type.
plot_df = annot_df.groupby('Cancer_type').agg('count').sort_values('NAME', ascending=True)

# Record the ordering while the cancer types are still the index: after
# reset_index() the index is a plain 0..n-1 range and would only yield
# '0', '1', ... instead of the cancer-type names.
cell_type_order = np.array(plot_df.index).astype(str)

plot_df = plot_df.reset_index()
# Shorten the tick labels by removing the ' Cancer' suffix.
plot_df['Cancer_type'] = plot_df['Cancer_type'].str.replace(' Cancer', '')

plot_df.plot.bar(x='Cancer_type', y='NAME', figsize=(7,6))
plt.yticks(fontsize=15, color='black')
plt.xticks(fontsize=15, color='black', rotation=90)
plt.xlabel('')
plt.ylabel('')
plt.legend([])
plt.title('Number of cell per cell type', fontsize=20, color='black')
plt.tight_layout()
plt.savefig('%s/cell_type_decomposition.png'%(figure_folder), dpi=300)

del plot_df

## Data loading

In [None]:
# File names, looked up inside data_folder, of the UMI count table and of the cell metadata.
data_file = 'UMIcount_data.txt'
annot_file = 'Metadata.txt'

In [None]:
# Read the UMI count table (three header rows, first column used as row
# labels) and transpose it: the three header levels become the row index
# and the original row labels become the columns.
data_path = '%s/%s'%(data_folder, data_file)
data_df = pd.read_csv(data_path, sep='\t', header=[0, 1, 2], index_col=0).T

## Load gene filtering data
Load the gene annotations used to filter out genes that are not protein coding or that are mitochondrial.

### Protein coding

In [None]:
# Gene annotation table; only the symbol, chromosome and status columns
# are kept, with duplicated rows removed.
gene_lookup_df = pd.read_csv('../data/genes/pybiomart_gene_status.csv', sep='\t', index_col=0)
lookup_columns = ['Hugo', 'chromosome_name', 'status']
gene_lookup_df = gene_lookup_df.loc[:, lookup_columns].drop_duplicates()

# Subset of the lookup table flagged as protein coding.
is_protein_coding = gene_lookup_df['status'] == 'protein_coding'
protein_coding_df = gene_lookup_df[is_protein_coding]
print('%s protein coding genes from pybiomart'%(protein_coding_df.shape[0]))

### Non-mitochondrial

In [None]:
# Chromosome labels treated as non-mitochondrial: '1'..'22', 'X' and 'Y'.
autosomes = np.arange(1, 23).astype(str)
chromosome = np.concatenate([autosomes, ['X', 'Y']])

# NOTE(review): the comparison is done on string labels, so it presumably
# expects chromosome_name to have been parsed as text -- confirm.
chromosome_names = gene_lookup_df['chromosome_name']
non_mitochondrial_df = gene_lookup_df[chromosome_names.isin(chromosome)]
mitochondrial_df = gene_lookup_df[chromosome_names == 'MT']

In [None]:
# Symbols that are both protein coding and located on a non-mitochondrial
# chromosome. np.intersect1d already returns sorted, unique values.
relevant_genes = np.intersect1d(
    non_mitochondrial_df['Hugo'].values,
    protein_coding_df['Hugo'].values
).astype(str)

### Ribosomal

In [None]:
# Ribosomal gene list; the first line of the file is skipped before the header.
ribosomal_genes_df = pd.read_csv('../data/genes/ribosomal_genes.csv', sep=',', index_col=0, skiprows=1)

# Gene symbols as a plain array of strings.
ribosomal_genes = np.asarray(ribosomal_genes_df['Gene']).astype(str)

## Filtering

In [None]:
# Restrict the count table to the columns whose name is a protein-coding symbol.
protein_coding_symbols = protein_coding_df['Hugo'].values
common_genes = np.intersect1d(data_df.columns, protein_coding_symbols)

n_genes_total = data_df.shape[1]
n_genes_kept = common_genes.shape[0]
print('%s genes in Kinker et al, %s of which are selected'%(n_genes_total, n_genes_kept))

filtered_data_df = data_df.loc[:, common_genes]

## Restriction to NSCLC

In [None]:
# Cell-line annotation table, restricted to the rows whose lineage subtype is NSCLC.
ccle_annot_df = pd.read_csv('../data/cell_lines/sample_info.csv')
is_nsclc = ccle_annot_df['lineage_subtype'] == 'NSCLC'
ccle_annot_df = ccle_annot_df[is_nsclc]

# Cell lines present both in the count table ('Cell_line' index level)
# and among the NSCLC CCLE names.
kinker_cell_lines = filtered_data_df.index.get_level_values('Cell_line')
ccle_nsclc_names = np.unique(ccle_annot_df['CCLE_Name'].astype(str))
overlappinp_nsclc_cell_lines = np.intersect1d(kinker_cell_lines, ccle_nsclc_names).astype(str)

print('%s OVERLAPPING CELL LINES'%(overlappinp_nsclc_cell_lines.shape[0]))

# Boolean row mask: keep the cells belonging to an overlapping cell line.
nsclc_data_df = filtered_data_df.iloc[kinker_cell_lines.isin(overlappinp_nsclc_cell_lines)]

# Create lung AnnData
nsclc_data_an = AnnData(nsclc_data_df)

### Save protein coding genes with only UMI (for UCell)

In [None]:
# Flatten the row index into '<level 0>-<level 1>' labels. The AnnData
# object was built from nsclc_data_df before this re-labelling.
row_index = nsclc_data_df.index
nsclc_data_df.index = row_index.get_level_values(0) + '-' + row_index.get_level_values(1)

# Save the columns that are also listed as non-mitochondrial symbols.
non_mito_columns = np.intersect1d(non_mitochondrial_df['Hugo'].values, nsclc_data_df.columns)
nsclc_data_df[non_mito_columns].to_pickle(
    '../data/Kinker/processed/NSCLC_protein_coding.pkl',
    compression='gzip'
)

## Filtering for NSCLC data alone
### QC metrics

In [None]:
# Compute scanpy QC metrics on the NSCLC AnnData. The first element of the
# result is the table plotted below (it provides the 'log1p_total_counts'
# and 'log1p_n_genes_by_counts' columns).
nsclc_qc_metrics = scanpy.pp.calculate_qc_metrics(nsclc_data_an)

In [None]:
# Hexbin joint plot of the two log-transformed QC metrics.
# x and y are passed by keyword: recent seaborn releases no longer accept
# them positionally, while the keyword form works on older releases too.
ax = sns.jointplot(
    x="log1p_total_counts", y="log1p_n_genes_by_counts",
    data=nsclc_qc_metrics[0], kind="hex"
)
# Enlarge and darken the axis labels of the central panel.
ax.ax_joint.xaxis.label.set_size(20)
ax.ax_joint.xaxis.label.set_color('black')
ax.ax_joint.yaxis.label.set_size(20)
ax.ax_joint.yaxis.label.set_color('black')

plt.tight_layout()
plt.savefig('%s/NSCLC_QC_plot.png'%(figure_folder), dpi=300)

### Filter cells

In [None]:
# Cell-level filter: min_genes is handed to scanpy.pp.filter_cells as the
# minimum-genes threshold.
min_genes = 200
# NOTE(review): the call relies on scanpy's default in-place filtering of
# nsclc_data_an, so `filter_cells` itself presumably holds None -- confirm.
filter_cells = scanpy.pp.filter_cells(nsclc_data_an, min_genes=min_genes)
# Compare the row count of the unfiltered DataFrame with the filtered AnnData.
print('Going from %s cells to %s cells'%(nsclc_data_df.shape[0], nsclc_data_an.shape[0]))

### Filter genes

In [None]:
# Gene-level filter: min_cells is handed to scanpy.pp.filter_genes as the
# minimum-cells threshold.
min_cells = 3
# NOTE(review): the call relies on scanpy's default in-place filtering of
# nsclc_data_an, so `filter_genes` itself presumably holds None -- confirm.
filter_genes = scanpy.pp.filter_genes(nsclc_data_an, min_cells=min_cells)
# Compare the column count of the unfiltered DataFrame with the filtered AnnData.
print('Going from %s genes to %s genes'%(nsclc_data_df.shape[1], nsclc_data_an.shape[1]))

### Mitochondrial percentage

In [None]:
# Transposed count table (genes on the rows) with a boolean column flagging
# the genes listed as mitochondrial.
MT_prop_df = nsclc_data_an.to_df().transpose()
MT_genes = np.intersect1d(mitochondrial_df['Hugo'].values, MT_prop_df.index)
MT_prop_df['IS_MT'] = MT_prop_df.index.isin(MT_genes)

# Sum the counts within each flag value, then put the cells back on the rows.
MT_prop_df = MT_prop_df.groupby('IS_MT').sum().transpose()
# Divide each row by its total, turning the sums into per-cell proportions.
MT_prop_df = MT_prop_df.div(MT_prop_df.sum(axis=1), axis=0)

# Distribution of the mitochondrial proportion (column True) across cells.
mt_fraction = MT_prop_df[True]
plt.figure(figsize=(4,6))
sns.violinplot(y=mt_fraction, orient='v', alpha=0.7)
sns.swarmplot(y=mt_fraction.values, color='black', size=2)

plt.ylabel('MT counts / all counts per cell', fontsize=20, color='black')
plt.yticks(fontsize=15, color='black')
plt.title('MT proportion', fontsize=20, color='black')
plt.tight_layout()
plt.savefig('%s/NSCLC_MT_proportion.png'%(figure_folder), dpi=300)

### Ribosomal proportion

In [None]:
# Transposed count table (genes on the rows) with a boolean column flagging
# the genes listed as ribosomal.
ribo_prop_df = nsclc_data_an.to_df().transpose()
ribo_genes = np.intersect1d(ribosomal_genes, ribo_prop_df.index)
ribo_prop_df['IS_RIBO'] = ribo_prop_df.index.isin(ribo_genes)

# Sum the counts within each flag value, then put the cells back on the rows.
ribo_prop_df = ribo_prop_df.groupby('IS_RIBO').sum().transpose()
# Divide each row by its total, turning the sums into per-cell proportions.
ribo_prop_df = ribo_prop_df.div(ribo_prop_df.sum(axis=1), axis=0)

# Distribution of the ribosomal proportion (column True) across cells.
ribo_fraction = ribo_prop_df[True]
plt.figure(figsize=(4.5,6))
sns.violinplot(y=ribo_fraction, orient='v', alpha=0.7)
sns.swarmplot(y=ribo_fraction, color='black', size=2)

plt.ylabel('Ribosomal counts / all counts \n (per cell)', fontsize=20, color='black')
plt.yticks(fontsize=15, color='black')
plt.title('Ribosomal gene proportion', fontsize=20, color='black')
plt.tight_layout()
plt.savefig('%s/NSCLC_Ribo_proportion.png'%(figure_folder), dpi=300)

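<b>Ribosomal filtering:</b> Cells are kept only if their ribosomal proportion lies strictly between 0.1 and 0.5; cells at or below 0.1, or at or above 0.5, are removed.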

In [None]:
# Bounds (both exclusive) on the per-cell ribosomal proportion.
nsclc_ribo_filtering_params = {'min': 0.1, 'max': 0.5}

ribo_fraction = ribo_prop_df[True]
below_max = ribo_fraction < nsclc_ribo_filtering_params['max']
above_min = ribo_fraction > nsclc_ribo_filtering_params['min']
ribosomal_filtered_samples = ribo_prop_df[below_max & above_min].index

# Report how many cells fall outside the bounds, then keep the others.
n_removed = ribo_prop_df.shape[0] - ribosomal_filtered_samples.shape[0]
print('%s cells filtered'%(n_removed))
nsclc_data_an = nsclc_data_an[ribosomal_filtered_samples]

### Restriction to protein coding

In [None]:
# Genes of the AnnData that are also listed as protein coding.
protein_coding_symbols = protein_coding_df['Hugo']
data_pc_genes = np.intersect1d(nsclc_data_an.var.index, protein_coding_symbols)

# Column-wise subset of the AnnData to those genes.
nsclc_data_an = nsclc_data_an[:, data_pc_genes]
print('%s PC genes'%(data_pc_genes.shape[0]))

### Highly variable genes
https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.highly_variable_genes.html

In [None]:
# Number of highly variable genes requested from scanpy.
n_top_genes = 3000

scanpy.pp.highly_variable_genes(nsclc_data_an, n_top_genes=n_top_genes, flavor='seurat_v3')

# Genes flagged 'highly_variable', ordered by 'highly_variable_rank'.
hvg_table = nsclc_data_an.var[nsclc_data_an.var['highly_variable']]
hvg_table = hvg_table.sort_values('highly_variable_rank')
nsclc_high_var_genes = np.array(hvg_table.index).astype(str)

print('%s highly variable genes'%(nsclc_high_var_genes.shape[0]))

# Overlap of the selected genes with each reference gene list.
reference_gene_sets = (
    ('%s are protein coding', protein_coding_df['Hugo'].values),
    ('%s are MT', mitochondrial_df['Hugo'].values),
    ('%s are ribosomal', ribosomal_genes),
)
for message, gene_set in reference_gene_sets:
    print(message%(np.intersect1d(nsclc_high_var_genes, gene_set).shape[0]))

### Checking and removing outliers

In [None]:
# Restrict the AnnData columns to the highly variable genes selected above.
nsclc_data_an = nsclc_data_an[:,nsclc_high_var_genes]

#### Gene-level: number of cells expressing a gene

In [None]:
# Per gene: fraction of cells with a non-zero count.
plot_df = np.sum(nsclc_data_an.to_df() != 0, axis=0) / nsclc_data_an.shape[0]

# Narrow panel A (violin) next to a wide panel B (sorted curve).
# The figure is laid out by plt.tight_layout() only: requesting
# constrained_layout as well makes matplotlib warn and discard it as soon
# as tight_layout() is called.
axes = plt.figure(figsize=(10,5)).subplot_mosaic(
    """
    ABBB
    """
)
sns.violinplot(y=plot_df, orient='v', ax=axes['A'])
axes['A'].set_ylim(-0.05, 1.05)
axes['A'].set_ylabel('Proportion of non zero per gene', fontsize=20, color='black')
axes['A'].tick_params(axis='both', which='major', labelsize=15)

# Same values sorted in increasing order, on the same y-range as panel A.
axes['B'].plot(plot_df.sort_values().values, linewidth=3)
axes['B'].set_ylim(-0.05, 1.05)
axes['B'].tick_params(axis='both', which='major', labelsize=15)
axes['B'].set_xlabel('Gene rank', fontsize=20, color='black')

plt.tight_layout()
plt.savefig('%s/NSCLC_gene_dropout_rank.png'%(figure_folder), dpi=300, facecolor='white')
plt.show()

del plot_df

#### Sample-level: library size

In [None]:
# Per cell: sum of the counts over the genes kept in the AnnData.
library_size_df = np.sum(nsclc_data_an.to_df(), axis=1)

# Narrow panel A (violin) next to a wide panel B (sorted curve).
# The figure is laid out by plt.tight_layout() only: requesting
# constrained_layout as well makes matplotlib warn and discard it as soon
# as tight_layout() is called.
axes = plt.figure(figsize=(10,5)).subplot_mosaic(
    """
    ABBB
    """
)
sns.violinplot(y=library_size_df, orient='v', ax=axes['A'])
axes['A'].set_ylabel('Library size per single cell', fontsize=20, color='black')
axes['A'].tick_params(axis='both', which='major', labelsize=15)

# Same values sorted in increasing order.
axes['B'].plot(library_size_df.sort_values().values, linewidth=3)
axes['B'].tick_params(axis='both', which='major', labelsize=15)
axes['B'].set_xlabel('Cell rank', fontsize=20, color='black')

plt.tight_layout()
plt.savefig('%s/NSCLC_library_size.png'%(figure_folder), dpi=300, facecolor='white')
plt.show()

In [None]:
# Bounds (both exclusive) on the per-cell library size.
nsclc_threshold_library_size = {'min_library_size': 500, 'max_library_size':15000}

lower_bound = nsclc_threshold_library_size['min_library_size']
upper_bound = nsclc_threshold_library_size['max_library_size']
selected_cells = (library_size_df > lower_bound) & (library_size_df < upper_bound)

# NOTE(review): the denominator is the row count of nsclc_data_df, not the
# number of cells currently in nsclc_data_an -- confirm this is intended.
n_selected = np.sum(selected_cells)
n_reference = nsclc_data_df.shape[0]
print('%s cells selected out of %s: %s %%'%(
    n_selected,
    n_reference,
    n_selected / n_reference * 100
))

nsclc_data_an = nsclc_data_an[selected_cells]

### Total expression per gene

In [None]:
# Per gene: sum of the counts over the cells kept in the AnnData.
gene_total_exp_df = np.sum(nsclc_data_an.to_df(), axis=0)

# Narrow panel A (violin) next to a wide panel B (sorted curve).
# The figure is laid out by plt.tight_layout() only: requesting
# constrained_layout as well makes matplotlib warn and discard it as soon
# as tight_layout() is called.
axes = plt.figure(figsize=(10,5)).subplot_mosaic(
    """
    ABBB
    """
)
sns.violinplot(y=gene_total_exp_df, orient='v', ax=axes['A'])
# The plotted quantity is a total count, not a proportion of non-zero
# values, so the axis is labelled accordingly.
axes['A'].set_ylabel('Total expression per gene', fontsize=20, color='black')
axes['A'].tick_params(axis='both', which='major', labelsize=15)

# Same values sorted in increasing order.
axes['B'].plot(gene_total_exp_df.sort_values().values, linewidth=3, marker='+')
axes['B'].tick_params(axis='both', which='major', labelsize=15)
axes['B'].set_xlabel('Gene rank', fontsize=20, color='black')

plt.tight_layout()
plt.savefig('%s/NSCLC_gene_total_exp.png'%(figure_folder), dpi=300, facecolor='white')
plt.show()

### Save lung cell lines

In [None]:
# Filtering parameters used above, gathered in one table
# (one row per parameter after the transpose).
filtering_parameters = {
    'min_cells': min_cells,
    'min_genes': min_genes,
    'n_top_genes': n_top_genes,
    'min_library_size': nsclc_threshold_library_size['min_library_size'],
    'max_library_size': nsclc_threshold_library_size['max_library_size'],
    'min_ribosomal_filtering': nsclc_ribo_filtering_params['min'],
    'max_ribosomal_filtering': nsclc_ribo_filtering_params['max'],
}
nsclc_save_df = pd.DataFrame(
    {name: [value] for name, value in filtering_parameters.items()}
).T

In [None]:
# Output directory for the processed NSCLC data and the filtering parameters.
nsclc_data_folder = '../data/Kinker/processed/'

In [None]:
# The h5ad export below is commented out (it refers to names that are not
# defined in this notebook); only the message is printed.
print('Save AnnData as h5ad')
# lung_data_filtered_an.obs.fillna(-1, inplace=True)
# lung_data_filtered_an.var.fillna(-1, inplace=True)
# lung_data_filtered_an.obs['n_genes'] = lung_data_filtered_an.obs['n_genes'].astype(str)
# lung_data_filtered_an.write('%s/lung_data.h5ad'%(save_lung_data_folder))

# Output locations, all inside nsclc_data_folder.
csvs_folder = '%s/NSCLC_data'%(nsclc_data_folder)
pickle_path = '%s/NSCLC_data.pkl'%(nsclc_data_folder)
params_path = '%s/filtering_params.csv'%(nsclc_data_folder)
csv_path = '%s/NSCLC_data.csv'%(nsclc_data_folder)

print('Save AnnData as csv')
nsclc_data_an.write_csvs(csvs_folder)

print('Save AnnData as pickled DataFrame')
nsclc_data_an.to_df().to_pickle(pickle_path, compression='gzip')

print('Save parameters')
nsclc_save_df.to_csv(params_path)

print('Save AnnData as csv DataFrame')
nsclc_data_an.to_df().to_csv(csv_path)