# Support the notebook "Seurat_alignment_all_data": UMAP and plotting

In [None]:
import os, sys
import numpy as np
import pandas as pd
import seaborn as sns
import umap, pylab
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from copy import deepcopy

%config IPCompleter.use_jedi = False

In [None]:
# NOTE(review): `data_file` is not assigned anywhere in this notebook; it is
# presumably provided by the notebook this one supports -- confirm.

# The path has no placeholder, so it must not be %-formatted (doing so with a
# string argument raises "not all arguments converted").
seurat_umap_file = './output/seurat/whole_UMAP.csv'

# Create the folder the figures are actually saved to, including any missing
# parent, instead of checking a different directory tree.
figure_folder = './figures/seurat/%s/'%(data_file)
os.makedirs(figure_folder, exist_ok=True)

## Import data
### Kim et al

In [None]:
# Tumor data: per-cell annotation table.
tumor_annot_df = pd.read_csv(
    '../data/Kim/raw/GSE131907_Lung_Cancer_cell_annotation.txt',
    sep='\t'
)
# 'Sample' is selected twice so that it fills both the 'sample' and the
# 'pool' columns after renaming.
tumor_annot_df = tumor_annot_df[['Index', 'Sample', 'Cell_type', 'Sample']]
tumor_annot_df.columns = ['index', 'sample', 'type', 'pool']
tumor_annot_df['specimen'] = 'TUMOR'

# Epithelial cells are labelled 'LUNG', every other cell type 'OTHER'.
# The labels are built in one step rather than by writing strings into a
# boolean column, which relies on silent dtype upcasting.
is_non_epith = ~ (tumor_annot_df['type'] == 'Epithelial cells')
tumor_annot_df['char_type'] = np.where(is_non_epith, 'OTHER', 'LUNG')

In [None]:
# Map back to samples.
# Only the first line of the UMI matrix is read (nrows=1, no header); its
# first field becomes the index and the remaining fields are kept as strings.
umi_first_row = pd.read_csv(
    '../data/Kim/raw/GSE131907_Lung_Cancer_raw_UMI_matrix.txt',
    sep='\t',
    header=None,
    index_col=0,
    nrows=1
)
tumor_umis = umi_first_row.values[0].astype(str)

# Re-order (and restrict) the annotation so it follows the order of those
# identifiers.
tumor_annot_df = tumor_annot_df.set_index('index')
tumor_annot_df = tumor_annot_df.loc[tumor_umis].reset_index()

### Kinker

In [None]:
# Cell line data: metadata table with a two-row header.
cell_line_annot_df = pd.read_csv(
    '../data/Kinker/raw/Metadata.txt',
    header=[0,1],
    sep='\t'
)
# Keep only the first header level as column names.
cell_line_annot_df.columns = cell_line_annot_df.columns.droplevel(1)
cell_line_annot_df = cell_line_annot_df[['NAME', 'Cell_line', 'Cancer_type', 'Pool_ID']]
cell_line_annot_df.columns = ['index', 'sample', 'type', 'pool']
cell_line_annot_df['specimen'] = 'CELL_LINE'

# Lung cancer cell lines are labelled 'LUNG', all others 'OTHER'.
# The labels are built in one step rather than by writing strings into a
# boolean column, which relies on silent dtype upcasting.
is_non_lung = ~ (cell_line_annot_df['type'] == 'Lung Cancer')
cell_line_annot_df['char_type'] = np.where(is_non_lung, 'OTHER', 'LUNG')

In [None]:
# First line of the cell-line CPM matrix, transposed into a single column
# named 'index'. The file is read from the Kinker directory, next to the
# cell-line metadata it is merged with (the tumor data lives under Kim).
cell_line_cpm = pd.read_csv(
    '../data/Kinker/raw/CPM_data.txt',
    nrows=1,
    header=None,
    index_col=0,
    sep='\t'
).T
cell_line_cpm.columns = ['index']

# Left merge: keeps one row per identifier of the CPM header, in that order,
# with the matching annotation attached.
cell_line_annot_df = cell_line_cpm.merge(cell_line_annot_df, on='index', how='left')

## Analyse integrated

In [None]:
# Locations of the Seurat outputs and of the CCLE annotation.
tumor_subsample_file = './output/seurat/subsampled_tumor_samples.csv'
cell_line_subsample_file = './output/seurat/subsampled_cell_lines_samples.csv'
# The path used to end in '_%s.csv' but is passed to pd.read_csv without any
# formatting, so the literal placeholder is dropped to match the un-suffixed
# naming of the other Seurat outputs.
# NOTE(review): confirm this is the file name written by the Seurat step.
integrated_file = './output/seurat/whole_integrated.csv'
integrated_scale_file = './output/seurat/whole_integrated_scaled.csv'
ccle_annot_file = '../data/cell_lines/sample_info.csv'

In [None]:
# Identifiers of the subsampled cell-line cells (column 'x'), as strings.
cell_line_subsample_df = pd.read_csv(cell_line_subsample_file)
cell_line_samples = cell_line_subsample_df['x'].values.astype(str)

# Restrict the cell-line annotation to those cells, in the subsample order.
cell_line_annot_df = cell_line_annot_df.set_index('index')
cell_line_annot_df = cell_line_annot_df.loc[cell_line_samples].reset_index()

# CCLE annotation table.
ccle_annot_df = pd.read_csv(ccle_annot_file)

In [None]:
# Identifiers of the subsampled tumor cells (column 'x'), as strings.
tumor_subsample_df = pd.read_csv(tumor_subsample_file)
tumor_subsamples = tumor_subsample_df['x'].values.astype(str)

# Restrict the tumor annotation to those cells, in the subsample order.
tumor_annot_df = tumor_annot_df.set_index('index')
tumor_annot_df = tumor_annot_df.loc[tumor_subsamples].reset_index()

# Stack cell lines on top of tumors and index the result by cell identifier.
annot_df = pd.concat([cell_line_annot_df, tumor_annot_df]).set_index('index')

In [None]:
# Load both integrated matrices and transpose them, so that the rows are
# labelled by what the files store as column headers.
integrated_data_df = pd.read_csv(integrated_file, index_col=0).T
integrated_scaled_data_df = pd.read_csv(integrated_scale_file, index_col=0).T

# Replace every '.' in the row labels by '-' so they can be looked up in
# annot_df (presumably undoing a renaming done on the R side -- confirm).
for _frame in (integrated_data_df, integrated_scaled_data_df):
    _frame.index = [label.replace('.', '-') for label in _frame.index]

# Align the annotation with the row order of the integrated data.
annot_df = annot_df.loc[integrated_data_df.index.values]

# Sanity checks: same number of rows everywhere, and both matrices list the
# cells in the same order.
n_annotated = annot_df.shape[0]
assert n_annotated == integrated_data_df.shape[0]
assert n_annotated == integrated_scaled_data_df.shape[0]
np.testing.assert_array_equal(
    integrated_scaled_data_df.index.values,
    integrated_data_df.index.values
)

## UMAP (from Seurat package)

In [None]:
# UMAP coordinates exported by Seurat; the first CSV column is the index.
# (The path is a plain string: the dangling '%' that followed it was a
# syntax error.)
seurat_umap_file = './output/seurat/whole_UMAP.csv'
seurat_umap_df = pd.read_csv(seurat_umap_file, index_col=0)

In [None]:
# Attach the per-cell annotation (matched on the index), then the CCLE
# annotation (matched on the sample name). Both are left joins.
seurat_umap_df = seurat_umap_df.merge(
    annot_df, left_index=True, right_index=True, how='left'
)
seurat_umap_df = seurat_umap_df.merge(
    ccle_annot_df, how='left', left_on='sample', right_on='CCLE_Name'
)

# A row is flagged NSCLC when its CCLE lineage subtype is 'NSCLC' or when its
# cell type is 'Epithelial cells'.
has_nsclc_lineage = seurat_umap_df['lineage_subtype'] == 'NSCLC'
is_epithelial = seurat_umap_df['type'] == 'Epithelial cells'
seurat_umap_df['is_nsclc'] = has_nsclc_lineage | is_epithelial

# Intermediate key "<specimen> <NSCLC|Other>".
seurat_umap_df['str'] = seurat_umap_df['is_nsclc'].map({True: 'NSCLC', False: 'Other'})
seurat_umap_df['str'] = seurat_umap_df['specimen'] + ' ' + seurat_umap_df['str']

# Human-readable legend label; any key not listed here falls back to
# 'Tumor: micro-environment'.
plot_labels = {
    'CELL_LINE NSCLC': 'Cell-line: NSCLC',
    'CELL_LINE Other': 'Cell-line: other',
    'TUMOR NSCLC': 'Tumor: NSCLC',
}
seurat_umap_df['plot_str'] = [
    plot_labels.get(key, 'Tumor: micro-environment')
    for key in seurat_umap_df['str']
]

In [None]:
# Scatter plot of all cells in the Seurat UMAP, coloured by legend label.
plt.figure(figsize=(10,10))
palette = {
    'Cell-line: NSCLC': '#fc0303',
    'Cell-line: other': '#ffb0b0',
    'Tumor: NSCLC': '#027d13',
    'Tumor: micro-environment': '#c0fac8'
}

# Rows are shuffled (sample(frac=1)) before plotting, so no group is
# systematically drawn on top of the others.
shuffled_umap_df = seurat_umap_df.sort_values(
    ['specimen', 'is_nsclc'], ascending=False
).sample(frac=1)

sns.scatterplot(
    data=shuffled_umap_df,
    x='UMAP_1', y='UMAP_2',
    hue='plot_str', palette=palette,
    marker='x', alpha=0.5
)

# Axis labels, ticks and legend.
plt.xlabel('UMAP 1', fontsize=25, color='black')
plt.ylabel('UMAP 2', fontsize=25, color='black')
plt.xticks(fontsize=20, color='black')
plt.yticks(fontsize=20, color='black')
plt.legend(fontsize=20, ncol=1, loc=2)

plt.tight_layout()
plt.savefig('%s/UMAP_seurat.png'%(figure_folder), dpi=300)
plt.show()

## UMAP (computed in Python)

In [None]:
# Number of principal components computed before running UMAP.
n_pc = 50

# Settings forwarded to umap.UMAP; they also appear in the figure file names.
n_neighbors = 15
min_dist = 0.9
metric = 'cosine'
n_epochs = 5000

In [None]:
# Two-dimensional UMAP configured with the settings chosen above.
umap_settings = dict(
    n_components=2,
    n_neighbors=n_neighbors,
    min_dist=min_dist,
    metric=metric,
    n_epochs=n_epochs,
    verbose=5
)
umap_integrated_clf = umap.UMAP(**umap_settings)

# Project the scaled integrated data on its first n_pc principal components,
# then embed those components with UMAP.
pca_components = PCA(n_pc).fit_transform(integrated_scaled_data_df)
umap_integrated_proj = umap_integrated_clf.fit_transform(pca_components)

In [None]:
# UMAP coordinates as a DataFrame indexed like the annotation.
umap_integrated_proj_df = pd.DataFrame(
    umap_integrated_proj,
    index=annot_df.index,
    columns=['UMAP 1', 'UMAP 2'])

# Attach the per-cell annotation (matched on the index), then the CCLE
# annotation (matched on the sample name). Both are left joins.
umap_integrated_proj_df = umap_integrated_proj_df.merge(
    annot_df, how='left', left_index=True, right_index=True
)
umap_integrated_proj_df = umap_integrated_proj_df.merge(
    ccle_annot_df, left_on='sample', right_on='CCLE_Name', how='left'
)

# A row is flagged NSCLC when its CCLE lineage subtype is 'NSCLC' or when its
# cell type is 'Epithelial cells'.
umap_integrated_proj_df['is_nsclc'] = (
    (umap_integrated_proj_df['lineage_subtype'] == 'NSCLC')
    | (umap_integrated_proj_df['type'] == 'Epithelial cells')
)

# Intermediate key "<specimen> <NSCLC|Other>".
umap_integrated_proj_df['str'] = umap_integrated_proj_df['is_nsclc'].apply(
    lambda x: 'NSCLC' if x else 'Other'
)
umap_integrated_proj_df['str'] = (
    umap_integrated_proj_df['specimen'] + ' ' + umap_integrated_proj_df['str']
)

# Human-readable legend label; any key not listed here falls back to
# 'Tumor: micro-environment'.
plot_labels = {
    'CELL_LINE NSCLC': 'Cell-line: NSCLC',
    'CELL_LINE Other': 'Cell-line: other',
    'TUMOR NSCLC': 'Tumor: NSCLC',
}
umap_integrated_proj_df['plot_str'] = [
    plot_labels.get(key, 'Tumor: micro-environment')
    for key in umap_integrated_proj_df['str']
]

# Saved next to the Seurat UMAP file, under the name 'python_UMAP'.
umap_integrated_proj_df.to_csv(seurat_umap_file.replace('whole_UMAP', 'python_UMAP'))

### Load UMAP plot values

In [None]:
# Reload the Python-computed UMAP from the CSV stored next to the Seurat one.
python_umap_file = seurat_umap_file.replace('whole_UMAP', 'python_UMAP')
umap_integrated_proj_df = pd.read_csv(python_umap_file, index_col=0, sep=',')

### Plot UMAP

In [None]:
# Scatter plot of all cells in the Python-computed UMAP. The legend is saved
# as a separate figure so the main figure can be exported without it.
palette = {
    'Cell-line: NSCLC': '#D62728',
    'Cell-line: other': (0.984313725490196, 0.6039215686274509, 0.6),
    'Tumor: NSCLC': (0.12156862745098039, 0.47058823529411764, 0.7058823529411765),
    'Tumor: micro-environment': (0.6509803921568628, 0.807843137254902, 0.8901960784313725)
}

fig = pylab.figure(figsize=(10,10))
figlegend = pylab.figure(figsize=(10,10))
ax = fig.add_subplot(111)

# Rows are shuffled (sample(frac=1)) before plotting, so no group is
# systematically drawn on top of the others.
sns.scatterplot(
    data=umap_integrated_proj_df.sort_values(['specimen', 'is_nsclc'], ascending=False).sample(frac=1),
    x='UMAP 1', y='UMAP 2',  hue='plot_str',
    alpha=0.5, palette=palette, marker='x', ax=ax
)

ax.set_xlabel('UMAP 1', fontsize=25, color='black')
ax.set_ylabel('UMAP 2', fontsize=25, color='black')
ax.tick_params(labelsize=20, labelcolor='black')

# Draw the legend on the dedicated figure explicitly. The former
# `plt.legend(...)` call acted on the current figure (`figlegend`), which
# added an empty axes to the legend-only image and never touched `ax`.
# No tight_layout here: the legend figure has no axes to lay out.
legend_handles, legend_labels = ax.get_legend_handles_labels()
figlegend.legend(legend_handles, legend_labels, loc='center', ncol=2, fontsize=15)
figlegend.savefig('%s/UMAP_neighbors_%s_metrics_%s_mindist_%s_epochs_%s_legend.png'%(
    figure_folder, n_neighbors, metric, min_dist, n_epochs
),dpi=300)

# Drop the legend from the scatter plot itself before saving it.
axes_legend = ax.get_legend()
if axes_legend is not None:
    axes_legend.remove()

fig.tight_layout()
fig.savefig('%s/UMAP_neighbors_%s_metrics_%s_mindist_%s_epochs_%s.png'%(
    figure_folder, n_neighbors, metric, min_dist, n_epochs
),dpi=300)

## Tumors

In [None]:
# Zoomed scatterplot: tumor cells only, one panel per legend label, coloured
# by sample.
plot_df = umap_integrated_proj_df[umap_integrated_proj_df['specimen'] == 'TUMOR']
# Shuffle the rows so the drawing order does not follow the sample order.
plot_df = plot_df.sample(frac=1)

# FacetGrid creates its own figure, so no separate plt.figure() is needed.
# `height` is the current name of the former `size` argument.
g = sns.FacetGrid(
    plot_df,
    col='plot_str',
    hue='sample',
    palette='colorblind',
    sharex=True,
    sharey=True,
    height=6
)
g.map(
    sns.scatterplot,
    'UMAP 1',
    'UMAP 2',
    alpha=0.5,
    marker='x'
)
g.set_xlabels('UMAP 1', fontsize=20)
g.set_ylabels('UMAP 2', fontsize=20)
g.set_titles(col_template="{col_name}", row_template="", size=25, color='black')
plt.tight_layout()
plt.savefig('%s/UMAP_neighbors_tumors_sample_%s_metrics_%s_mindist_%s_epochs_%s.png'%(
    figure_folder,
    n_neighbors,
    metric,
    min_dist,
    n_epochs),
dpi=300)
plt.show()

# Free the temporary subset.
del plot_df
