# Support the notebook "LIGER_alignment_all_data": UMAP and plot

In [None]:
import os, sys
import numpy as np
import pandas as pd
import seaborn as sns
import umap
import pylab
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from copy import deepcopy

%config IPCompleter.use_jedi = False

## Import data
### Kim

In [None]:
# Tumor data: per-cell annotation table of the Kim dataset (GSE131907).
tumor_annot_df = pd.read_csv(
    '../data/Kim/raw/GSE131907_Lung_Cancer_cell_annotation.txt',
    sep='\t'
)
# 'Sample' is selected twice so that it fills both the 'sample' and the 'pool'
# column (presumably because this table has no separate pool identifier).
tumor_annot_df = tumor_annot_df[['Index', 'Sample', 'Cell_type', 'Sample']]
tumor_annot_df.columns = ['index', 'sample', 'type', 'pool']
tumor_annot_df['specimen'] = 'TUMOR'

# Label epithelial cells 'LUNG' and every other cell type 'OTHER'.
# The string labels are built in a single step instead of being written into a
# boolean column: assigning strings into a bool column relies on silent
# upcasting, which recent pandas versions deprecate.
is_non_epith = tumor_annot_df['type'] != 'Epithelial cells'
tumor_annot_df['char_type'] = np.where(is_non_epith, 'OTHER', 'LUNG')

In [None]:
# Read only the first line of the UMI matrix. With header=None and index_col=0
# the first field becomes the row label, and the remaining fields of that line
# are returned as an array of strings.
umi_first_row = pd.read_csv(
    '../data/Kim/raw/GSE131907_Lung_Cancer_raw_UMI_matrix.txt',
    sep='\t',
    index_col=0,
    header=None,
    nrows=1
)
tumor_umis = umi_first_row.values[0].astype(str)

# Reorder the annotation rows so they follow the order found in the UMI matrix
# (.loc raises a KeyError if an identifier has no annotation).
tumor_annot_df = tumor_annot_df.set_index('index')
tumor_annot_df = tumor_annot_df.loc[tumor_umis]
tumor_annot_df = tumor_annot_df.reset_index()

### Kinker

In [None]:
# Cell line data: per-cell metadata of the Kinker dataset.
# The file is read with a two-row header; the second level is dropped below.
cell_line_annot_df = pd.read_csv(
    '../data/Kinker/raw/Metadata.txt',
    header=[0,1],
    sep='\t'
)
cell_line_annot_df.columns = cell_line_annot_df.columns.droplevel(1)
cell_line_annot_df = cell_line_annot_df[['NAME', 'Cell_line', 'Cancer_type', 'Pool_ID']]
cell_line_annot_df.columns = ['index', 'sample', 'type', 'pool']
cell_line_annot_df['specimen'] = 'CELL_LINE'

# Label lung-cancer cell lines 'LUNG' and every other cancer type 'OTHER'.
# The string labels are built in a single step instead of being written into a
# boolean column: assigning strings into a bool column relies on silent
# upcasting, which recent pandas versions deprecate.
is_non_lung = cell_line_annot_df['type'] != 'Lung Cancer'
cell_line_annot_df['char_type'] = np.where(is_non_lung, 'OTHER', 'LUNG')

In [None]:
# Read only the first line of the CPM matrix and turn it into a one-column
# frame (one row per field of that line, first field excluded as row label).
cpm_first_row = pd.read_csv(
    '../data/Kinker/raw/CPM_data.txt',
    sep='\t',
    index_col=0,
    header=None,
    nrows=1
)
cell_line_cpm = cpm_first_row.T
cell_line_cpm.columns = ['index']

# Left-merge so the annotation follows the order of the CPM matrix; entries
# without metadata are kept with missing annotation values.
cell_line_annot_df = cell_line_cpm.merge(
    cell_line_annot_df,
    how='left',
    on='index'
)

## Analyse integrated

In [None]:
# Identifiers of the subsampled cells.
cell_line_subsample_file = './output/liger/subsampled_cell_lines_samples.csv'
tumor_subsample_file = './output/liger/subsampled_tumor_samples.csv'

# LIGER factor matrices (H), per dataset and normalized.
tumor_corrected_file = './output/liger/matrix_H_tumors.csv'
cell_line_corrected_file = './output/liger/matrix_H_cell_lines.csv'
scaled_corrected_file = './output/liger/matrix_H_normalized.csv'

# Cell-line sample annotation table.
ccle_annot_file = '../data/cell_lines/sample_info.csv'

In [None]:
# Identifiers of the subsampled cell-line cells (column 'x' of the CSV).
subsampled_cell_lines = pd.read_csv(cell_line_subsample_file)
cell_line_samples = subsampled_cell_lines['x'].values.astype(str)

# Keep only the subsampled cells, in the order given by the subsample file.
cell_line_annot_df = (
    cell_line_annot_df
    .set_index('index')
    .loc[cell_line_samples]
    .reset_index()
)

In [None]:
# Identifiers of the subsampled tumor cells (column 'x' of the CSV).
subsampled_tumors = pd.read_csv(tumor_subsample_file)
tumor_subsamples = subsampled_tumors['x'].values.astype(str)

# Keep only the subsampled cells, in the order given by the subsample file.
tumor_annot_df = (
    tumor_annot_df
    .set_index('index')
    .loc[tumor_subsamples]
    .reset_index()
)

# Stack both annotations (cell lines first, then tumors), indexed by cell.
annot_df = pd.concat([cell_line_annot_df, tumor_annot_df]).set_index('index')

In [None]:
# Cell-line sample annotation (merged on 'CCLE_Name' further down).
ccle_annot_df = pd.read_csv(filepath_or_buffer=ccle_annot_file)

# Normalized LIGER factor matrix; the first CSV column is used as row index.
combined_quantile_normalized_df = pd.read_csv(
    filepath_or_buffer=scaled_corrected_file,
    index_col=0
)

## UMAP

In [None]:
# UMAP hyper-parameters (also embedded in the names of the saved figures).
n_neighbors = 15
min_dist = 0.9
metric = 'cosine'
n_epochs = 5000

# 2-D UMAP model; no random_state is set, so the embedding is not
# reproducible from one run to the next.
umap_integrated_clf = umap.UMAP(
    n_components=2,
    n_neighbors=n_neighbors,
    min_dist=min_dist,
    metric=metric,
    n_epochs=n_epochs,
    verbose=5
)

# Fit on the normalized factor matrix and get one 2-D point per row.
umap_integrated_proj = umap_integrated_clf.fit_transform(
    combined_quantile_normalized_df
)

In [None]:
# Wrap the 2-D embedding in a DataFrame labelled with the cell identifiers.
# NOTE(review): this assumes the rows of combined_quantile_normalized_df are in
# the same order as annot_df (cell lines first, then tumors) -- TODO confirm.
umap_integrated_proj_df = pd.DataFrame(
    umap_integrated_proj,
    index=annot_df.index,
    columns=['UMAP 1', 'UMAP 2'])

# Attach the per-cell annotation (index-on-index join).
umap_integrated_proj_df = umap_integrated_proj_df.merge(annot_df, how='left', left_index=True, right_index=True)

# Attach the CCLE annotation through the sample name.
# NOTE(review): a column-on-column merge ignores the index, so the cell
# identifiers are no longer present after this step (nor in the saved CSV).
umap_integrated_proj_df = umap_integrated_proj_df.merge(ccle_annot_df,
                                                        left_on='sample',
                                                        right_on='CCLE_Name',
                                                        how='left')

# A cell is flagged NSCLC when its CCLE lineage subtype is 'NSCLC' or when its
# annotated type is 'Epithelial cells'.
umap_integrated_proj_df['is_nsclc'] = (
    (umap_integrated_proj_df['lineage_subtype'] == 'NSCLC')
    | (umap_integrated_proj_df['type'] == 'Epithelial cells')
)
umap_integrated_proj_df['str'] = umap_integrated_proj_df['is_nsclc'].apply(lambda x: 'NSCLC' if x else 'Other')
umap_integrated_proj_df['str'] = umap_integrated_proj_df['specimen'] + ' ' + umap_integrated_proj_df['str']

# Human-readable legend labels; anything not listed falls back to the
# micro-environment label, as in the original nested conditional.
plot_labels = {
    'CELL_LINE NSCLC': 'Cell-line: NSCLC',
    'CELL_LINE Other': 'Cell-line: other',
    'TUMOR NSCLC': 'Tumor: NSCLC',
}
umap_integrated_proj_df['plot_str'] = [
    plot_labels.get(x, 'Tumor: micro-environment')
    for x in umap_integrated_proj_df['str']
]

# Save umap
# The path is a plain literal: the previous '%' formatting had no placeholder
# and referenced an undefined variable, so the save always failed.
umap_integrated_proj_df.to_csv('./figures/liger/UMAP_df.csv')

In [None]:
# All scatterplot: every cell on one UMAP, coloured by specimen / NSCLC status.
palette = {
    # Cell lines in reds.
    'Cell-line: NSCLC': '#D62728',
    'Cell-line: other': (0.984313725490196, 0.6039215686274509, 0.6),
    # Tumors in blues.
    'Tumor: NSCLC': (0.12156862745098039, 0.47058823529411764, 0.7058823529411765),
    'Tumor: micro-environment': (0.6509803921568628, 0.807843137254902, 0.8901960784313725)
}

# Values substituted into both output file names.
umap_settings = (n_neighbors, metric, min_dist, n_epochs)

# The legend figure is created second so that it is the current figure when
# pylab.figlegend is called below.
fig = pylab.figure(figsize=(10,10))
figlegend = pylab.figure(figsize=(10,10))
ax = fig.add_subplot(111)

# Shuffle the rows so that no group is systematically drawn on top.
# NOTE(review): the sort is immediately undone by the shuffle.
shuffled_df = umap_integrated_proj_df.sort_values(
    ['specimen', 'is_nsclc'], ascending=False
).sample(frac=1)

sns.scatterplot(
    data=shuffled_df,
    x='UMAP 1',
    y='UMAP 2',
    hue='plot_str',
    palette=palette,
    alpha=0.8,
    marker='x',
    ax=ax
)
ax.set_xlabel('UMAP 1', fontsize=25, color='black')
ax.set_ylabel('UMAP 2', fontsize=25, color='black')
ax.tick_params(labelsize=20, labelcolor='black')

# Draw the legend on its own figure and save it separately.
legend_handles, legend_labels = ax.get_legend_handles_labels()
pylab.figlegend(legend_handles, legend_labels, loc='center', ncol=1, fontsize=15)
figlegend.tight_layout()
figlegend.savefig(
    './figures/liger/UMAP_neighbors_%s_metrics_%s_mindist_%s_epochs_%s_legend.png' % umap_settings,
    dpi=300
)

# Save the scatterplot itself without the in-axes legend.
ax.get_legend().remove()
fig.tight_layout()
fig.savefig(
    './figures/liger/UMAP_neighbors_%s_metrics_%s_mindist_%s_epochs_%s.png' % umap_settings,
    dpi=300
)

## Tumors

In [None]:
# Zoomed scatterplot: tumor cells only, one panel per 'plot_str' label,
# coloured by sample.
plot_df = umap_integrated_proj_df[umap_integrated_proj_df['specimen'] == 'TUMOR']
# Shuffle the rows so that no sample is systematically drawn on top.
plot_df = plot_df.sample(frac=1)

# FacetGrid creates its own figure, so no plt.figure() call is made first
# (it only left an empty figure behind). The per-facet size is given with
# `height`; the former `size` keyword is rejected by current seaborn.
g = sns.FacetGrid(
    plot_df,
    col='plot_str',
    hue='sample',
    palette='colorblind',
    sharex=True,
    sharey=True,
    height=6
)
g.map(
    sns.scatterplot,
    'UMAP 1', 
    'UMAP 2', 
    alpha=0.5,
    marker='x'
)
g.set_xlabels('UMAP 1', fontsize=20)
g.set_ylabels('UMAP 2', fontsize=20)
g.set_titles(col_template="{col_name}", row_template="", size=25, color='black')
plt.tight_layout()
plt.savefig('./figures/liger/UMAP_neighbors_tumors_sample_%s_metrics_%s_mindist_%s_epochs_%s.png'%(
    n_neighbors, metric, min_dist, n_epochs),
dpi=300)
plt.show()

# Drop the reference to the temporary tumor-only frame.
del plot_df

## Cell-lines

In [None]:
# Zoomed scatterplot: cell-line cells only, coloured by sample.
plt.figure(figsize=(10,10))
g = sns.scatterplot(data=umap_integrated_proj_df[umap_integrated_proj_df['specimen'] == 'CELL_LINE'], 
                    x='UMAP 1',
                    y='UMAP 2', 
                    hue='sample', 
                    palette='colorblind',
                    alpha=0.5, 
                    marker='x')
plt.xlabel('UMAP 1', fontsize=20, color='black')
plt.ylabel('UMAP 2', fontsize=20, color='black')
plt.xticks(fontsize=15, color='black')
plt.yticks(fontsize=15, color='black')
# Replace the per-sample legend with an empty, frameless one.
plt.legend([],[], frameon=False)
plt.tight_layout()
# The output folder is written literally, like the other figures of this
# notebook: `figure_folder` is not defined anywhere in this file.
plt.savefig('./figures/liger/UMAP_neighbors_cell_lines_sample_%s_metrics_%s_mindist_%s_epochs_%s.png'%(
    n_neighbors, 
    metric, 
    min_dist, 
    n_epochs),
dpi=300)
plt.show()