# Comparison to Harmony when using the whole data
We use the following implementation: https://github.com/slowkow/harmonypy

In [None]:
import os, sys
import numpy as np
import pandas as pd
import seaborn as sns
import umap, gc, pyreadr, pylab, scanpy
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from copy import deepcopy
from anndata import AnnData
import harmonypy as hm

%config IPCompleter.use_jedi = False

In [None]:
# Directory receiving every figure and table written by this notebook.
figure_folder = './figures/harmony/'
# Create it up front so the later to_csv/savefig calls do not fail when the
# folder does not exist yet.
os.makedirs(figure_folder, exist_ok=True)

## Import data
### Tumor

In [None]:
# Read the tumor expression matrix from the RDS file. The object is fetched
# from the read_r result under the key None and then transposed.
tumor_df = pyreadr.read_r(
    '../data/Kim/raw/GSE131907_Lung_Cancer_normalized_log2TPM_matrix.rds'
)[None].transpose()

### Cell lines

In [None]:
# Read the tab-separated cell-line CPM table (first column used as index)
# and transpose it.
cell_line_df = pd.read_csv(
    '../data/Kinker/raw/CPM_data.txt',
    index_col=0,
    sep='\t',
).transpose()

# Downscale to have the same "library size" as the tumors.
cell_line_df = cell_line_df.div(100)

### Annotations

In [None]:
# Cell line data
# Load the cell-line metadata; the file is read with a two-row header.
cell_line_annot_df = pd.read_csv(
    '../data/Kinker/raw/Metadata.txt',
    header=[0,1],
    sep='\t'
)
# Keep only the first header level as column names.
cell_line_annot_df.columns = cell_line_annot_df.columns.droplevel(1)
# Restrict to the columns used downstream and give them the same names as the
# tumor annotation table.
cell_line_annot_df = cell_line_annot_df[['NAME', 'Cell_line', 'Cancer_type', 'Pool_ID']]
cell_line_annot_df.columns = ['index', 'sample', 'type', 'pool']
cell_line_annot_df['specimen'] = 'CELL_LINE'

# Coarse label: 'LUNG' for lung cancer cell lines, 'OTHER' for everything else.
# The labels are built in one step instead of writing strings into a boolean
# column through masked assignment, which pandas deprecates as an
# incompatible-dtype setitem.
is_non_lung = cell_line_annot_df['type'] != 'Lung Cancer'
cell_line_annot_df['char_type'] = np.where(is_non_lung, 'OTHER', 'LUNG')

In [None]:
# Tumor data
# Load the per-cell tumor annotation (tab-separated).
tumor_annot_df = pd.read_csv('../data/Kim/raw/GSE131907_Lung_Cancer_cell_annotation.txt', sep='\t')
# 'Sample' is selected twice on purpose: it serves both as the sample name and
# as the pool identifier, mirroring the cell-line annotation columns.
tumor_annot_df = tumor_annot_df[['Index', 'Sample', 'Cell_type', 'Sample']]
tumor_annot_df.columns = ['index', 'sample', 'type', 'pool']
tumor_annot_df['specimen'] = 'TUMOR'

# Coarse label: 'LUNG' for epithelial cells, 'OTHER' for everything else.
# The labels are built in one step instead of writing strings into a boolean
# column through masked assignment, which pandas deprecates as an
# incompatible-dtype setitem.
is_non_epith = tumor_annot_df['type'] != 'Epithelial cells'
tumor_annot_df['char_type'] = np.where(is_non_epith, 'OTHER', 'LUNG')

In [None]:
# Index both annotation tables by cell identifier and reorder their rows to
# follow the corresponding expression matrices.
cell_line_annot_df = cell_line_annot_df.set_index('index')
cell_line_annot_df = cell_line_annot_df.loc[cell_line_df.index]

tumor_annot_df = tumor_annot_df.set_index('index')
tumor_annot_df = tumor_annot_df.loc[tumor_df.index]

# Combined annotation: cell lines first, tumors second.
annot_df = pd.concat([cell_line_annot_df, tumor_annot_df], axis=0)

In [None]:
# Sample-level annotation table, merged on 'CCLE_Name' further down.
ccle_annot_df = pd.read_csv(
    filepath_or_buffer='../data/cell_lines/sample_info.csv'
)

## Scanpy filtering

In [None]:
# Wrap the tumor matrix in an AnnData object and report its dimensions.
tumor_data_an = AnnData(tumor_df)
print('Initial shape: %s samples x %s genes' % tumor_data_an.shape)
gc.collect()

In [None]:
# Wrap the cell-line matrix in an AnnData object and report its dimensions.
cell_line_data_an = AnnData(cell_line_df)
print('Initial shape: %s samples x %s genes' % cell_line_data_an.shape)
gc.collect()
# Apply a log2(1 + x) transform in place to the cell-line data.
scanpy.pp.log1p(cell_line_data_an, base=2)

### Filtering based on number of zeros

In [None]:
# Cells must pass scanpy's min_genes threshold to be kept.
min_genes = 200

# Tumor cells: filter in place, then report the count before and after.
scanpy.pp.filter_cells(tumor_data_an, min_genes=min_genes)
print('Tumor: going from %s cells to %s cells' % (len(tumor_df), tumor_data_an.shape[0]))

# Cell-line cells: same filter, same report.
scanpy.pp.filter_cells(cell_line_data_an, min_genes=min_genes)
print('Cell-lines: going from %s cells to %s cells' % (len(cell_line_df), cell_line_data_an.shape[0]))

In [None]:
# Genes must pass scanpy's min_cells threshold to be kept.
min_cells = 3

# Tumor genes: filter in place, then report the count before and after.
scanpy.pp.filter_genes(tumor_data_an, min_cells=min_cells)
print('Tumor: going from %s genes to %s genes' % (len(tumor_df.columns), tumor_data_an.shape[1]))

# Cell-line genes: same filter, same report.
scanpy.pp.filter_genes(cell_line_data_an, min_cells=min_cells)
print('Cell-lines: going from %s genes to %s genes' % (len(cell_line_df.columns), cell_line_data_an.shape[1]))

### Filtering based on variability

In [None]:
# Number of highly variable genes requested per dataset.
n_top_genes = 3000

# Annotate highly variable genes on each dataset independently
# (tumors first, then cell lines), using the 'seurat' flavor.
for adata in (tumor_data_an, cell_line_data_an):
    scanpy.pp.highly_variable_genes(adata, n_top_genes=n_top_genes, flavor='seurat')

In [None]:
def _scaled_hvg_frame(adata):
    """Return the highly variable genes of `adata` as a standardized DataFrame.

    Columns flagged in ``adata.var['highly_variable']`` are selected from
    ``adata.X`` and centered and scaled to unit variance per gene. Rows keep
    the observation index and columns keep the gene names.
    """
    hvg_mask = adata.var['highly_variable']
    scaler = StandardScaler(with_mean=True, with_std=True)
    return pd.DataFrame(
        scaler.fit_transform(adata.X[:,hvg_mask]),
        columns=adata.var.index[hvg_mask],
        index=adata.obs.index
    )


# Replace the raw matrices by their filtered, standardized counterparts.
tumor_df = _scaled_hvg_frame(tumor_data_an)
cell_line_df = _scaled_hvg_frame(cell_line_data_an)
gc.collect()

## Harmony

In [None]:
# Restrict both datasets to the genes they share and stack them:
# cell lines first, tumors second.
common_genes = np.intersect1d(cell_line_df.columns, tumor_df.columns)
common_data_df = pd.concat([
    cell_line_df[common_genes],
    tumor_df[common_genes]
])

# Look the pool labels up by the *current* row indices of the expression
# matrices. The annotation tables were aligned before the scanpy cell
# filtering, so using their raw values would shift or mis-size the batch
# labels as soon as a single cell is filtered out.
pool_values = np.concatenate([
    cell_line_annot_df.loc[cell_line_df.index, 'pool'].values,
    tumor_annot_df.loc[tumor_df.index, 'pool'].values
]).astype(str)

# One row per cell, in the same order as common_data_df. Building from a dict
# makes pandas raise if the three columns ever differ in length.
metadata = pd.DataFrame({
    'idx': common_data_df.index.values,
    'batch': ['CL'] * cell_line_df.shape[0] + ['T'] * tumor_df.shape[0],
    'pool': pool_values
})

In [None]:
# Metadata column handed to Harmony; also reused in the output file names below.
batch_name = 'pool'
# Run Harmony on the stacked expression matrix, passing the per-cell metadata
# table and the selected metadata column.
ho = hm.run_harmony(common_data_df, metadata, [batch_name])

## UMAP

In [None]:
# UMAP / PCA hyper-parameters (also reused in the figure file names).
metric = 'cosine'
n_neighbors = 15
min_dist = 0.9
n_epochs = 2500
n_pc = 50

# Two-dimensional UMAP embedder.
umap_integrated_clf = umap.UMAP(
    n_components=2,
    n_neighbors=n_neighbors,
    min_dist=min_dist,
    metric=metric,
    n_epochs=n_epochs,
    verbose=5,
)

# Reduce the transposed Harmony output to n_pc principal components,
# then embed those components with UMAP.
pca_scores = PCA(n_components=n_pc).fit_transform(ho.result().T)
umap_integrated_proj = umap_integrated_clf.fit_transform(pca_scores)

In [None]:
# Label the embedding with the cells that were actually fed to Harmony.
# annot_df was built before the scanpy cell filtering, so its index can be
# longer than the embedding; common_data_df has exactly one row per embedded cell.
umap_integrated_proj_df = pd.DataFrame(
    umap_integrated_proj,
    index=common_data_df.index,
    columns=['UMAP 1', 'UMAP 2'])
# Attach the per-cell annotation (sample, type, pool, specimen, char_type).
umap_integrated_proj_df = umap_integrated_proj_df.merge(annot_df, how='left', left_index=True, right_index=True)
# Attach the sample-level annotation by sample name.
# NOTE(review): a column-on-column merge does not keep the left index, so the
# cell identifiers are no longer the index after this step - confirm that the
# saved CSV is not expected to carry them.
umap_integrated_proj_df = umap_integrated_proj_df.merge(ccle_annot_df,
                                                        left_on='sample',
                                                        right_on='CCLE_Name',
                                                        how='left')
# A cell counts as NSCLC if its sample is annotated with the NSCLC lineage
# subtype, or if it is annotated as an epithelial cell.
umap_integrated_proj_df['is_nsclc'] = (umap_integrated_proj_df['lineage_subtype'] == 'NSCLC') | (umap_integrated_proj_df['type'] == 'Epithelial cells')
umap_integrated_proj_df['str'] = np.where(umap_integrated_proj_df['is_nsclc'], 'NSCLC', 'Other')
umap_integrated_proj_df['str'] = umap_integrated_proj_df['specimen'] + ' ' + umap_integrated_proj_df['str']

# Human-readable legend labels; any key not listed falls back to the
# micro-environment label, exactly like the last branch of a chained conditional.
plot_labels = {
    'CELL_LINE NSCLC': 'Cell-line: NSCLC',
    'CELL_LINE Other': 'Cell-line: other',
    'TUMOR NSCLC': 'Tumor: NSCLC',
}
umap_integrated_proj_df['plot_str'] = [
    plot_labels.get(x, 'Tumor: micro-environment')
    for x in umap_integrated_proj_df['str']
]

In [None]:
# Write the annotated UMAP coordinates to a CSV named after the batch variable.
umap_csv_path = '%s/UMAP_df_%s.csv' % (figure_folder, batch_name)
umap_integrated_proj_df.to_csv(umap_csv_path)

In [None]:
# All scatterplot
# Fixed colour per category: red shades for cell lines, blue shades for tumors.
palette = {
    'Cell-line: NSCLC': '#D62728',
    'Cell-line: other': (0.984313725490196, 0.6039215686274509, 0.6),
    'Tumor: NSCLC': (0.12156862745098039, 0.47058823529411764, 0.7058823529411765),
    'Tumor: micro-environment': (0.6509803921568628, 0.807843137254902, 0.8901960784313725)
}

# Two figures: one for the scatterplot, one holding only the legend.
fig = pylab.figure(figsize=(10,10))
figlegend = pylab.figure(figsize=(10,10))
ax = fig.add_subplot(111)

# Rows are shuffled so that no category is systematically drawn on top.
# (Sorting before an unseeded full shuffle has no effect, so it is omitted.)
sns.scatterplot(
    data=umap_integrated_proj_df.sample(frac=1),
    x='UMAP 1', y='UMAP 2', hue='plot_str',
    alpha=0.8, palette=palette, marker='x', ax=ax
)
ax.set_xlabel('UMAP 1', fontsize=25, color='black')
ax.set_ylabel('UMAP 2', fontsize=25, color='black')
ax.tick_params(labelsize=20, labelcolor='black')

# Put the legend on the dedicated figure explicitly instead of relying on
# pyplot's notion of the "current" figure.
figlegend.legend(*ax.get_legend_handles_labels(), loc='center', ncol=1, fontsize=15)
figlegend.tight_layout()
figlegend.savefig('%s/UMAP_%s_neighbors_%s_metrics_%s_mindist_%s_epochs_%s_legend.png'%(
    figure_folder, batch_name, n_neighbors, metric, min_dist, n_epochs
),dpi=300)
# The scatterplot itself is saved without its legend.
ax.get_legend().remove()

fig.tight_layout()
fig.savefig('%s/UMAP_%s_neighbors_%s_metrics_%s_mindist_%s_epochs_%s.png'%(
    figure_folder, batch_name, n_neighbors, metric, min_dist, n_epochs
),dpi=300)

## Tumor analysis

In [None]:
# Zoomed scatterplot
# Tumor cells only, in random order so that no sample is systematically drawn
# on top of the others.
plot_df = umap_integrated_proj_df[umap_integrated_proj_df['specimen'] == 'TUMOR']
plot_df = plot_df.sample(frac=1)

# One panel per plot category, coloured by sample. FacetGrid creates its own
# figure, so no separate plt.figure() call is needed. `height` is the current
# name of the panel-size argument (formerly `size`).
g = sns.FacetGrid(
    plot_df,
    col='plot_str',
    hue='sample',
    palette='colorblind',
    sharex=True,
    sharey=True,
    height=6
)
g.map(
    sns.scatterplot,
    'UMAP 1',
    'UMAP 2',
    alpha=0.5,
    marker='x'
)
g.set_xlabels('UMAP 1', fontsize=20)
g.set_ylabels('UMAP 2', fontsize=20)
g.set_titles(col_template="{col_name}", row_template="", size=25, color='black')
plt.tight_layout()
plt.savefig('%s/UMAP_neighbors_tumors_sample_%s_metrics_%s_mindist_%s_epochs_%s.png'%(
    figure_folder,
    n_neighbors,
    metric,
    min_dist,
    n_epochs),
dpi=300)
plt.show()

# Drop the temporary subset once the figure has been saved.
del plot_df


## Cell lines analysis

In [None]:
# Zoomed scatterplot
# Cell-line cells only, coloured by cell line.
is_cell_line = umap_integrated_proj_df['specimen'] == 'CELL_LINE'

plt.figure(figsize=(10,10))
g = sns.scatterplot(
    data=umap_integrated_proj_df[is_cell_line],
    x='UMAP 1', y='UMAP 2',
    hue='sample', palette='colorblind',
    marker='x', alpha=0.5,
)

# Axis labels and tick labels in black.
plt.xlabel('UMAP 1', fontsize=20, color='black')
plt.ylabel('UMAP 2', fontsize=20, color='black')
plt.xticks(fontsize=15, color='black')
plt.yticks(fontsize=15, color='black')

# Replace the per-sample legend with an empty, frameless one.
plt.legend([],[], frameon=False)
plt.tight_layout()

cell_line_fig_path = '%s/UMAP_neighbors_cell_lines_sample_%s_metrics_%s_mindist_%s_epochs_%s.png'%(
    figure_folder, n_neighbors, metric, min_dist, n_epochs
)
plt.savefig(cell_line_fig_path, dpi=300)
plt.show()