# Kinker & Kim : Analysis of feature attribution results. UMAP and low-dimensional view
This notebook loads a previously computed Sobolev Alignment instance and generates the UMAP plots shown in Figures 4 and 5.

In [None]:
from supporting_scripts.custom_import import *

sys.path.insert(0, '/home/s.mourragui/science/sobolev_alignment/src/sobolev_alignment/')
from sobolev_alignment import SobolevAlignment, KRRApprox
from supporting_scripts.custom_import import *

%config IPCompleter.use_jedi = False

In [None]:
# Folder the rest of the notebook reads model outputs from and writes results to.
output_folder = './output/'
# n_jobs is presumably a parallel worker count -- it is not used in the cells shown here.
n_jobs = 10

# Plotting aesthetics, all keyed by the display label of the two data types.
palette = dict([
    ('CELL LINE', '#D62728'),
    ('TUMOR', (0.12156862745098039, 0.47058823529411764, 0.7058823529411765)),
])
markers = dict([('CELL LINE', 'p'), ('TUMOR', 'X')])
sizes = dict([('CELL LINE', 20), ('TUMOR', 15)])

# Diverging red -> white -> blue colormap, used when colouring cells by a continuous value.
cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
    '',
    ['tab:red', "white", "tab:blue"]
)

In [None]:
from supporting_scripts.make_folders import make_figure_folder

figure_folder = make_figure_folder(output_folder)

# Sub-folder receiving all UMAP figures. exist_ok=True makes the creation
# idempotent without listing the parent directory first, so re-running the
# cell never fails because the folder is already there.
umap_plot_figure = '%s/UMAP_plots/'%(figure_folder)
os.makedirs(umap_plot_figure, exist_ok=True)

os.makedirs('%s/linear_terms'%(output_folder), exist_ok=True)

# Largest integer found among the entries of output_folder whose name contains
# 'iter' (the first run of digits of each such name is used).
n_iter = np.max([int(re.findall(r'[0-9]+', e)[0]) for e in os.listdir(output_folder) if 'iter' in e])

## Data import
### scRNA-seq data used for Sobolev Alignment

In [None]:
from supporting_scripts.read_data import read_data

# read_data hands back the two expression datasets, their annotation tables,
# the gene names and the barcodes (requested through return_barcodes=True).
(
    X_source,
    X_target,
    cell_line_annot_df,
    tumor_annot_df,
    gene_names,
    barcodes_df,
) = read_data(return_barcodes=True)

# Same two datasets, addressable by the 'source' / 'target' keys used below.
X_input = dict(source=X_source, target=X_target)

### CCLE and cell lines general info

In [None]:
# Locations of the cell-line metadata and of the CCLE sample sheet.
cell_line_metadata_file = '../data/Kinker/raw/Metadata.txt'
ccle_annot_file = '../data/cell_lines/sample_info.csv'

# The metadata file has two header rows; only the first level is kept as
# column names before joining on the cell barcode.
cell_line_inter_df = pd.read_csv(cell_line_metadata_file, sep='\t', header=[0,1])
cell_line_inter_df.columns = cell_line_inter_df.columns.get_level_values(0)
cell_line_annot_df = pd.merge(
    cell_line_annot_df,
    cell_line_inter_df,
    how='left',
    left_on='barcode',
    right_on='NAME'
)
del cell_line_inter_df

# Attach the CCLE annotations by matching 'sample' with 'CCLE_Name'.
ccle_annot_df = pd.read_csv(ccle_annot_file)
cell_line_annot_df = pd.merge(
    cell_line_annot_df,
    ccle_annot_df,
    how='left',
    left_on='sample',
    right_on='CCLE_Name'
)

### Tumor annotations

In [None]:
# Per-cell annotation table of the tumor dataset, left-joined on 'Index'.
tumor_annot_inter_df = pd.read_csv(
    '../data/Kim/raw/GSE131907_Lung_Cancer_cell_annotation.txt',
    sep='\t'
)
tumor_annot_df = pd.merge(
    tumor_annot_df,
    tumor_annot_inter_df,
    how='left',
    left_on='Index',
    right_on='Index'
)
del tumor_annot_inter_df

# Sanity check: the merged table must contain exactly one distinct 'Cell_type'.
assert len(np.unique(tumor_annot_df['Cell_type'])) == 1

# Expose the cell identifier under the 'barcode' name as well.
tumor_annot_df['barcode'] = tumor_annot_df['Index']

In [None]:
# Sample-level summary sheet of the tumor dataset.
# The path is rooted at '../data/' like every other input of this notebook
# (the sibling annotation file is read from the same '../data/Kim/raw/' folder).
patient_info = pd.read_excel('../data/Kim/raw/GSE131907_Lung_Cancer_Feature_Summary.xlsx', index_col=0)

# The row at position 1 holds the real column names; the data starts at position 2.
patient_info.columns = patient_info.iloc[1].values
# .copy() detaches the slice so that adding a column below writes to an
# independent frame instead of a view (avoids pandas' SettingWithCopyWarning).
patient_info = patient_info.iloc[2:].copy()

# Sample identifier with underscores replaced by dashes.
patient_info['sample'] = patient_info['Samples'].str.replace('_', '-')
tumor_annot_df = tumor_annot_df.merge(patient_info, left_on='Sample', right_on='Samples', how='left')

### Aggregate annotations

In [None]:
# Annotation tables addressable by dataset key.
annot_df = dict(source=cell_line_annot_df, target=tumor_annot_df)

# Stack the per-cell metadata of both datasets: source rows first, target rows after.
combined_annot_df = pd.concat([X_source.obs, X_target.obs])
combined_annot_df['type'] = (
    ['CELL_LINE' for _ in range(X_source.shape[0])]
    + ['TUMOR' for _ in range(X_target.shape[0])]
)

# For tumor cells only, append '_<sample>' to the UMI.
tumor_mask = combined_annot_df['type'] == 'TUMOR'
combined_annot_df.loc[tumor_mask, 'UMI'] = (
    combined_annot_df.loc[tumor_mask, 'UMI']
    + '_'
    + combined_annot_df.loc[tumor_mask, 'sample']
)
del tumor_mask

## Import Sobolev Alignment

In [None]:
iter_idx = 0
sobolev_alignment_clf = {}

# One saved model per kernel type; kernels without a saved run are skipped.
for kernel_type in ('laplacian', 'gaussian'):
    if 'iter_%s_nu_%s'%(iter_idx, kernel_type) in os.listdir(output_folder):
        # Load the KRR approximations only (with_krr=True, with_model=False).
        loaded_clf = SobolevAlignment.load(
            '%s/iter_%s_nu_%s/sobolev_alignment_model'%(output_folder, iter_idx, kernel_type),
            with_krr=True,
            with_model=False
        )

        # Re-attach the data and flag the KRR input as log-transformed.
        loaded_clf.training_data = dict(source=X_source, target=X_target)
        loaded_clf.krr_log_input_ = True

        sobolev_alignment_clf[kernel_type] = loaded_clf

## Embedding

In [None]:
# Saved scVI embeddings (space-separated, no header row), one table per dataset.
latent_embedding = {}
for x in ['source', 'target']:
    latent_embedding[x] = pd.read_csv(
        '%s/scvi_embedding_%s.csv'%(output_folder, x),
        header=None,
        sep=' '
    )

## Check error

In [None]:
# Compare, for each dataset, the KRR approximation with the saved latent
# embedding: one Spearman correlation per latent dimension, shown as a
# distribution per dataset.
plt.figure(figsize=(8,3))
prediction_latent_corr = {}
for data_source in ['source', 'target']:
    print('START %s'%(data_source))

    # KRR prediction on the log10(x + 1) data after Frobenius normalisation.
    krr_pred = sobolev_alignment_clf['laplacian'].approximate_krr_regressions_[data_source].transform(
        torch.Tensor(
            sobolev_alignment_clf['laplacian']._frobenius_normalisation(
                data_source,
                np.log10(X_input[data_source].X + 1),
                frob_norm_source=True
            )
        )
    )

    # Column x of the prediction is compared with column x of the embedding.
    prediction_latent_corr[data_source] = [
        scipy.stats.spearmanr(krr_pred[:,x], latent_embedding[data_source][x])[0]
        for x in range(krr_pred.shape[1])
    ]

    # NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
    # kept here so the rendered figure stays unchanged.
    sns.distplot(
        prediction_latent_corr[data_source],
        label='CELL LINE' if data_source == 'source' else 'TUMOR',
        kde_kws={"lw": 3}
    )

plt.xlabel('Spearman correlation between\n KRR and scVI', fontsize=20, color='black')
plt.xticks(fontsize=15)
plt.ylabel('Proportion', fontsize=20, color='black')
plt.yticks([], [])
plt.legend(fontsize=15)
plt.tight_layout()
plt.savefig('%s/hist_spearman_corr_reconstruction_latent.png'%(figure_folder))
plt.show()

### Null model

In [None]:
# Every entry of the output folder whose name contains 'null'.
null_model_files = list(filter(lambda e: 'null' in e, os.listdir(output_folder)))
print('AVAILABLE FILES:\n%s'%('\n'.join(null_model_files)))

In [None]:
# Null-model file to read and kernel whose principal angles are plotted against it.
null_model_file = 'null_model_laplacian_perm_100.csv'
kernel_type = 'laplacian'

plt.figure(figsize=(8,4))
null_model_df = pd.read_csv('%s/%s'%(output_folder, null_model_file), index_col=0)

# Threshold: largest value found in the first row of the null-model table.
null_threshold = np.max(null_model_df.values[0])

plt.plot(
    sobolev_alignment_clf[kernel_type].principal_angles,
    marker='s',
    markersize=10
)
plt.axhline(null_threshold, linestyle='--', color='grey', linewidth=3)
plt.xlabel('SPV number', fontsize=20, color='black')
plt.ylabel('SPV similarity', fontsize=20, color='black')

# One tick per row of the null-model table, labelled starting from 1.
tick_positions = np.arange(null_model_df.shape[0], step=1)
plt.xticks(tick_positions, tick_positions + 1, fontsize=15, color='black')
plt.yticks(fontsize=15, color='black')
plt.tight_layout()
plt.savefig('%s/PV_similarity_null_model_%s.png'%(figure_folder, kernel_type), dpi=300)
plt.show()

# Number of principal angles strictly above the null threshold.
print('%s shared latent factors'%(
    np.sum(sobolev_alignment_clf[kernel_type].principal_angles > null_threshold)
))
del kernel_type, null_threshold, tick_positions

In [None]:
# Principal angles of every model saved in a sub-directory of GSEA_null;
# entries that are not directories are ignored.
sample_permut_principal_angles = [
    SobolevAlignment.load(
        '%s/GSEA_null/%s'%(output_folder, gsea_subfolder),
        with_krr=True,
        with_model=False
    ).principal_angles
    for gsea_subfolder in os.listdir('%s/GSEA_null/'%(output_folder))
    if os.path.isdir('%s/GSEA_null/%s'%(output_folder, gsea_subfolder))
]

## Interpolation

In [None]:
# Consensus features from the laplacian-kernel model, using 12 similar PVs.
interpolated_proj_df = sobolev_alignment_clf['laplacian'].compute_consensus_features(
    X_input,
    n_similar_pv=12
)

# Label every row with the combined annotations, then move them back to columns.
interpolated_proj_df.index = pd.MultiIndex.from_frame(combined_annot_df)
interpolated_proj_df = interpolated_proj_df.reset_index()

# Switch to the display label used as key in the plotting dictionaries.
interpolated_proj_df['type'] = (
    interpolated_proj_df['type'].str.replace('CELL_LINE', 'CELL LINE')
)

## Projection on interpolated joint

In [None]:
# Number of leading PV columns (0 .. n_similar_pv-1) exported, embedded and plotted below.
n_similar_pv = 12

In [None]:
# Export the first n_similar_pv PV columns, with the four annotation columns as index.
interpolated_output_file = '%s/interpolated_projected_data.csv'%(figure_folder)
annotated_proj_df = interpolated_proj_df.set_index(['UMI', 'sample', 'pool', 'type'])
annotated_proj_df[range(n_similar_pv)].to_csv(interpolated_output_file)

In [None]:
# UMAP hyper-parameters for the (uncorrected) interpolated projection.
metric = 'cosine'
n_neighbors = 15
min_dist = 0.2
n_epochs = 5000

# No random_state is passed, so the layout can differ between runs.
umap_interpolated_pv_clf = umap.UMAP(
    n_components=2,
    n_neighbors=n_neighbors,
    min_dist=min_dist,
    metric=metric,
    init='spectral',
    learning_rate=0.2,
    n_epochs=n_epochs,
    verbose=5
)

# Fit on the first n_similar_pv PV columns and wrap the 2D coordinates.
umap_proj_df = pd.DataFrame(
    umap_interpolated_pv_clf.fit_transform(
        interpolated_proj_df[range(n_similar_pv)].values
    ),
    columns=['UMAP 1', 'UMAP 2']
)

# Discard coordinates left by a previous run of this cell, then append the new ones.
interpolated_proj_df = interpolated_proj_df.drop(
    columns=['UMAP 1', 'UMAP 2'],
    errors='ignore'
)
interpolated_proj_df = pd.concat(
    [
        interpolated_proj_df.reset_index(drop=True),
        umap_proj_df.reset_index(drop=True)
    ],
    axis=1
)


In [None]:
# Scatter plot of the UMAP embedding (cell lines vs tumors); the legend is
# rendered on its own figure and saved to a separate file.
# Only the two figures that are actually drawn on are created here.
fig = pylab.figure(figsize=(10,10))
figlegend = pylab.figure(figsize=(10,10))
ax = fig.add_subplot(111)

# Rows are shuffled so that neither data type is systematically drawn on top.
sns.scatterplot(
    data=interpolated_proj_df.sample(interpolated_proj_df.shape[0]),
    x='UMAP 1', y='UMAP 2',
    hue='type', style='type', size='type',
    alpha=0.9,
    palette=palette, sizes=sizes, markers=markers,
    ax=ax
)

ax.set_xlabel('UMAP 1', fontsize=20, color='black')
ax.set_ylabel('UMAP 2', fontsize=20, color='black')
ax.tick_params(axis='both', labelsize=15, color='black')

# Attach the legend to the legend figure explicitly instead of relying on
# whichever figure happens to be the current one.
figlegend.legend(*ax.get_legend_handles_labels(), loc='upper left', ncol=3, fontsize=15)
figlegend.tight_layout()
# The legend is stored next to its plot, in the UMAP figure folder.
figlegend.savefig(
    '%s/UMAP_combined_interpolated_data_top_PV_embedding_metric_%s_neighbors_%s_mindist_%s_legend.png'%(
        umap_plot_figure, metric, n_neighbors, min_dist
    ),
    dpi=300
)
ax.get_legend().remove()

fig.tight_layout()
fig.savefig(
    '%s/UMAP_combined_interpolated_data_top_PV_embedding_metric_%s_neighbors_%s_mindist_%s.png'%(
        umap_plot_figure, metric, n_neighbors, min_dist
    ),
    dpi=300
)

## Interpolation
### MNN correction

In [None]:
# Write the PV columns to disk so that the R cell can read them back.
r_input_columns = range(n_similar_pv)
r_input_df = interpolated_proj_df.set_index(['UMI', 'sample', 'pool', 'type'])[r_input_columns]
r_input_df.to_csv('%s/interpolated_projected_data.csv'%(figure_folder))

In [None]:
# Load the R packages needed by the snippet below (each one only once).
for r_package in ['batchelor', 'scater', 'wordspace', 'dplyr', 'uwot']:
    importr(r_package)

# Hand the Python-side figure folder to R so that R reads and writes exactly
# the files that the surrounding Python cells write and read. A trailing '/'
# is appended because the R code concatenates folder and file name directly.
robjects.globalenv['figure_folder'] = robjects.StrVector(['%s/'%(figure_folder)])

robjects.r('''
    combined_interpolated_data <- read.csv(paste(figure_folder, "interpolated_projected_data.csv", sep=""))

    # Restrict data
    cell_line_interpolated_data <- combined_interpolated_data[combined_interpolated_data$type == "CELL LINE",]
    tumor_interpolated_data <- combined_interpolated_data[combined_interpolated_data$type == "TUMOR",]

    # Format: drop the four annotation columns and put cells in columns
    cell_line_interpolated_data <- t(subset(cell_line_interpolated_data, select=-c(1,2,3,4)))
    tumor_interpolated_data <- t(subset(tumor_interpolated_data, select=-c(1,2,3,4)))
    
    # Correct with MNN
    interpolated_corrected.signal<- mnnCorrect(
        cell_line_interpolated_data,
        tumor_interpolated_data,
        cos.norm.in = TRUE, 
        cos.norm.out = TRUE
    )
    
    # Save
    write.csv(
        interpolated_corrected.signal@assays@data$corrected,
        paste(figure_folder, "interpolated_projected_mnn_corrected_data.csv", sep="")
    )
''')

In [None]:
# Read the MNN-corrected matrix written by the R cell and transpose it.
MNN_corrected_interpolation = pd.read_csv(
    '%s/interpolated_projected_mnn_corrected_data.csv'%(figure_folder),
    index_col=0
).T

# Re-label the rows from interpolated_proj_df (this relies on both tables
# listing the cells in the same order), then swap index levels 1 and 2.
MNN_corrected_interpolation.index = pd.MultiIndex.from_frame(
    interpolated_proj_df[['type', 'UMI', 'sample', 'pool']]
).swaplevel(1,2)

### UMAP

In [None]:
# UMAP hyper-parameters for the MNN-corrected projection.
metric = 'cosine'
n_neighbors = 20
min_dist = 0.15
n_epochs = 5000

umap_mnn_interpolated_pv_clf = umap.UMAP(
    verbose=5, 
    n_neighbors=n_neighbors,
    metric=metric,
    min_dist=min_dist, 
    n_components=2,
    learning_rate=2.,
    init='spectral',
    n_epochs=n_epochs
)

# Fit the embedding, then carry the annotation index over as regular columns.
umap_mnn_interpolated_proj_df = umap_mnn_interpolated_pv_clf.fit_transform(
    MNN_corrected_interpolation
)
umap_mnn_interpolated_proj_df = pd.DataFrame(umap_mnn_interpolated_proj_df, columns=['UMAP 1', 'UMAP 2'])
umap_mnn_interpolated_proj_df.index = MNN_corrected_interpolation.index
umap_mnn_interpolated_proj_df = umap_mnn_interpolated_proj_df.reset_index()

# Persist the fitted UMAP object. The context manager guarantees that the
# file handle is flushed and closed even if serialisation fails.
with open('%s/UMAP_combined_data_interpolated_embedding_metric_%s_neighbors_%s_mindist_%s.pkl'%(
    umap_plot_figure, metric, n_neighbors, min_dist
), 'wb') as umap_model_file:
    dump(umap_mnn_interpolated_pv_clf, umap_model_file)

### Cell lines vs Tumors

In [None]:
# Scatter plot of the MNN-corrected UMAP embedding (cell lines vs tumors);
# the legend is rendered on its own figure and saved to a separate file.
# Only the two figures that are actually drawn on are created here.
fig = pylab.figure(figsize=(10,10))
figlegend = pylab.figure(figsize=(10,10))
ax = fig.add_subplot(111)

# Rows are shuffled so that neither data type is systematically drawn on top.
sns.scatterplot(
    data=umap_mnn_interpolated_proj_df.sample(umap_mnn_interpolated_proj_df.shape[0]), 
    x='UMAP 1', y='UMAP 2',
    hue='type', style='type', size='type',
    palette=palette, markers=markers, sizes=sizes,
    alpha=0.9, ax=ax
)

ax.set_xlabel('UMAP 1', fontsize=30, color='black')
ax.set_ylabel('UMAP 2', fontsize=30, color='black')
ax.tick_params(axis='both', labelsize=20, color='black')

# Attach the legend to the legend figure explicitly instead of relying on
# whichever figure happens to be the current one.
figlegend.legend(*ax.get_legend_handles_labels(), loc='upper left', ncol=1, fontsize=15)
figlegend.tight_layout()
figlegend.savefig(
    '%s/UMAP_combined_data_interpolated_embedding_metric_%s_neighbors_%s_mindist_%s_legend.png'%(
        umap_plot_figure, metric, n_neighbors, min_dist
    ), dpi=300
)
ax.get_legend().remove()

fig.tight_layout()
fig.savefig(
    '%s/UMAP_combined_data_interpolated_embedding_metric_%s_neighbors_%s_mindist_%s.png'%(
        umap_plot_figure, metric, n_neighbors, min_dist
    ), dpi=300
)

### Colored by samples

In [None]:
# Values shared by the two output file names of this cell.
umap_run_tag = (umap_plot_figure, metric, n_neighbors, min_dist)

# One panel per data type, points coloured by sample (rows shuffled first).
shuffled_umap_df = umap_mnn_interpolated_proj_df.sample(umap_mnn_interpolated_proj_df.shape[0])
g = sns.relplot(
    data=shuffled_umap_df,
    x='UMAP 1', y='UMAP 2',
    col='type', hue='sample', style='type', size='type',
    markers=markers, sizes=sizes,
    height=7, legend='brief'
)

plt.savefig(
    '%s/UMAP_combined_data_interpolated_embedding__sample_metric_%s_neighbors_%s_mindist_%s.png'%umap_run_tag,
    dpi=300
)

plt.show()

# Save legend: draw the handles of the first panel on an otherwise empty figure.
plt.figure(figsize=(10,10))
plt.grid(False)
plt.axis('off')
plt.xticks([])
plt.yticks([])
legend_handles, legend_labels = g.axes[0][0].get_legend_handles_labels()
plt.legend(legend_handles, legend_labels, loc=0, ncol=3, fontsize=15)
plt.tight_layout()
plt.savefig(
    '%s/UMAP_combined_data_interpolated_embedding__sample_metric_%s_neighbors_%s_mindist_%s_legend.png'%umap_run_tag,
    dpi=300
)

## Colored by PV value

In [None]:
# Attach the PV values to the MNN UMAP coordinates. Integer PV columns are
# renamed 'PV <i>'; columns present in both tables get the '_MNN' / '_raw' suffix.
pv_column_names = {e: 'PV %s'%(e) for e in range(n_similar_pv)}
global_mnn_interpolated_proj_df = pd.merge(
    umap_mnn_interpolated_proj_df,
    interpolated_proj_df.rename(columns=pv_column_names),
    how='left',
    on=['UMI', 'sample', 'pool', 'type'],
    suffixes=('_MNN', '_raw')
)

In [None]:
# PV_view_sizes = {'CELL LINE': 30, 'TUMOR': 20}

# One UMAP plot per PV, coloured by the PV value, with the legend saved separately.
# NOTE(review): the 'PV <i>_clipped' columns used for the hue are not created
# in the cells shown in this notebook -- make sure they exist before running.
for PV_number in range(n_similar_pv):
    fig = pylab.figure(figsize=(10,10))
    figlegend = pylab.figure(figsize=(10,10))
    ax = fig.add_subplot(111)

    # Rows are shuffled so that neither data type is systematically drawn on top.
    sns.scatterplot(
        data=global_mnn_interpolated_proj_df.sample(global_mnn_interpolated_proj_df.shape[0]),
        x='UMAP 1_MNN', y='UMAP 2_MNN',
        hue='PV %s_clipped'%(PV_number), style='type', size='type',
        markers=markers, sizes=sizes,
        palette=cmap, ax=ax
    )

    ax.set_xlabel('UMAP 1', fontsize=30, color='black')
    ax.set_ylabel('UMAP 2', fontsize=30, color='black')
    ax.tick_params(axis='both', labelsize=20, color='black')

    # Draw the legend on the legend figure itself. A figure-level call on the
    # "current" figure would target whichever figure was created last, leaving
    # the saved legend file empty.
    figlegend.legend(*ax.get_legend_handles_labels(), loc='upper left', ncol=3, fontsize=15)
    figlegend.tight_layout()
    figlegend.savefig(
        '%s/UMAP_combined_data_interpolated_embedding_PV_%s_metric_%s_neighbors_%s_mindist_%s_legend.png'%(
            umap_plot_figure, PV_number, metric, n_neighbors, min_dist
        ),
        dpi=300
    )
    ax.get_legend().remove()

    fig.tight_layout()
    fig.savefig(
        '%s/UMAP_combined_data_interpolated_embedding_PV_%s_metric_%s_neighbors_%s_mindist_%s.png'%(
            umap_plot_figure, PV_number, metric, n_neighbors, min_dist
        ), 
        dpi=300
    )
    plt.show()

    # Release both figures so that they do not pile up across iterations.
    plt.close(fig)
    plt.close(figlegend)