# Seurat alignment of the whole dataset: cell lines (Kinker et al.) and tumors (Kim et al.)

In [None]:
library(dplyr)
library(Seurat)

## Import data
### Tumor
Import all tumors, then subsample the dataset for prototyping.

In [None]:
# Load the tumor expression matrix from its RDS file
# (normalized log2 TPM, going by the file name).
tumor_file <- "../data/Kim/raw/GSE131907_Lung_Cancer_normalized_log2TPM_matrix.rds"
tumor_data_df <- readRDS(file = tumor_file)

In [None]:
# Randomly subsample tumor columns so the rest of the notebook stays
# tractable while prototyping.
n_tumor_subsamples <- 50000
tumor_cell_ids <- colnames(tumor_data_df)

# sample() without replacement errors when more items are requested than
# exist, so cap the request at the number of available columns.
n_tumor_subsamples <- min(n_tumor_subsamples, length(tumor_cell_ids))
subsampled_col <- sample(tumor_cell_ids, n_tumor_subsamples)

# Save the drawn column names so this exact subsample can be recovered later.
# write.csv() does not create missing directories, so make sure it exists.
dir.create("./output/seurat", recursive = TRUE, showWarnings = FALSE)
write.csv(subsampled_col, './output/seurat/subsampled_tumor_samples.csv')

# Explicit column indexing works for both data.frames and matrices;
# drop = FALSE keeps the two-dimensional shape even for a single column.
subsampled_tumor_data_df <- tumor_data_df[, subsampled_col, drop = FALSE]

# Drop the reference to the full table and trigger garbage collection to
# release its memory before the next steps.
tumor_data_df <- NULL
gc()

### Cell-lines

In [None]:
# Read the tab-separated cell-line expression table (CPM, per the file name).
# NOTE(review): the previous comment here read "UNQUOTE FOR TMP"; its intent
# is unclear -- confirm whether an alternative input was meant.
cell_line_file <- "../data/Kinker/raw/CPM_data.txt"
cell_line_data_df <- read.delim(cell_line_file)

# Index rows by the GENE column, then drop the first column
# (presumably GENE itself -- confirm) so only expression columns remain.
rownames(cell_line_data_df) <- cell_line_data_df$GENE
cell_line_data_df <- cell_line_data_df[-1]

# Replace every "." in the column names with "-". The reader sanitizes
# headers by default (check.names = TRUE), which presumably turned "-" into
# "." -- confirm against the raw header.
colnames(cell_line_data_df) <- gsub(
    ".", "-", colnames(cell_line_data_df),
    fixed = TRUE
)

In [None]:
# Scale the cell-line values down by a factor of 100 (intended to bring them
# to the scale of the tumor data), then apply a log2(x + 1) transform.
cell_line_data_df <- log2(cell_line_data_df / 100 + 1)

In [None]:
# Load the tab-separated cell-line metadata table.
cell_line_pool <- read.delim("../data/Kinker/raw/Metadata.txt")

# Discard the first row (presumably a non-data descriptor row -- confirm),
# then index the remaining rows by the NAME column.
cell_line_pool <- cell_line_pool[-1, ]
rownames(cell_line_pool) <- cell_line_pool$NAME

# Align the metadata rows with the columns of the expression table.
cell_line_ids <- colnames(cell_line_data_df)
cell_line_pool <- cell_line_pool[cell_line_ids, ]

# Record which cell-line columns went into the analysis.
write.csv(cell_line_ids, "./output/seurat/subsampled_cell_lines_samples.csv")

### Create Seurat

In [None]:
# Row names (genes) present in both the cell-line and the tumor tables.
common_genes <- intersect(
    rownames(cell_line_data_df),
    rownames(subsampled_tumor_data_df)
)

In [None]:
# Build one Seurat object per dataset, restricted to the shared genes.
# min.cells / min.features apply Seurat's filtering thresholds at creation.
cell_line_SC <- CreateSeuratObject(
    counts = cell_line_data_df[common_genes, ],
    meta.data = cell_line_pool,
    min.cells = 3,
    min.features = 200
)
tumor_SC <- CreateSeuratObject(
    counts = subsampled_tumor_data_df[common_genes, ],
    min.cells = 3,
    min.features = 200
)

# Order matters downstream: cell lines first, tumors second.
int_list <- list(cell_line_SC, tumor_SC)

In [None]:
# Normalize each dataset independently and pick its 2000 most variable
# features with the "vst" method.
# NOTE(review): both inputs appear to be log2-scaled already (log2 transform
# applied to the cell lines; "log2TPM" in the tumor file name) -- confirm that
# running NormalizeData() on top of that is intended.
int_list <- lapply(int_list, function(seurat_obj) {
    seurat_obj %>%
        NormalizeData() %>%
        FindVariableFeatures(selection.method = "vst", nfeatures = 2000)
})

## Seurat correction

In [None]:
# Choose the features used for integration across both datasets.
features <- SelectIntegrationFeatures(object.list = int_list)

# Scale and run PCA on each dataset separately, restricted to those features;
# the per-dataset PCAs are what the "rpca" anchor search relies on.
int_list <- lapply(int_list, function(seurat_obj) {
    scaled_obj <- ScaleData(seurat_obj, features = features, verbose = FALSE)
    RunPCA(scaled_obj, features = features, verbose = FALSE)
})

In [None]:
# Find integration anchors between the datasets using reciprocal PCA,
# then use them to build the integrated object.
combined.anchors <- FindIntegrationAnchors(
    object.list = int_list,
    anchor.features = features,
    reduction = "rpca"
)
combined.combined <- IntegrateData(anchorset = combined.anchors)

In [None]:
# Work on the integrated assay from here on.
DefaultAssay(combined.combined) <- "integrated"

# Scale the integrated data (first step of the standard workflow for
# visualization and clustering).
combined.combined <- ScaleData(combined.combined, verbose = FALSE)

# Export the integrated values and their scaled counterpart.
# The assay's data slot may be a sparse Matrix object, which write.csv()
# cannot coerce to a data.frame; as.matrix() densifies it first and is a
# no-op when the slot already holds a base matrix.
write.csv(
    as.matrix(combined.combined@assays$integrated@data),
    './output/seurat/whole_integrated.csv'
)
write.csv(
    as.matrix(combined.combined@assays$integrated@scale.data),
    './output/seurat/whole_integrated_scaled.csv'
)

## Plot

In [None]:
# Dimensionality reduction and clustering on the integrated object:
# 30-component PCA, then UMAP, neighbor graph and clustering, all computed
# on PCA dimensions 1-30.
combined.combined <- combined.combined %>%
    RunPCA(npcs = 30, verbose = FALSE) %>%
    RunUMAP(reduction = "pca", dims = 1:30) %>%
    FindNeighbors(reduction = "pca", dims = 1:30) %>%
    FindClusters(resolution = 0.5)

In [None]:
# Build the UMAP plot of the integrated object.
p1 <- DimPlot(object = combined.combined, reduction = "umap")

# Save the data frame behind the plot so the figure can be redrawn elsewhere.
write.csv(p1$data, file = './output/seurat/whole_UMAP.csv')

# Display the plot in the notebook.
p1