In [None]:
library(GenomicRanges)
library(TCGAbiolinks)
library(survminer)
library(survival)
library(SummarizedExperiment)
library(tidyverse)
library(DESeq2)

In [None]:
# Build a genes x samples matrix of tpm_unstranded values from the per-sample
# STAR gene-count TSV files found in one folder.

# Folder that holds the per-sample TSV files
folder_path <- "/scratch/project/stseq/Onkar/BigData/Melanoma_TCGA_bulk_GE/"

# Full paths of every TSV file in the folder
files <- list.files(path = folder_path, pattern = "\\.tsv$", full.names = TRUE)
if (length(files) == 0) {
  # Without this guard do.call(cbind, list()) would quietly return NULL.
  stop("No .tsv files found in ", folder_path, call. = FALSE)
}

# One vector per file: the tpm_unstranded column, named by gene_name
df_list <- lapply(files, function(file) {
  df <- read.table(file, header = TRUE, sep = "\t")
  setNames(df$tpm_unstranded, df$gene_name)
})

# Column label = file name without the STAR suffix. The dots are escaped so
# they match literal dots rather than any character.
names(df_list) <- sub(
  "\\.rna_seq\\.augmented_star_gene_counts\\.tsv$",
  "",
  basename(files)
)

# cbind() aligns the vectors by position only, so every file must list the
# genes in the same order; fail loudly instead of silently misaligning rows.
reference_genes <- names(df_list[[1]])
same_gene_order <- vapply(
  df_list,
  function(tpm) identical(names(tpm), reference_genes),
  logical(1)
)
if (!all(same_gene_order)) {
  stop(
    "Gene order differs from the first file in: ",
    paste(names(df_list)[!same_gene_order], collapse = ", "),
    call. = FALSE
  )
}

# Bind the per-sample vectors column-wise; row names come from gene_name
combined_df <- do.call(cbind, df_list)

# Preview the combined matrix
head(combined_df)



In [None]:
# Discard the first four rows of the combined matrix and preview what is left.
# NOTE(review): presumably these are the non-gene summary rows at the top of
# each STAR counts file - confirm. Re-running this cell removes four more rows.
combined_df <- combined_df[-seq_len(4), ]
head(combined_df)


In [None]:
library(dplyr)
library(survminer)
library(survival)

In [None]:
# Read the clinical table and align it with the columns of the expression matrix.
# NOTE(review): the file is read with header=TRUE, yet the lines below use the
# columns `V1`, `ID` and `deceased` - confirm that these exist in this file
# (`V1` is the name R assigns to the first column of headerless input).
coldata<-read.csv("TCGA/Mel/Clinical_data_mel.txt",sep="\t",header=TRUE)
# Reorder the clinical rows to follow the column order of combined_df; columns
# without a match in coldata$V1 produce all-NA rows.
coldata_reordered <- as.data.frame(coldata[match(colnames(combined_df), coldata$V1), ])
#names(coldata_reordered)<-c("ID","deceased","submitter_id","overall_survival")
# Drop the last row of the reordered table.
coldata_reordered<-coldata_reordered[-(length(rownames(coldata_reordered))),]
# Use the ID column as row names.
rownames(coldata_reordered)<-coldata_reordered$ID
# Keep (and order) the expression columns named after the clinical rows.
combined_df<-combined_df[,rownames(coldata_reordered)]
# Replace every value that is exactly 0 with 1.
# NOTE(review): the reason is not visible here - confirm this is intended,
# since it alters the expression values used downstream.
combined_df[combined_df == 0] <- 1
# Recode `deceased` as logical: TRUE where the value is "dead", FALSE otherwise
# (NA stays NA).
coldata_reordered$deceased <- ifelse(coldata_reordered$deceased == "dead", TRUE, FALSE)


In [None]:
# Load the tab-separated clinical table and list its columns.
coldata <- read.delim("TCGA/Mel/Clinical_data_mel.txt", sep = "\t", header = TRUE)
colnames(coldata)

In [None]:
# Expression column names that also occur as clinical submitter ids
intersect(x = colnames(combined_df), y = coldata$submitter_id)

In [None]:
# First few expression column names
head(colnames(combined_df), n = 6L)

In [None]:
# Switch the working directory; every relative path used after this call is
# resolved against it.
# NOTE(review): later cells write PDFs to "Skin_Melanoma_TCGA/..." relative
# paths - confirm those still resolve as intended once this has run.
setwd("Skin_Melanoma_TCGA/")

In [None]:
# Restrict the clinical table to the TCGA-SKCM project (uses its `project` column).
library(dplyr)

# Rows whose project is exactly "TCGA-SKCM"
coldata_TCGASKCM <- dplyr::filter(coldata, project == "TCGA-SKCM")

# Preview the subset
head(coldata_TCGASKCM)


In [None]:
# Query the GDC through TCGAbiolinks for TCGA-SKCM transcriptome files.

# All GDC projects, then the summary of the melanoma project
gdcprojects <- getGDCprojects()
getProjectSummary("TCGA-SKCM")

# Broad query: every Transcriptome Profiling file of the project
query_TCGA <- GDCquery(
  project = "TCGA-SKCM",
  data.category = "Transcriptome Profiling"
)
output_query_TCGA <- getResults(query_TCGA)

# Narrow query (replaces the broad one): open-access STAR gene expression
# quantification from RNA-Seq
query_TCGA <- GDCquery(
  project = "TCGA-SKCM",
  data.category = "Transcriptome Profiling",
  data.type = "Gene Expression Quantification",
  experimental.strategy = "RNA-Seq",
  workflow.type = "STAR - Counts",
  access = "open"
)

# Show the files returned by the narrow query
getResults(query_TCGA)

In [None]:
# Keep the result table of the GDC query as the file-level metadata.
query_TCGA_meta<-getResults(query_TCGA)

In [None]:
# Compare the column names of the clinical table and of the query metadata.
names(coldata_TCGASKCM)
names(query_TCGA_meta)

In [None]:
# Print the first row of the clinical table (no merge is performed here).
print(coldata_TCGASKCM[1,])


In [None]:
#grep("TCGA-EE-A2GU", coldata_TCGASKCM$submitter_id, value = TRUE)
#grep("TCGA-EE-A2GU", query_TCGA_meta$cases.submitter_id, value = TRUE)

# grep() is given the whole data frame here, so the returned indices refer to
# columns (each column is coerced to one string), not rows - presumably used
# to find which column holds the barcode.
grep("TCGA-EE-A2GU", coldata_TCGASKCM, value = FALSE)
grep("TCGA-FS-A1ZZ", query_TCGA_meta, value = FALSE)


In [None]:
# Use grep to find the row indices of "TCGA-EE-A2GU" in the 'cases.submitter_id' column
matching_rows <- grep("TCGA-EE-A2GU", query_TCGA_meta$cases.submitter_id, value = FALSE)

# Print the row numbers where it was found
matching_rows


In [None]:
# NOTE(review): `diagnosis_id` is not defined anywhere in the visible code;
# evaluating it raises "object not found" unless it is created elsewhere.
diagnosis_id

In [None]:
# Print the `submitter_id` column of the query metadata (NULL if the table has
# no such column; other cells use `cases.submitter_id`).
query_TCGA_meta$submitter_id

In [None]:
# Index the clinical table by submitter id
rownames(coldata_TCGASKCM) <- coldata_TCGASKCM$submitter_id

# Flag repeated case ids in the query metadata (first occurrence is not flagged)
case_ids <- query_TCGA_meta$cases.submitter_id
is_repeat <- duplicated(case_ids)

# How many repeats there are, and which ids they are
sum(is_repeat)
case_ids[is_repeat]

# Keep one row per case (the first) and index the table by case id
query_TCGA_meta <- query_TCGA_meta[!is_repeat, ]
rownames(query_TCGA_meta) <- query_TCGA_meta$cases.submitter_id


In [None]:
# Inner join of the clinical table and the query metadata on their row names
meta_combined <- merge(coldata_TCGASKCM, query_TCGA_meta, by = 0)


In [None]:
# merge() stores the join key in `Row.names`; restore it as the row names
rownames(meta_combined) <- meta_combined[["Row.names"]]
head(meta_combined, n = 6L)

In [None]:
# Number of merged rows that are also rows of the clinical table
length(intersect(x = rownames(meta_combined), y = rownames(coldata_TCGASKCM)))

In [None]:
# Dimensions before and after the merge
dim(meta_combined)
dim(coldata_TCGASKCM)

In [None]:
# Relabel the expression columns through the `file_id` column of meta_combined.

# Index the merged metadata by file_id
rownames(meta_combined) <- meta_combined$file_id

# Position of each expression column in meta_combined$file_id (NA = no match)
matched_idx <- match(colnames(combined_df), meta_combined$file_id)

# Use the metadata row name where a match exists and keep the current column
# name otherwise. Assigning NA to unmatched columns would wipe out the ids
# that later matching steps still need.
new_colnames <- ifelse(
  is.na(matched_idx),
  colnames(combined_df),
  rownames(meta_combined)[matched_idx]
)

# Replace the column names of combined_df
colnames(combined_df) <- new_colnames

# Check the renamed columns in combined_df
head(combined_df)


In [None]:
# Peek at the two id vectors that are being compared
head(colnames(combined_df), n = 6L)
head(meta_combined$file_id, n = 6L)

In [None]:
# How many expression columns are named after a metadata file_id
length(intersect(x = colnames(combined_df), y = meta_combined$file_id))

In [None]:
# Strip leading and trailing whitespace from both id vectors
colnames(combined_df) <- trimws(colnames(combined_df), which = "both")
meta_combined$file_id <- trimws(meta_combined$file_id, which = "both")

# Ids shared by the two vectors after trimming
intersect(x = colnames(combined_df), y = meta_combined$file_id)


In [None]:
head(meta_combined, n = 6L)

In [None]:
# Remove the literal "_star__counts" text from the analysis submitter ids
meta_combined$analysis_submitter_id <- gsub(
  "_star__counts", "", meta_combined$analysis_submitter_id,
  fixed = TRUE
)

In [None]:
# Keep an untouched copy of the expression matrix before renaming its columns
combined_df_OG <- combined_df

# Index the merged metadata by the key column that merge() created
rownames(meta_combined) <- meta_combined$Row.names

# Locate every expression column among the cleaned analysis submitter ids
sample_idx <- match(colnames(combined_df), meta_combined$analysis_submitter_id)

# Metadata row name for each column; columns without a match become NA
new_colnames <- rownames(meta_combined)[sample_idx]
colnames(combined_df) <- new_colnames

# Check the renamed columns in combined_df
head(combined_df)


In [None]:
# Keep only the expression columns that have a clinical record.

# Index the clinical table by submitter id (row and column names are already
# character vectors, so no conversion is required)
rownames(coldata) <- coldata$submitter_id

# Sample ids present in both the expression matrix and the clinical table
matching_names <- intersect(colnames(combined_df), rownames(coldata))

# Subset the expression matrix; drop = FALSE keeps a matrix even when only a
# single sample matches
combined_df_subset <- combined_df[, matching_names, drop = FALSE]

# View the result to check the subsetted data
head(combined_df_subset)


In [None]:
# Dimensions of the subsetted expression matrix and of the full clinical table
dim(combined_df_subset)
dim(coldata)

In [None]:
# Clinical rows for the shared samples, in the order given by matching_names
coldata_subset <- coldata[matching_names, ]

# Size of the clinical subset
dim(coldata_subset)


In [None]:
# Save the matched expression matrix and clinical table as tab-separated text
write.table(
  x = combined_df_subset,
  file = "/scratch/project_mnt/S0010/Prakrithi/Skin_Melanoma_TCGA/SKCM_tpm_mat.txt",
  sep = "\t",
  quote = FALSE
)
write.table(
  x = coldata_subset,
  file = "/scratch/project_mnt/S0010/Prakrithi/Skin_Melanoma_TCGA/SKCM_meta.txt",
  sep = "\t",
  quote = FALSE
)


In [None]:
# Sample ids on both sides, for a visual check
head(colnames(combined_df_subset), n = 6L)
head(rownames(coldata_subset), n = 6L)

In [None]:
# Reload both tables when re-running the notebook from this point;
# check.names = FALSE keeps the sample ids in the header unmodified
combined_df_subset <- read.delim(
  "/scratch/project_mnt/S0010/Prakrithi/Skin_Melanoma_TCGA/SKCM_tpm_mat.txt",
  sep = "\t", header = TRUE, check.names = FALSE
)
coldata_subset <- read.delim(
  "/scratch/project_mnt/S0010/Prakrithi/Skin_Melanoma_TCGA/SKCM_meta.txt",
  sep = "\t"
)


In [None]:

# Make gene names unique and use them as row names.
# NOTE(review): this relies on a `GENE` column being present in the reloaded
# table - confirm the file header provides it.

# TRUE for the first occurrence of each gene name, FALSE for later repeats
is_first_occurrence <- !duplicated(combined_df_subset$GENE)
combined_df_subset <- combined_df_subset[is_first_occurrence, ]

# Gene names become the row names, after which the column is removed
rownames(combined_df_subset) <- combined_df_subset$GENE
combined_df_subset$GENE <- NULL

head(combined_df_subset)

In [None]:

# Round every expression value to the nearest whole number
rounded_matrix <- round(combined_df_subset)
head(rounded_matrix)

# Logical mask of the missing cells
na_mask <- is.na(rounded_matrix)

# Is anything missing, how many cells, and at which positions
any(na_mask)
sum(na_mask)
which(na_mask)

# Replace the missing cells with 0
rounded_matrix[na_mask] <- 0


In [None]:
# Wrap the rounded matrix in a DESeqDataSet with an intercept-only design.
# NOTE(review): the matrix is reloaded from a file named SKCM_tpm_mat.txt, so
# these look like rounded TPM values, not raw counts - confirm that is
# acceptable as DESeq2 input.
dds <- DESeqDataSetFromMatrix(
  countData = as.matrix(rounded_matrix),
  colData = coldata_subset,
  design = ~ 1
)

# Variance-stabilising transformation, blind to the design
vsd <- vst(dds, blind = TRUE, fitType = "parametric")

# Transformed expression matrix
matrix_vst <- assay(vsd)

In [None]:
# Kaplan-Meier analysis stratified by the summed expression of a gene list.
# Works on one row per sample: fitting on the long gene x sample table would
# repeat every patient once per gene and inflate the apparent sample size.

# Define the list of genes you want to stratify by
genes_to_stratify <- c("THSD8","STC2","CHRFAM7A","AK5","SLC12A3","MED12L","PLAC8","NME8","CLEC4C","PACSIN1","LILRA4","C5AR2","CXCL2","TREML2","IRF8","AGT","WDFY4","NCR1","SLC9A3","LYPD1","KLRC1","KLRC3","ITGAD","ADAMDEC1","FCRL3","IL18RAP","HORMAD1","ANGPT1","BMP8B","DLX1","LRP2","ANO4","UPK1A","COL22A1","ITGA10","STRA6","CD80","BMX","CABP4","NKG7","SH2D1A","DTHD1","SELL","IL2RB","UBD","MPZ","CXCL9","CTLA4","P2RY10","IL3RA")  # Replace with your list of genes

# VST expression of the listed genes (rows) across all samples (columns);
# names that are absent from matrix_vst are skipped by %in%
filtered <- matrix_vst[rownames(matrix_vst) %in% genes_to_stratify, , drop = FALSE]

# Combined expression per sample: sum over the listed genes, named by sample id
combined_expression <- colSums(filtered)

# One row per sample, so each score stays attached to its own sample id
combined <- data.frame(
  case_id = names(combined_expression),
  combined_expression = unname(combined_expression),
  stringsAsFactors = FALSE
)

# Median of the combined expression across samples
median_combined_expression <- median(combined$combined_expression, na.rm = TRUE)

# LOW is the first factor level so that survfit() orders the strata LOW, HIGH,
# which is the order legend.labs below assumes
combined$strata <- factor(
  ifelse(combined$combined_expression >= median_combined_expression, "HIGH", "LOW"),
  levels = c("LOW", "HIGH")
)

# Merge the clinical data to get survival information
combined <- merge(combined, coldata_subset, by.x = "case_id", by.y = "submitter_id")

# NOTE(review): Surv() receives only a time, so every sample is treated as an
# observed event (no censoring). If coldata_subset carries an event indicator,
# pass it as the second argument of Surv().
survdiff_res <- survdiff(Surv(overall_survival) ~ strata, data = combined)

# Fit the survival model
fit <- survfit(Surv(overall_survival) ~ strata, data = combined)

# Plot the Kaplan-Meier curve
ggsurvplot(
  fit,
  data = combined,
  pval = TRUE,
  risk.table = TRUE,
  legend.title = "Strata",
  legend.labs = c("LOW", "HIGH")
)


In [None]:
# Rows of the rounded matrix whose gene name contains "THSD8"
thsd8_rows <- grep("THSD8", rownames(rounded_matrix))
thsd8_rows

# Show those rows using the looked-up positions, not a hard-coded row number
# that only holds for one particular version of the matrix
rounded_matrix[thsd8_rows, ]

# NEW ANALYSIS (CORRECT)

In [None]:
# Reload the saved tables when re-running from this point;
# check.names = FALSE keeps the sample ids in the header unmodified
combined_df_subset <- read.delim(
  "/scratch/project_mnt/S0010/Prakrithi/Skin_Melanoma_TCGA/SKCM_tpm_mat.txt",
  sep = "\t", header = TRUE, check.names = FALSE
)
coldata_subset <- read.delim(
  "/scratch/project_mnt/S0010/Prakrithi/Skin_Melanoma_TCGA/SKCM_meta.txt",
  sep = "\t"
)


# Keep the first occurrence of each gene name
# NOTE(review): relies on a `GENE` column being present in the reloaded table
is_first_occurrence <- !duplicated(combined_df_subset$GENE)
combined_df_subset <- combined_df_subset[is_first_occurrence, ]

# Gene names become the row names, after which the column is removed
rownames(combined_df_subset) <- combined_df_subset$GENE
combined_df_subset$GENE <- NULL
head(combined_df_subset)

# Round every expression value to the nearest whole number
rounded_matrix <- round(combined_df_subset)
head(rounded_matrix)

# Logical mask of the missing cells
na_mask <- is.na(rounded_matrix)

# Is anything missing, how many cells, and at which positions
any(na_mask)
sum(na_mask)
which(na_mask)

# Replace the missing cells with 0
rounded_matrix[na_mask] <- 0


In [None]:
# Wrap the rounded matrix in a DESeqDataSet with an intercept-only design.
# NOTE(review): the matrix is reloaded from a file named SKCM_tpm_mat.txt, so
# these look like rounded TPM values, not raw counts - confirm that is
# acceptable as DESeq2 input.
dds <- DESeqDataSetFromMatrix(
  countData = as.matrix(rounded_matrix),
  colData = coldata_subset,
  design = ~ 1
)

# Variance-stabilising transformation, blind to the design
vsd <- vst(dds, blind = TRUE, fitType = "parametric")

# Transformed expression matrix
matrix_vst <- assay(vsd)

In [None]:
# Kaplan-Meier analysis for the DE gene signature: samples are scored by their
# mean VST expression over the signature genes and split at the cohort median.

# DE genes
genes_to_stratify <- c("THSD8", "STC2", "CHRFAM7A", "AK5", "SLC12A3", "MED12L", "PLAC8", "NME8",
                        "CLEC4C", "PACSIN1", "LILRA4", "C5AR2", "CXCL2", "TREML2", "IRF8", "AGT",
                        "WDFY4", "NCR1", "SLC9A3", "LYPD1", "KLRC1", "KLRC3", "ITGAD", "ADAMDEC1",
                        "FCRL3", "IL18RAP", "HORMAD1", "ANGPT1", "BMP8B", "DLX1", "LRP2", "ANO4",
                        "UPK1A", "COL22A1", "ITGA10", "STRA6", "CD80", "BMX", "CABP4", "NKG7",
                        "SH2D1A", "DTHD1", "SELL", "IL2RB", "UBD", "MPZ", "CXCL9", "CTLA4", "P2RY10",
                        "IL3RA")

# Long table (gene_id, case_id, counts) for the signature genes only. The
# matrix is subset before reshaping so that only these rows are melted; gene
# names absent from matrix_vst are skipped by %in%.
filtered <- matrix_vst[rownames(matrix_vst) %in% genes_to_stratify, , drop = FALSE] %>%
  as.data.frame() %>%
  rownames_to_column(var = "gene_id") %>%
  pivot_longer(cols = -gene_id, names_to = "case_id", values_to = "counts")

# Compute **mean** expression per sample
mean_expression_per_sample <- filtered %>%
  group_by(case_id) %>%
  summarise(mean_expression = mean(counts, na.rm = TRUE))

# Compute the median expression across all samples
median_expression <- median(mean_expression_per_sample$mean_expression, na.rm = TRUE)

# Assign strata. LOW is the first factor level so that survfit() orders the
# strata LOW, HIGH - the order legend.labs and palette below assume. With a
# plain character column the order is alphabetical (HIGH, LOW) and the curves
# end up carrying the wrong label.
mean_expression_per_sample <- mean_expression_per_sample %>%
  mutate(strata = factor(
    ifelse(mean_expression >= median_expression, "HIGH", "LOW"),
    levels = c("LOW", "HIGH")
  ))

# Merge clinical data
combined <- merge(mean_expression_per_sample, coldata_subset,
                  by.x = "case_id", by.y = "submitter_id")

# NOTE(review): Surv() receives only a time, so every sample is treated as an
# observed event (no censoring). If coldata_subset carries an event indicator,
# pass it as the second argument of Surv().
survdiff_res <- survdiff(Surv(overall_survival) ~ strata, data = combined)

# Fit survival model
fit <- survfit(Surv(overall_survival) ~ strata, data = combined)

# Build the Kaplan-Meier plot once and reuse it for the file and the notebook
km_plot <- ggsurvplot(
  fit,
  data = combined,
  pval = TRUE,
  risk.table = TRUE,
  legend.title = "Strata",
  legend.labs = c("LOW", "HIGH"),
  palette = c("blue", "red")  # LOW = blue, HIGH = red
)

# Explicit print() so the file is also written when the code is source()d;
# newpage = FALSE avoids an extra leading blank page in the PDF.
pdf("Skin_Melanoma_TCGA/TCGA_mel_DE_genes.pdf", width = 8, height = 9)
print(km_plot, newpage = FALSE)
dev.off()

km_plot


In [None]:
# Multivariable Cox model with one term per DE signature gene.

# Genes that are really present in matrix_vst (deduplicated); a gene missing
# from the matrix cannot be referenced in the model formula.
cox_genes <- intersect(genes_to_stratify, rownames(matrix_vst))
missing_genes <- setdiff(genes_to_stratify, cox_genes)
if (length(missing_genes) > 0) {
  message(
    "Genes not found in matrix_vst and left out of the Cox model: ",
    paste(missing_genes, collapse = ", ")
  )
}
if (length(cox_genes) == 0) {
  stop("None of the requested genes are present in matrix_vst.", call. = FALSE)
}

# Wide table: one row per sample (case_id), one column per gene
filtered <- matrix_vst[cox_genes, , drop = FALSE] %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column(var = "case_id")

# Merge with combined dataframe
combined <- merge(combined, filtered, by = "case_id", all.x = TRUE)

# NOTE(review): `status` is derived from the expression stratum (HIGH = 1),
# not from a clinical outcome, yet it is used below as the event indicator of
# Surv(). Confirm which column of coldata_subset records the event and use
# that instead.
combined <- combined %>%
  mutate(status = ifelse(strata == "HIGH", 1, 0))

# Back-quote every gene so that names which are not syntactic R names still
# parse inside the formula
formula_string <- paste(
  "Surv(overall_survival, status) ~",
  paste(sprintf("`%s`", cox_genes), collapse = " + ")
)
cox_fit <- coxph(as.formula(formula_string), data = combined)

# Print results
summary(cox_fit)

# LR genes

In [None]:
# Kaplan-Meier analysis for the ligand-receptor gene list: samples are scored
# by their mean VST expression over the listed genes and split at the median.
# The list is written as consecutive pairs, so some genes appear several times.
LRgenes<-c("COL1A1","DDR2","COL1A1","ITGB1","COL1A1","CD44","COL1A1","CD36","COL1A1","ITGA5","COL1A1","ITGA2","COL1A1","DDR1","COL3A1","DDR2","COL3A1","DDR1","COL1A2","ITGB1","COL1A2","ITGA2","COL1A2","CD44","CCL19","CXCR3","CCL5","ACKR1","AZGP1","ITGAV","CCL5","CCR1","FGF1","FGFR1","IL34","CSF1R","ICAM3","ITGB2","CD34","SELL","FGF1","CD44","FGF2","CD44","COL8A1","ITGA1","LGALS3","LAG3","NRG1","ERBB3","TNFSF14","TNFRSF14")

# Long table (gene_id, case_id, counts) for the listed genes only. The matrix
# is subset before reshaping so that only these rows are melted; repeated or
# absent gene names have no effect on %in%.
filtered <- matrix_vst[rownames(matrix_vst) %in% LRgenes, , drop = FALSE] %>%
  as.data.frame() %>%
  rownames_to_column(var = "gene_id") %>%
  pivot_longer(cols = -gene_id, names_to = "case_id", values_to = "counts")

# Compute **mean** expression per sample
mean_expression_per_sample <- filtered %>%
  group_by(case_id) %>%
  summarise(mean_expression = mean(counts, na.rm = TRUE))

# Compute the median expression across all samples
median_expression <- median(mean_expression_per_sample$mean_expression, na.rm = TRUE)

# Assign strata. LOW is the first factor level so that survfit() orders the
# strata LOW, HIGH - the order legend.labs and palette below assume. With a
# plain character column the order is alphabetical (HIGH, LOW) and the curves
# end up carrying the wrong label.
mean_expression_per_sample <- mean_expression_per_sample %>%
  mutate(strata = factor(
    ifelse(mean_expression >= median_expression, "HIGH", "LOW"),
    levels = c("LOW", "HIGH")
  ))

# Merge clinical data
combined <- merge(mean_expression_per_sample, coldata_subset,
                  by.x = "case_id", by.y = "submitter_id")

# NOTE(review): Surv() receives only a time, so every sample is treated as an
# observed event (no censoring). If coldata_subset carries an event indicator,
# pass it as the second argument of Surv().
survdiff_res <- survdiff(Surv(overall_survival) ~ strata, data = combined)

# Fit survival model
fit <- survfit(Surv(overall_survival) ~ strata, data = combined)

# Build the Kaplan-Meier plot once and reuse it for the notebook and the file
km_plot <- ggsurvplot(
  fit,
  data = combined,
  pval = TRUE,
  risk.table = TRUE,
  legend.title = "Strata",
  legend.labs = c("LOW", "HIGH"),
  palette = c("blue", "red")  # LOW = blue, HIGH = red
)
km_plot

# Explicit print() so the file is also written when the code is source()d;
# newpage = FALSE avoids an extra leading blank page in the PDF.
pdf("Skin_Melanoma_TCGA/TCGA_mel_LR_genes.pdf", width = 8, height = 9)
print(km_plot, newpage = FALSE)
dev.off()

In [None]:
# Multivariable Cox model with one term per ligand-receptor gene.

# Genes that are really present in matrix_vst (deduplicated); a gene missing
# from the matrix cannot be referenced in the model formula.
cox_genes <- intersect(LRgenes, rownames(matrix_vst))
missing_genes <- setdiff(LRgenes, cox_genes)
if (length(missing_genes) > 0) {
  message(
    "Genes not found in matrix_vst and left out of the Cox model: ",
    paste(missing_genes, collapse = ", ")
  )
}
if (length(cox_genes) == 0) {
  stop("None of the requested genes are present in matrix_vst.", call. = FALSE)
}

# Wide table: one row per sample (case_id), one column per gene
filtered <- matrix_vst[cox_genes, , drop = FALSE] %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column(var = "case_id")

# Merge with combined dataframe
combined <- merge(combined, filtered, by = "case_id", all.x = TRUE)

# NOTE(review): `status` is derived from the expression stratum (HIGH = 1),
# not from a clinical outcome, yet it is used below as the event indicator of
# Surv(). Confirm which column of coldata_subset records the event and use
# that instead.
combined <- combined %>%
  mutate(status = ifelse(strata == "HIGH", 1, 0))

# Back-quote every gene so that names which are not syntactic R names still
# parse inside the formula
formula_string <- paste(
  "Surv(overall_survival, status) ~",
  paste(sprintf("`%s`", cox_genes), collapse = " + ")
)
cox_fit <- coxph(as.formula(formula_string), data = combined)

# Print results
summary(cox_fit)

In [None]:
# Kaplan-Meier analysis for the gsmap_genes_mel4 list: samples are scored by
# their mean VST expression over the listed genes and split at the median.
gsmap_genes_mel4<-c("GTF2F2","WDFY1","PBRM1","BRD4","HGS","WDR82","DPP9","CBX3","RABGGTB","POLR2G","CLPTM1L","PSMB2","MED15","UBE2R2","APH1A","TBCD","PSMA1","CDK4","CSNK1D","CCT5","DDX24","VPS29","OAS1","M6PR","EFTUD2","CNDP2","FKBP3","PSMD4","MLLT10","DDX41","SUPT16H","SPPL2A","ATOX1","SEPTIN9","MXD4","EIF3B","SAP30BP","SLC11A2","STAT2","ATP6V0D1","VMP1","UBQLN1","KPNB1","BANF1","ACIN1","AKT2","SLTM","NARF","CIAO1","NPEPL1")

# Long table (gene_id, case_id, counts) for the listed genes only. The matrix
# is subset before reshaping so that only these rows are melted; gene names
# absent from matrix_vst are skipped by %in%.
filtered <- matrix_vst[rownames(matrix_vst) %in% gsmap_genes_mel4, , drop = FALSE] %>%
  as.data.frame() %>%
  rownames_to_column(var = "gene_id") %>%
  pivot_longer(cols = -gene_id, names_to = "case_id", values_to = "counts")

# Compute **mean** expression per sample
mean_expression_per_sample <- filtered %>%
  group_by(case_id) %>%
  summarise(mean_expression = mean(counts, na.rm = TRUE))

# Compute the median expression across all samples
median_expression <- median(mean_expression_per_sample$mean_expression, na.rm = TRUE)

# Assign strata. LOW is the first factor level so that survfit() orders the
# strata LOW, HIGH - the order legend.labs and palette below assume. With a
# plain character column the order is alphabetical (HIGH, LOW) and the curves
# end up carrying the wrong label.
mean_expression_per_sample <- mean_expression_per_sample %>%
  mutate(strata = factor(
    ifelse(mean_expression >= median_expression, "HIGH", "LOW"),
    levels = c("LOW", "HIGH")
  ))

# Merge clinical data
combined <- merge(mean_expression_per_sample, coldata_subset,
                  by.x = "case_id", by.y = "submitter_id")

# NOTE(review): Surv() receives only a time, so every sample is treated as an
# observed event (no censoring). If coldata_subset carries an event indicator,
# pass it as the second argument of Surv().
survdiff_res <- survdiff(Surv(overall_survival) ~ strata, data = combined)

# Fit survival model
fit <- survfit(Surv(overall_survival) ~ strata, data = combined)

# Plot Kaplan-Meier curve
ggsurvplot(
  fit,
  data = combined,
  pval = TRUE,
  risk.table = TRUE,
  legend.title = "Strata",
  legend.labs = c("LOW", "HIGH"),
  palette = c("blue", "red")  # LOW = blue, HIGH = red
)



In [None]:
# Kaplan-Meier analysis for the gsmap_top50all_mel4 list: samples are scored by
# their mean VST expression over the listed genes and split at the median.
gsmap_top50all_mel4<-c("GTF2F2","PSME2","WDFY1","PBRM1","BRD4","HGS","WDR82","DPP9","CBX3","PML","RABGGTB","POLR2G","CLPTM1L","PSMB2","MED15","UBE2R2","XAF1","APH1A","TBCD","PSMA1","TAP1","CDK4","CSNK1D","CCT5","DDX24","NFKB2","VPS29","OAS1","M6PR","C6orf62","MOV10","CDC42SE1","EFTUD2","IFI16","LARP4B","CNDP2","FKBP3","PSMD4","CMTM6","MLLT10","TAP2","DDX41","CHKB","SUPT16H","SPPL2A","UBP1","SCO2","ATOX1","SEPTIN9","MXD4")

# Long table (gene_id, case_id, counts) for the listed genes only. The matrix
# is subset before reshaping so that only these rows are melted; gene names
# absent from matrix_vst are skipped by %in%.
filtered <- matrix_vst[rownames(matrix_vst) %in% gsmap_top50all_mel4, , drop = FALSE] %>%
  as.data.frame() %>%
  rownames_to_column(var = "gene_id") %>%
  pivot_longer(cols = -gene_id, names_to = "case_id", values_to = "counts")

# Compute **mean** expression per sample
mean_expression_per_sample <- filtered %>%
  group_by(case_id) %>%
  summarise(mean_expression = mean(counts, na.rm = TRUE))

# Compute the median expression across all samples
median_expression <- median(mean_expression_per_sample$mean_expression, na.rm = TRUE)

# Assign strata. LOW is the first factor level so that survfit() orders the
# strata LOW, HIGH - the order legend.labs and palette below assume. With a
# plain character column the order is alphabetical (HIGH, LOW) and the curves
# end up carrying the wrong label.
mean_expression_per_sample <- mean_expression_per_sample %>%
  mutate(strata = factor(
    ifelse(mean_expression >= median_expression, "HIGH", "LOW"),
    levels = c("LOW", "HIGH")
  ))

# Merge clinical data
combined <- merge(mean_expression_per_sample, coldata_subset,
                  by.x = "case_id", by.y = "submitter_id")

# NOTE(review): Surv() receives only a time, so every sample is treated as an
# observed event (no censoring). If coldata_subset carries an event indicator,
# pass it as the second argument of Surv().
survdiff_res <- survdiff(Surv(overall_survival) ~ strata, data = combined)

# Fit survival model
fit <- survfit(Surv(overall_survival) ~ strata, data = combined)

# Plot Kaplan-Meier curve
ggsurvplot(
  fit,
  data = combined,
  pval = TRUE,
  risk.table = TRUE,
  legend.title = "Strata",
  legend.labs = c("LOW", "HIGH"),
  palette = c("blue", "red")  # LOW = blue, HIGH = red
)



In [None]:
# Kaplan-Meier analysis for the gsmap_common_mel list: samples are scored by
# their mean VST expression over the listed genes and split at the median.
gsmap_common_mel<-c("WDFY1","PBRM1","TBCD","CDK4","M6PR","EFTUD2","PSMD4","MXD4","STAT2","PFKL","MAP3K11","MXI1","EIF4E2","SLC39A1","PSMA7","ATP6V1B2","AC015802.6","HSPE1","PLSCR1","KCTD20","UROD","TRA2B","RBM6","ST6GALNAC2","DR1","PPP4R2","ANKLE2","SMARCD2","OSBPL9","MLEC","SRPRA","HDGF","NELFE","UBXN6","GRSF1","COMMD4","TXNL4A","CAPZA2","ILVBL","UBAP2L","CFAP97","PA2G4","USP22","GPAT4","FBXW5","PTTG1IP","GADD45GIP1","COPG1","ZNF106","WBP2")

# Long table (gene_id, case_id, counts) for the listed genes only. The matrix
# is subset before reshaping so that only these rows are melted; gene names
# absent from matrix_vst are skipped by %in%.
filtered <- matrix_vst[rownames(matrix_vst) %in% gsmap_common_mel, , drop = FALSE] %>%
  as.data.frame() %>%
  rownames_to_column(var = "gene_id") %>%
  pivot_longer(cols = -gene_id, names_to = "case_id", values_to = "counts")

# Compute **mean** expression per sample
mean_expression_per_sample <- filtered %>%
  group_by(case_id) %>%
  summarise(mean_expression = mean(counts, na.rm = TRUE))

# Compute the median expression across all samples
median_expression <- median(mean_expression_per_sample$mean_expression, na.rm = TRUE)

# Assign strata. LOW is the first factor level so that survfit() orders the
# strata LOW, HIGH - the order legend.labs and palette below assume. With a
# plain character column the order is alphabetical (HIGH, LOW) and the curves
# end up carrying the wrong label.
mean_expression_per_sample <- mean_expression_per_sample %>%
  mutate(strata = factor(
    ifelse(mean_expression >= median_expression, "HIGH", "LOW"),
    levels = c("LOW", "HIGH")
  ))

# Merge clinical data
combined <- merge(mean_expression_per_sample, coldata_subset,
                  by.x = "case_id", by.y = "submitter_id")

# NOTE(review): Surv() receives only a time, so every sample is treated as an
# observed event (no censoring). If coldata_subset carries an event indicator,
# pass it as the second argument of Surv().
survdiff_res <- survdiff(Surv(overall_survival) ~ strata, data = combined)

# Fit survival model
fit <- survfit(Surv(overall_survival) ~ strata, data = combined)

# Build the Kaplan-Meier plot once and reuse it for the file and the notebook
km_plot <- ggsurvplot(
  fit,
  data = combined,
  pval = TRUE,
  risk.table = TRUE,
  legend.title = "Strata",
  legend.labs = c("LOW", "HIGH"),
  palette = c("blue", "red")  # LOW = blue, HIGH = red
)

# Explicit print() so the file is also written when the code is source()d;
# newpage = FALSE avoids an extra leading blank page in the PDF.
pdf("Skin_Melanoma_TCGA/TCGA_mel_gsmap_comm_genes.pdf", width = 8, height = 9)
print(km_plot, newpage = FALSE)
dev.off()

km_plot




In [None]:
## gsmap all CT top common
# Kaplan-Meier analysis for the gsmap_all_top_comm list: samples are scored by
# their mean VST expression over the listed genes and split at the median.

gsmap_all_top_comm<-c("WDFY1","PBRM1","PML","TBCD","CDK4","M6PR","EFTUD2","IFI16","PSMD4","MXD4","STAT2","PFKL","MAP3K11","C21orf91","MXI1","EIF4E2","NDUFV1","SLC39A1","SNRNP200","PSMA7","ATP6V1B2","AC015802.6","HSPE1","CLASRP","PLSCR1","KCTD20","UROD","RHOT2","TRA2B","RBM6","ST6GALNAC2","DR1","RTRAF","PPP4R2","ANKLE2","SMARCD2","OSBPL9","MLEC","SRPRA","HDGF","ETS1","NELFE","UBXN6","GRSF1","COMMD4","TXNL4A","CAPZA2","SRRT","ILVBL","UBAP2L")

# Long table (gene_id, case_id, counts) for the listed genes only. The matrix
# is subset before reshaping so that only these rows are melted; gene names
# absent from matrix_vst are skipped by %in%.
filtered <- matrix_vst[rownames(matrix_vst) %in% gsmap_all_top_comm, , drop = FALSE] %>%
  as.data.frame() %>%
  rownames_to_column(var = "gene_id") %>%
  pivot_longer(cols = -gene_id, names_to = "case_id", values_to = "counts")

# Compute **mean** expression per sample
mean_expression_per_sample <- filtered %>%
  group_by(case_id) %>%
  summarise(mean_expression = mean(counts, na.rm = TRUE))

# Compute the median expression across all samples
median_expression <- median(mean_expression_per_sample$mean_expression, na.rm = TRUE)

# Assign strata. LOW is the first factor level so that survfit() orders the
# strata LOW, HIGH - the order legend.labs and palette below assume. With a
# plain character column the order is alphabetical (HIGH, LOW) and the curves
# end up carrying the wrong label.
mean_expression_per_sample <- mean_expression_per_sample %>%
  mutate(strata = factor(
    ifelse(mean_expression >= median_expression, "HIGH", "LOW"),
    levels = c("LOW", "HIGH")
  ))

# Merge clinical data
combined <- merge(mean_expression_per_sample, coldata_subset,
                  by.x = "case_id", by.y = "submitter_id")

# NOTE(review): Surv() receives only a time, so every sample is treated as an
# observed event (no censoring). If coldata_subset carries an event indicator,
# pass it as the second argument of Surv().
survdiff_res <- survdiff(Surv(overall_survival) ~ strata, data = combined)

# Fit survival model
fit <- survfit(Surv(overall_survival) ~ strata, data = combined)

# Plot Kaplan-Meier curve
ggsurvplot(
  fit,
  data = combined,
  pval = TRUE,
  risk.table = TRUE,
  legend.title = "Strata",
  legend.labs = c("LOW", "HIGH"),
  palette = c("blue", "red")  # LOW = blue, HIGH = red
)




In [None]:
# Re-declare the gene lists used by the random-gene control cells and print
# how many entries each of the first two contains.

# DE gene signature
genes_to_stratify <- c("THSD8", "STC2", "CHRFAM7A", "AK5", "SLC12A3", "MED12L", "PLAC8", "NME8", 
                        "CLEC4C", "PACSIN1", "LILRA4", "C5AR2", "CXCL2", "TREML2", "IRF8", "AGT", 
                        "WDFY4", "NCR1", "SLC9A3", "LYPD1", "KLRC1", "KLRC3", "ITGAD", "ADAMDEC1", 
                        "FCRL3", "IL18RAP", "HORMAD1", "ANGPT1", "BMP8B", "DLX1", "LRP2", "ANO4", 
                        "UPK1A", "COL22A1", "ITGA10", "STRA6", "CD80", "BMX", "CABP4", "NKG7", 
                        "SH2D1A", "DTHD1", "SELL", "IL2RB", "UBD", "MPZ", "CXCL9", "CTLA4", "P2RY10", 
                        "IL3RA")
length(genes_to_stratify)
# Ligand-receptor gene list. Several genes (e.g. COL1A1, CD44) occur more than
# once, so length() below counts entries, not distinct genes.
LRgenes<-c("COL1A1","DDR2","COL1A1","ITGB1","COL1A1","CD44","COL1A1","CD36","COL1A1","ITGA5","COL1A1","ITGA2","COL1A1","DDR1","COL3A1","DDR2","COL3A1","DDR1","COL1A2","ITGB1","COL1A2","ITGA2","COL1A2","CD44","CCL19","CXCR3","CCL5","ACKR1","AZGP1","ITGAV","CCL5","CCR1","FGF1","FGFR1","IL34","CSF1R","ICAM3","ITGB2","CD34","SELL","FGF1","CD44","FGF2","CD44","COL8A1","ITGA1","LGALS3","LAG3","NRG1","ERBB3","TNFSF14","TNFRSF14")
length(LRgenes)

# gsmap_genes_mel4 gene list
gsmap_genes_mel4<-c("GTF2F2","WDFY1","PBRM1","BRD4","HGS","WDR82","DPP9","CBX3","RABGGTB","POLR2G","CLPTM1L","PSMB2","MED15","UBE2R2","APH1A","TBCD","PSMA1","CDK4","CSNK1D","CCT5","DDX24","VPS29","OAS1","M6PR","EFTUD2","CNDP2","FKBP3","PSMD4","MLLT10","DDX41","SUPT16H","SPPL2A","ATOX1","SEPTIN9","MXD4","EIF3B","SAP30BP","SLC11A2","STAT2","ATP6V0D1","VMP1","UBQLN1","KPNB1","BANF1","ACIN1","AKT2","SLTM","NARF","CIAO1","NPEPL1")


In [None]:
# Control analysis: Kaplan-Meier curves for 50 randomly drawn genes that are
# in none of the four gene lists combined below.

# Genes to leave out of the random draw
excluded_genes <- union(genes_to_stratify, LRgenes)
excluded_genes2 <- union(gsmap_genes_mel4, gsmap_common_mel)
excluded_genes <- union(excluded_genes, excluded_genes2)

available_genes <- setdiff(rownames(matrix_vst), excluded_genes)

# Randomly select 50 genes from the remaining ones (fixed seed for reproducibility)
set.seed(18090909)
random_genes <- sample(available_genes, 50)

# Long table (gene_id, case_id, counts) for the drawn genes only. The matrix
# is subset before reshaping so that only these rows are melted.
filtered <- matrix_vst[rownames(matrix_vst) %in% random_genes, , drop = FALSE] %>%
  as.data.frame() %>%
  rownames_to_column(var = "gene_id") %>%
  pivot_longer(cols = -gene_id, names_to = "case_id", values_to = "counts")

# Compute **mean** expression per sample
mean_expression_per_sample <- filtered %>%
  group_by(case_id) %>%
  summarise(mean_expression = mean(counts, na.rm = TRUE))

# Compute the median expression across all samples
median_expression <- median(mean_expression_per_sample$mean_expression, na.rm = TRUE)

# Assign strata. LOW is the first factor level so that survfit() orders the
# strata LOW, HIGH - the order legend.labs and palette below assume. With a
# plain character column the order is alphabetical (HIGH, LOW) and the curves
# end up carrying the wrong label.
mean_expression_per_sample <- mean_expression_per_sample %>%
  mutate(strata = factor(
    ifelse(mean_expression >= median_expression, "HIGH", "LOW"),
    levels = c("LOW", "HIGH")
  ))

# Merge clinical data
combined <- merge(mean_expression_per_sample, coldata_subset,
                  by.x = "case_id", by.y = "submitter_id")

# NOTE(review): Surv() receives only a time, so every sample is treated as an
# observed event (no censoring). If coldata_subset carries an event indicator,
# pass it as the second argument of Surv().
survdiff_res <- survdiff(Surv(overall_survival) ~ strata, data = combined)

# Fit survival model
fit <- survfit(Surv(overall_survival) ~ strata, data = combined)

# Build the Kaplan-Meier plot once and reuse it for the file and the notebook
km_plot <- ggsurvplot(
  fit,
  data = combined,
  pval = TRUE,
  risk.table = TRUE,
  legend.title = "Strata",
  legend.labs = c("LOW", "HIGH"),
  palette = c("blue", "red")  # LOW = blue, HIGH = red
)

# Explicit print() so the file is also written when the code is source()d;
# newpage = FALSE avoids an extra leading blank page in the PDF.
pdf("Skin_Melanoma_TCGA/TCGA_mel_random_exluding_DE_LR_gsmap.pdf", width = 8, height = 9)
print(km_plot, newpage = FALSE)
dev.off()

km_plot


In [None]:
# Control analysis: Kaplan-Meier curves for a single randomly drawn gene that
# is not in gsmap_common_mel.
available_genes <- setdiff(rownames(matrix_vst), gsmap_common_mel)

# Randomly select ONE gene from the remaining ones (fixed seed for reproducibility)
set.seed(0)
random_genes <- sample(available_genes, 1)

# Long table (gene_id, case_id, counts) for the drawn gene only. The matrix
# is subset before reshaping so that only this row is melted.
filtered <- matrix_vst[rownames(matrix_vst) %in% random_genes, , drop = FALSE] %>%
  as.data.frame() %>%
  rownames_to_column(var = "gene_id") %>%
  pivot_longer(cols = -gene_id, names_to = "case_id", values_to = "counts")

# Compute **mean** expression per sample (with one gene this is its own value)
mean_expression_per_sample <- filtered %>%
  group_by(case_id) %>%
  summarise(mean_expression = mean(counts, na.rm = TRUE))

# Compute the median expression across all samples
median_expression <- median(mean_expression_per_sample$mean_expression, na.rm = TRUE)

# Assign strata. LOW is the first factor level so that survfit() orders the
# strata LOW, HIGH - the order legend.labs and palette below assume. With a
# plain character column the order is alphabetical (HIGH, LOW) and the curves
# end up carrying the wrong label.
mean_expression_per_sample <- mean_expression_per_sample %>%
  mutate(strata = factor(
    ifelse(mean_expression >= median_expression, "HIGH", "LOW"),
    levels = c("LOW", "HIGH")
  ))

# Merge clinical data
combined <- merge(mean_expression_per_sample, coldata_subset,
                  by.x = "case_id", by.y = "submitter_id")

# NOTE(review): Surv() receives only a time, so every sample is treated as an
# observed event (no censoring). If coldata_subset carries an event indicator,
# pass it as the second argument of Surv().
survdiff_res <- survdiff(Surv(overall_survival) ~ strata, data = combined)

# Fit survival model
fit <- survfit(Surv(overall_survival) ~ strata, data = combined)

# Plot Kaplan-Meier curve
ggsurvplot(
  fit,
  data = combined,
  pval = TRUE,
  risk.table = TRUE,
  legend.title = "Strata",
  legend.labs = c("LOW", "HIGH"),
  palette = c("blue", "red")  # LOW = blue, HIGH = red
)


In [None]:
# Multivariable Cox model with one term per ligand-receptor gene, fitted on
# whatever `combined` currently holds.

# Genes that are really present in matrix_vst (deduplicated); a gene missing
# from the matrix cannot be referenced in the model formula.
cox_genes <- intersect(LRgenes, rownames(matrix_vst))
missing_genes <- setdiff(LRgenes, cox_genes)
if (length(missing_genes) > 0) {
  message(
    "Genes not found in matrix_vst and left out of the Cox model: ",
    paste(missing_genes, collapse = ", ")
  )
}
if (length(cox_genes) == 0) {
  stop("None of the requested genes are present in matrix_vst.", call. = FALSE)
}

# Wide table: one row per sample (case_id), one column per gene
filtered <- matrix_vst[cox_genes, , drop = FALSE] %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column(var = "case_id")

# Merge with combined dataframe
combined <- merge(combined, filtered, by = "case_id", all.x = TRUE)

# NOTE(review): `status` is derived from the expression stratum (HIGH = 1),
# not from a clinical outcome, yet it is used below as the event indicator of
# Surv(). Confirm which column of coldata_subset records the event and use
# that instead.
combined <- combined %>%
  mutate(status = ifelse(strata == "HIGH", 1, 0))

# Back-quote every gene so that names which are not syntactic R names still
# parse inside the formula
formula_string <- paste(
  "Surv(overall_survival, status) ~",
  paste(sprintf("`%s`", cox_genes), collapse = " + ")
)
cox_fit <- coxph(as.formula(formula_string), data = combined)

# Print results
summary(cox_fit)

In [None]:
# Control analysis: Kaplan-Meier curves for 50 randomly drawn genes.
# `available_genes` is not rebuilt here; the value left by an earlier cell is used.
#excluded_genes <- union(genes_to_stratify, LRgenes)
#available_genes <- setdiff(rownames(matrix_vst), excluded_genes)

# Randomly select 50 genes from the remaining ones (fixed seed for reproducibility)
set.seed(0)
random_genes <- sample(available_genes, 50)

# Long table (gene_id, case_id, counts) for the drawn genes only. The matrix
# is subset before reshaping so that only these rows are melted.
filtered <- matrix_vst[rownames(matrix_vst) %in% random_genes, , drop = FALSE] %>%
  as.data.frame() %>%
  rownames_to_column(var = "gene_id") %>%
  pivot_longer(cols = -gene_id, names_to = "case_id", values_to = "counts")

# Compute **mean** expression per sample
mean_expression_per_sample <- filtered %>%
  group_by(case_id) %>%
  summarise(mean_expression = mean(counts, na.rm = TRUE))

# Compute the median expression across all samples
median_expression <- median(mean_expression_per_sample$mean_expression, na.rm = TRUE)

# Assign strata. LOW is the first factor level so that survfit() orders the
# strata LOW, HIGH - the order legend.labs and palette below assume. With a
# plain character column the order is alphabetical (HIGH, LOW) and the curves
# end up carrying the wrong label.
mean_expression_per_sample <- mean_expression_per_sample %>%
  mutate(strata = factor(
    ifelse(mean_expression >= median_expression, "HIGH", "LOW"),
    levels = c("LOW", "HIGH")
  ))

# Merge clinical data
combined <- merge(mean_expression_per_sample, coldata_subset,
                  by.x = "case_id", by.y = "submitter_id")

# NOTE(review): Surv() receives only a time, so every sample is treated as an
# observed event (no censoring). If coldata_subset carries an event indicator,
# pass it as the second argument of Surv().
survdiff_res <- survdiff(Surv(overall_survival) ~ strata, data = combined)

# Fit survival model
fit <- survfit(Surv(overall_survival) ~ strata, data = combined)

# Plot Kaplan-Meier curve
ggsurvplot(
  fit,
  data = combined,
  pval = TRUE,
  risk.table = TRUE,
  legend.title = "Strata",
  legend.labs = c("LOW", "HIGH"),
  palette = c("blue", "red")  # LOW = blue, HIGH = red
)


In [None]:
# Kaplan-Meier comparison of samples with HIGH vs LOW mean expression of 50
# random genes drawn from outside genes_to_stratify and LRgenes (seed 100).
# The plot is written to a PDF and also shown in the notebook.

# Get all genes that are NOT in the two lists
excluded_genes <- union(genes_to_stratify, LRgenes)
available_genes <- setdiff(rownames(matrix_vst), excluded_genes)

# Randomly select 50 genes from the remaining ones
set.seed(100)  # For reproducibility
random_genes <- sample(available_genes, 50)

# Long-format table (gene_id, case_id, counts) for the selected genes only.
# The matrix is subset before reshaping so the full gene x sample matrix is
# never expanded to long format.
filtered <- matrix_vst[rownames(matrix_vst) %in% random_genes, , drop = FALSE] %>%
  as.data.frame() %>%
  rownames_to_column(var = "gene_id") %>%
  pivot_longer(-gene_id, names_to = "case_id", values_to = "counts")

# Compute **mean** expression of the selected genes per sample
mean_expression_per_sample <- filtered %>%
  group_by(case_id) %>%
  summarise(mean_expression = mean(counts, na.rm = TRUE))

# Median of the per-sample means; used as the HIGH/LOW cut-off
median_expression <- median(mean_expression_per_sample$mean_expression, na.rm = TRUE)

# Assign strata based on the median cut-off.
# `strata` is stored as a factor with LOW as the first level: survfit() orders
# the groups by factor level (alphabetically for a plain character vector,
# i.e. HIGH before LOW), while ggsurvplot() applies `legend.labs` and
# `palette` by position. With a character column the HIGH curve was drawn in
# blue and labelled "LOW".
mean_expression_per_sample <- mean_expression_per_sample %>%
  mutate(strata = factor(ifelse(mean_expression >= median_expression, "HIGH", "LOW"),
                         levels = c("LOW", "HIGH")))

# Merge clinical data (inner join on the sample / submitter identifier)
combined <- merge(mean_expression_per_sample, coldata_subset, by.x = "case_id", by.y = "submitter_id")

# NOTE(review): Surv(overall_survival) has no event indicator, so every sample
# is treated as an observed event (no censoring). If the clinical table
# carries a vital-status column it probably belongs here as the second
# argument of Surv() -- confirm before interpreting the p-value.

# Log-rank test between the two strata
survdiff_res <- survdiff(Surv(overall_survival) ~ strata, data = combined)

# Fit Kaplan-Meier curves per stratum
fit <- survfit(Surv(overall_survival) ~ strata, data = combined)

# Build the Kaplan-Meier plot once and reuse it for the file and the notebook;
# label and colour order follows the factor levels of `strata`.
km_plot <- ggsurvplot(fit,
                      data = combined,
                      pval = TRUE,
                      risk.table = TRUE,
                      legend.title = "Strata",
                      legend.labs = c("LOW", "HIGH"),
                      palette = c("blue", "red"))  # LOW = blue, HIGH = red

# Write the plot to PDF. The explicit print() does not depend on top-level
# auto-printing, and newpage = FALSE draws on the device's first page instead
# of starting a fresh one (avoids a leading blank page in the PDF).
pdf("Skin_Melanoma_TCGA/TCGA_mel_random_exluding_DE_LR.pdf", width = 8, height = 9)
print(km_plot, newpage = FALSE)
dev.off()

# Show the same plot inline
km_plot
