In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the tab-separated TCGA HNSC expression table into a DataFrame.
# Later cells read its `sample_id` column and treat every column from the
# third one onward as expression values.
RNA_dat = pd.read_csv(
    filepath_or_buffer="~/Desktop/TCGA.HNSC.expression.txt",
    sep="\t",
)

In [6]:
# Split the samples into tumor and normal subsets using the TCGA barcode.
# The fourth '-'-separated field of `sample_id` (index 3, zero-based) holds the
# sample field, e.g. "01A": a two-digit sample-type code plus a vial letter.
sample_encode = RNA_dat['sample_id'].str.split('-').str[3]

# Classify on the two-digit type code only. TCGA reserves 01-09 for tumor
# types and 10-19 for normal types (20-29 are controls).
# The masks are defined positively rather than as "not 01" / "not 11": with
# the complement form, any sample that is neither 01 nor 11 (e.g. 06,
# metastatic) ended up in BOTH subsets, and a malformed barcode produced NaN,
# which the unary `~` cannot handle. Here unparseable codes become NaN and
# `between` treats NaN as False, so such rows fall into neither subset.
sample_type = pd.to_numeric(sample_encode.str[:2], errors='coerce')

normal_samples = sample_type.between(10, 19)
tumor_samples = sample_type.between(1, 9)

RNA_normal = RNA_dat[normal_samples]
RNA_tumor = RNA_dat[tumor_samples]

In [14]:
# Log2(x + 1) transformation to reduce skewness.
# Columns from the third one onward are treated as expression values; the
# normal/tumor frames are row selections of RNA_dat, so one column list
# applies to all three.
expr_columns = RNA_tumor.columns[2:]


def _log2_plus_one(frame):
    """Return a copy of `frame` with log2(x + 1) applied to `expr_columns`.

    The transform is vectorized over the whole expression block instead of
    calling a Python lambda per element through the deprecated
    `DataFrame.applymap`; the resulting values are the same.
    """
    logged = frame.copy()
    logged[expr_columns] = np.log2(frame[expr_columns] + 1)
    return logged


RNA_log_normal = _log2_plus_one(RNA_normal)
RNA_log_normal.to_csv("~/Desktop/TCGA.HNSC.expression_log_normal.txt", sep="\t", index=False)

RNA_log_tumor = _log2_plus_one(RNA_tumor)
RNA_log_tumor.to_csv("~/Desktop/TCGA.HNSC.expression_log_tumor.txt", sep="\t", index=False)

RNA_log_all = _log2_plus_one(RNA_dat)
RNA_log_all.to_csv("~/Desktop/TCGA.HNSC.expression_log_all.txt", sep="\t", index=False)

In [12]:
# Z-score normalization by gene (i.e., column-wise normalization).
# Each subset is standardized with its OWN per-gene mean and standard
# deviation, so z-scores are not directly comparable across the three outputs.


def _zscore_by_gene(frame):
    """Return a copy of `frame` with each `expr_columns` column z-scored.

    Uses the column mean and the sample standard deviation (pandas default,
    ddof=1), computed in one vectorized pass over the expression block.
    A gene with zero variance in `frame` yields NaN (0 / 0), and a frame with
    a single row yields NaN everywhere because its standard deviation is
    undefined.
    """
    scaled = frame.copy()
    values = frame[expr_columns]
    scaled[expr_columns] = (values - values.mean()) / values.std()
    return scaled


RNA_norm_normal = _zscore_by_gene(RNA_log_normal)
RNA_norm_tumor = _zscore_by_gene(RNA_log_tumor)
RNA_norm_all = _zscore_by_gene(RNA_log_all)

RNA_norm_normal.to_csv("~/Desktop/TCGA.HNSC.expression_log_zscore_normal.txt", sep="\t", index=False)
RNA_norm_tumor.to_csv("~/Desktop/TCGA.HNSC.expression_log_zscore_tumor.txt", sep="\t", index=False)
RNA_norm_all.to_csv("~/Desktop/TCGA.HNSC.expression_log_zscore_all.txt", sep="\t", index=False)