In [19]:
from rna_seq_normalization import Normalization as Norm
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler

In [20]:
def intersection(lst1, lst2):
    """Return the elements of ``lst1`` that are also contained in ``lst2``.

    The result follows the order of ``lst1`` and keeps its repeated elements.
    """
    shared = []
    for candidate in lst1:
        if candidate in lst2:
            shared.append(candidate)
    return shared

Define the project names, the base directory, and the paths to the raw count tables and the patient-response files of each project.

In [21]:
# Base directory that holds one sub-folder per project
DIR = "/users/genomics/marta/BLCA"

# Projects handled in this notebook
projects = [
    'HdM-BLCA-1',
    'SNY-2017',
    'IMvigor210',
    'UC-GENOME',
    'UNC-108',
]

# Raw table of counts (with gene names) of each project.
# UNC-108 has no entry here: it is imported separately from a TPM matrix.
projects_counts_genenames = dict([
    ('HdM-BLCA-1', '/users/genomics/marta/BLCA/HdM-BLCA-1/analysis/01_counts/CountsTable_genename_20patients.txt'),
    ('SNY-2017', '/users/genomics/marta/BLCA/SNY-2017/analysis/01_counts/CountsTable_genename.txt'),
    ('IMvigor210', '/users/genomics/marta/BLCA/IMvigor210/analysis/01_counts/CountsTable_genename_346.txt'),
    ('UC-GENOME', '/users/genomics/sergiov/UC-Genome/Count_files/gene_names_counts.txt'),
])

# Table with the patients (and their response) of each project
projects_patients = dict([
    ('HdM-BLCA-1', '/users/genomics/marta/BLCA/HdM-BLCA-1/patients_response.csv'),
    ('SNY-2017', '/users/genomics/marta/BLCA/SNY-2017/patients_response.csv'),
    ('IMvigor210', '/users/genomics/marta/BLCA/IMvigor210/patients_response.csv'),
    ('UC-GENOME', '/users/genomics/marta/BLCA/UC-GENOME/patients_response.csv'),
    ('UNC-108', '/users/genomics/marta/BLCA/UNC-108/patients_response.csv'),
])

Create the dictionaries that are used later to convert the sample codes found in the count-table headers into the desired patient ids

In [22]:
## UC-GENOME
# Table relating each 'Run' code to its 'patient' id; the 'Response' column is not needed here
to_change_names_UCG = pd.read_csv(
    "/users/genomics/marta/BLCA/UC-GENOME/patients_run_response.csv"
).drop(columns="Response")
# Run code -> patient id
to_change_names_UCG_dict = dict(
    zip(to_change_names_UCG["Run"], to_change_names_UCG["patient"])
)

In [23]:
## SNYDER
# Table relating each 'index' code to its 'patient' id; the 'Response' column is not needed here
to_change_names_SNY = pd.read_csv(
    "/users/genomics/marta/BLCA/SNY-2017/patients_index_response.csv"
).drop(columns="Response")
# index code -> patient id ('index' is read with brackets because attribute access
# on a dataframe returns its row index instead of the column)
to_change_names_SNY_dict = dict(
    zip(to_change_names_SNY["index"], to_change_names_SNY["patient"])
)
# the keys are cast to str so they can be compared with the codes parsed from the column headers
to_change_names_SNY_dict = dict(
    (str(code), patient_id) for code, patient_id in to_change_names_SNY_dict.items()
)

### Convert counts to TPMs

In [None]:
# Clean the headers of every raw table of counts and convert the counts to TPMs.
#
# For each project in `projects_counts_genenames` this cell:
#   1. reads the patients table and the raw table of counts,
#   2. renames the sample columns (whose header is a BAM file path) to the patient id
#      and drops the samples without immunotherapy response information,
#   3. writes the cleaned table of counts,
#   4. computes the TPMs with `Norm.tpm` and writes them with the gene names only
#      and with the whole gene annotation.

# Non-sample columns of the tables of counts (once 'Geneid' is renamed to 'gene_name')
ANNOTATION_COLS = ["gene_name", "Chr", "Start", "End", "Strand", "Length"]
# IMvigor210 samples whose BAM path contains "BAMs/" instead of "BAM_FILES/"
IMVIGOR_BAMS_DIR_IDS = ('10240', '10304', '10280', '10129', '10265', '10353')

for proj, counts_path in projects_counts_genenames.items():
    print(proj)
    # read patients info
    patients = pd.read_csv(projects_patients[proj])
    patients_list = [str(item) for item in patients.patient.values.tolist()]
    print(len(patients_list), " patients")
    patient_ids = set(patients_list)

    # import the raw table of counts
    tableofcounts = pd.read_csv(counts_path, sep="\t", comment='#')
    tableofcounts = tableofcounts.rename(columns={'Geneid': 'gene_name'})

    # columns to modify (corresponding to patients): their header is a file path
    filter_col = [col for col in tableofcounts if col.startswith('/')]

    rename_map = {}  # raw header -> patient id
    drop_cols = []   # raw headers of the samples without response information
    for col in filter_col:
        if proj == "IMvigor210":
            # get only patient code
            bam_dir = "BAMs/" if any(code in col for code in IMVIGOR_BAMS_DIR_IDS) else "BAM_FILES/"
            sample_code = col.split(bam_dir)[1].split("Aligned")[0]
            # keep only columns corresponding to patients with immunotherapy response
            has_response = sample_code in patient_ids
            new_name = sample_code
        elif proj == "UC-GENOME":
            # get only patient code
            sample_code = col.split("BAM_files/")[1].split("/Aligned")[0]
            has_response = sample_code in to_change_names_UCG_dict
            new_name = to_change_names_UCG_dict[sample_code] if has_response else None
        elif proj == "SNY-2017":
            # get only patient code
            sample_code = col.split("BAM_files/")[1].split(".bam")[0]
            has_response = sample_code in to_change_names_SNY_dict
            new_name = to_change_names_SNY_dict[sample_code] if has_response else None
        else:
            # no header clean-up is defined for the remaining projects
            break

        if has_response:
            # rename the column only with the patient id
            rename_map[col] = new_name
        else:
            # an unknown code means we do not have information about the response
            drop_cols.append(col)

    # apply all the header changes at once; the column order is preserved
    tableofcounts = tableofcounts.drop(columns=drop_cols).rename(columns=rename_map)

    tableofcounts.to_csv(os.path.join(DIR,proj,"analysis/01_counts/counts_genename_clean.txt"), sep="\t", index=False)
    # number of sample columns that were kept
    print(len(tableofcounts.columns) - len(ANNOTATION_COLS))

    # get TPMs
    length = tableofcounts['Length']
    genes = tableofcounts['gene_name']
    info = tableofcounts[ANNOTATION_COLS]
    # we are only interested in the columns with counts
    sample_counts = tableofcounts.drop(columns=ANNOTATION_COLS)
    # calculate TPMs
    tpm_df = Norm.tpm(sample_counts, length)

    # keep only the part of each sample name that precedes the first underscore.
    # Labels are coerced to str: previously a non-string label (e.g. a numeric patient id)
    # raised inside a bare `except` and the whole project was skipped without writing its TPMs.
    tpm_df.columns = [str(item).split("_")[0] for item in tpm_df.columns.tolist()]

    # add gene_names again
    tpms = pd.concat([genes,tpm_df], axis=1)
    tpms.to_csv(os.path.join(DIR,proj,"analysis/01_counts/TPMs_genenames.csv"), index=False)

    # add all info apart from gene_names
    tpms_w_info = pd.concat([info,tpm_df], axis=1)
    tpms_w_info.to_csv(os.path.join(DIR,proj,"analysis/01_counts/TPMs_genenames_whole_information.csv"), index=False)
    print(len(tpms_w_info.columns))


In [72]:
#UNC-108
# This project is imported from a TPM matrix file, so its values are written as TPMs directly
tableofcounts = pd.read_csv(
    "/datasets/sergio/UNC-108/3_Normalized_Counts/GSE176307_BACI_tpm_gene.matrix.tsv",
    sep="\t",
    comment='#',
)
# the first column is taken as the gene name column, whatever its original header is
tableofcounts = tableofcounts.rename(columns={tableofcounts.columns[0]: "gene_name"})

patients = pd.read_csv(projects_patients['UNC-108'])
# get the ones we have immunotherapy response data and RNA-Seq
tableofcounts_patients = tableofcounts.columns
colnames = ['gene_name'] + intersection(patients.patient.values.tolist(), tableofcounts_patients)

tableofcounts[colnames].to_csv(
    "/users/genomics/marta/BLCA/UNC-108/analysis/01_counts/TPMs_genenames.csv",
    index=False,
)


### Z-score normalization of gene expression

In [26]:
def z_score(X, y=None):
    """
    Scales every column of a dataframe with z-score normalization
    :param X: dataframe whose columns are scaled
    :param y: target variable passed on to the scaler, none as default
    :return: Returns a new dataframe with the scaled values and the index and columns of X
    """
    scaled_values = StandardScaler().fit_transform(X, y)
    # the scaled values are wrapped with the row and column labels of the input
    return pd.DataFrame(scaled_values, index=X.index, columns=X.columns)

In [27]:
for proj in projects:
    print(proj)
    # import the TPMs of the project
    TPM = pd.read_csv(os.path.join(DIR, proj, "analysis/01_counts/TPMs_genenames.csv"))
    TPM_genes = TPM[['gene_name']]

    # keep only the values, leaving the gene_name column out
    TPM = TPM.drop(columns=['gene_name'])

    # the normalization is done by column, so the matrix is transposed first:
    # every original row (gene) becomes a column and is scaled on its own
    zscore = z_score(TPM.T)
    zscore.columns = TPM_genes['gene_name'].tolist()

    zscore.to_csv(os.path.join(DIR, proj, "analysis/01_counts/zscores_genenames.csv"))

    # transpose back and expose the gene names as a regular column;
    # the new column is appended after the existing ones
    transposed_zscore = zscore.T
    transposed_zscore = transposed_zscore.assign(gene_name=transposed_zscore.index)
    transposed_zscore.to_csv(
        os.path.join(DIR, proj, "analysis/01_counts/zscores_geneasrows_patientsascols.csv"),
        index=False,
    )


HdM-BLCA-1
SNY-2017
IMvigor210
UC-GENOME
UNC-108
