In [1]:
import pandas as pd
import re
import numpy as np
import json
PATH_GTF_FILE = "/work/h2020deciderficarra_shared/TCGA/OV/project_n16_data/gencode.v47.annotation.gtf"
PATH_EDGE_FILE = "/work/h2020deciderficarra_shared/TCGA/OV/project_n16_data/GeneProcessedData/9606.protein.links.v12.0.ENSG.txt"
PATH_ORDER_GENE = "/work/h2020deciderficarra_shared/TCGA/OV/project_n16_data/GeneProcessedData/gene_variance_order_tpm_unstranded.json"

def remove_version(x):
    if '.' in x:
        return x.split('.')[0]
    return x

def read_gtf_file(gtf_file_path, edge_file_path, variance_order_list_path, number_of_nodes):
    # We read the GTF file and the edge file, to keep only the gene that we have there.
    gtf = pd.read_csv(gtf_file_path, sep="\t", header=None, comment='#')
    gtf.columns = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']

    parameters = ['gene_id', 'gene_name', 'gene_type']
    for p in parameters:
        gtf[p] = gtf['attribute'].apply(lambda x: re.findall(rf'{p} "([^"]*)"', x)[0] if rf'{p} "' in x else np.nan)

    gtf.drop('attribute', axis=1, inplace=True)
    gtf['gene_id'] = gtf['gene_id'].apply(remove_version)
    gtf_pc = gtf[gtf['gene_type'] == 'protein_coding']

    # Protein coding set
    pc_set = set(gtf_pc['gene_id'].to_list())
    print(f"\n\tProtein coding dim: {len(pc_set)}")

    accepted_gene = set()
    with open(edge_file_path, 'r') as file:
        for line in file:
            # print(row_index)
            f = line.split(" ")[0]
            s = line.split(" ")[1]
            accepted_gene.add(f)
            accepted_gene.add(s)

    print(f"\tAccepted gene dim: {len(accepted_gene)}")

    pc_set = pc_set.intersection(accepted_gene)

    print(f"\tIntersection with accepted gene dim: {len(pc_set)}")

    # Take only the first n nodes in order of variance.
    # with open(variance_order_list_path, 'r') as file:
    #     list_of_nodes = json.load(file)

    # pc_set = pc_set.intersection(set(list_of_nodes[:number_of_nodes]))

    print(f"\tIntersection dim: {len(pc_set)}\n\t\tExecution time: ", end="")
    return gtf_pc, pc_set

gtf_pc, pc_set = read_gtf_file(PATH_GTF_FILE, PATH_EDGE_FILE, PATH_ORDER_GENE, 6000)

  from pandas.core import (



	Protein coding dim: 20092
	Accepted gene dim: 19179
	Intersection with accepted gene dim: 19158
	Intersection dim: 19158
		Execution time: 

In [9]:
gtf_pc

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,gene_id,gene_name,gene_type
2486,chr1,HAVANA,gene,65419,71585,.,+,.,ENSG00000186092,OR4F5,protein_coding
2487,chr1,HAVANA,transcript,65419,71585,.,+,.,ENSG00000186092,OR4F5,protein_coding
2488,chr1,HAVANA,exon,65419,65433,.,+,.,ENSG00000186092,OR4F5,protein_coding
2489,chr1,HAVANA,exon,65520,65573,.,+,.,ENSG00000186092,OR4F5,protein_coding
2490,chr1,HAVANA,CDS,65565,65573,.,+,0,ENSG00000186092,OR4F5,protein_coding
...,...,...,...,...,...,...,...,...,...,...,...
4105474,chrM,ENSEMBL,gene,14747,15887,.,+,.,ENSG00000198727,MT-CYB,protein_coding
4105475,chrM,ENSEMBL,transcript,14747,15887,.,+,.,ENSG00000198727,MT-CYB,protein_coding
4105476,chrM,ENSEMBL,exon,14747,15887,.,+,.,ENSG00000198727,MT-CYB,protein_coding
4105477,chrM,ENSEMBL,CDS,14747,15887,.,+,0,ENSG00000198727,MT-CYB,protein_coding


In [2]:
# Rimuovi righe duplicate su gene_id e gene_name
gtf_pc = gtf_pc.drop_duplicates(subset=['gene_id', 'gene_name'])
gtf_pc

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,gene_id,gene_name,gene_type
2486,chr1,HAVANA,gene,65419,71585,.,+,.,ENSG00000186092,OR4F5,protein_coding
6557,chr1,HAVANA,gene,450740,451678,.,-,.,ENSG00000284733,OR4F29,protein_coding
7172,chr1,HAVANA,gene,685716,686654,.,-,.,ENSG00000284662,OR4F16,protein_coding
8430,chr1,HAVANA,gene,923923,944575,.,+,.,ENSG00000187634,SAMD11,protein_coding
8735,chr1,HAVANA,gene,944203,959309,.,-,.,ENSG00000188976,NOC2L,protein_coding
...,...,...,...,...,...,...,...,...,...,...,...
4105439,chrM,ENSEMBL,gene,10470,10766,.,+,.,ENSG00000212907,MT-ND4L,protein_coding
4105446,chrM,ENSEMBL,gene,10760,12137,.,+,.,ENSG00000198886,MT-ND4,protein_coding
4105460,chrM,ENSEMBL,gene,12337,14148,.,+,.,ENSG00000198786,MT-ND5,protein_coding
4105466,chrM,ENSEMBL,gene,14149,14673,.,-,.,ENSG00000198695,MT-ND6,protein_coding


In [3]:
import pandas as pd
import re
import numpy as np
import json
import torch
import os
import time

class LPDEdgeKnowledgeBased:
    def __init__(self, gtf_file_path: str, folder_gene_path: str, folder_methylation_path:str,
                 folder_copy_number_path:str, case_id_json_path: str, methylation_converter_file_path:str,
                 test_file_case_id_path: str, train_file_case_id_path: str, edge_file_path: str,
                 variance_order_list_path: str,
                 feature_to_save_dict: dict, number_of_nodes: str):
        
        # Path variable
        self.gtf_file_path = gtf_file_path
        self.folder_gene_path = folder_gene_path
        self.folder_methylation_path = folder_methylation_path
        self.folder_copy_number_path = folder_copy_number_path
        self.case_id_json_path = case_id_json_path
        self.methylation_converter_file_path = methylation_converter_file_path
        self.test_file_case_id_path = test_file_case_id_path
        self.train_file_case_id_path = train_file_case_id_path
        self.edge_file_path = edge_file_path
        self.variance_order_list_path = variance_order_list_path

        # Data to create dataset
        self.feature_to_save_dict = feature_to_save_dict
        self.number_of_nodes = number_of_nodes

    def measure_time(func):
        def wrapper(self, *arg, **kw):
            start_time = time.time()
            ret = func(self, *arg, **kw)
            print(f"\t\t{np.floor(time.time() - start_time)}s")
            return ret
        return wrapper
    
    def remove_version(self, x):
        if '.' in x:
            return x.split('.')[0]
        return x

    @measure_time
    def read_gtf_file(self):
        # We read the GTF file and the edge file, to keep only the gene that we have there.
        gtf = pd.read_csv(self.gtf_file_path, sep="\t", header=None, comment='#')
        gtf.columns = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']

        parameters = ['gene_id', 'gene_type', 'gene_name']
        for p in parameters:
            gtf[p] = gtf['attribute'].apply(lambda x: re.findall(rf'{p} "([^"]*)"', x)[0] if rf'{p} "' in x else np.nan)

        gtf.drop('attribute', axis=1, inplace=True)
        gtf['gene_id'] = gtf['gene_id'].apply(self.remove_version)
        gtf_pc = gtf[gtf['gene_type'] == 'protein_coding']

        # Protein coding set
        self.pc_set = set(gtf_pc['gene_id'].to_list())
        print(f"\n\tProtein coding dim: {len(self.pc_set)}")

        accepted_gene = set()
        with open(self.edge_file_path, 'r') as file:
            for line in file:
                # print(row_index)
                f = line.split(" ")[0]
                s = line.split(" ")[1]
                accepted_gene.add(f)
                accepted_gene.add(s)

        print(f"\tAccepted gene dim: {len(accepted_gene)}")

        self.pc_set = self.pc_set.intersection(accepted_gene)

        print(f"\tIntersection with accepted gene dim: {len(self.pc_set)}")

        # Take only the first n nodes in order of variance.
        # with open(self.variance_order_list_path, 'r') as file:
        #     list_of_nodes = json.load(file)

        # self.pc_set = self.pc_set.intersection(set(list_of_nodes[:self.number_of_nodes]))

        print(f"\tIntersection dim: {len(self.pc_set)}\n\t\tExecution time: ", end="")

    @measure_time
    def preprocessing_gene(self):
        with open(self.case_id_json_path, 'r') as file:
            file_parsed = json.load(file)
        file_to_case_id = dict((file_parsed[k]['files']['gene'], k) for k in file_parsed.keys())
        file_to_os = dict((file_parsed[k]['files']['gene'], file_parsed[k]['os']) for k in file_parsed.keys())

        self.datastructure_gene = pd.DataFrame(columns=['case_id', 'os', 'values'])
        feature_to_save = self.feature_to_save_dict['gene']

        index = 0
        # Now explore data path to get the right files
        for root, dirs, files in os.walk(self.folder_gene_path):
            for dir in dirs:
                for root, dirs, files in os.walk(self.folder_gene_path + "/" + dir):
                    for file in files:
                        if file in file_to_case_id.keys():
                            parsed_file = pd.read_csv(self.folder_gene_path + "/" + dir + "/" + file,
                                                    sep='\t', header=0, skiprows=lambda x: x in [0, 2, 3, 4, 5])
                            parsed_file = parsed_file[['gene_id'] + feature_to_save]
                            # Now specify columns type.
                            convert_dict = dict([(k, float) for k in feature_to_save])
                            convert_dict['gene_id'] = str
                            parsed_file = parsed_file.astype(convert_dict)
                            parsed_file['gene_id'] = parsed_file['gene_id'].apply(self.remove_version)
                            parsed_file = parsed_file[parsed_file['gene_id'].isin(self.pc_set)]

                            self.datastructure_gene.loc[index] = [
                                file_to_case_id[file],
                                file_to_os[file],
                                parsed_file
                            ]
                            index += 1

        # Apply log.
        for i in range(self.datastructure_gene.shape[0]):
            self.datastructure_gene['values'].loc[i][feature_to_save] = self.datastructure_gene['values'].loc[i][feature_to_save].applymap(lambda x: np.log10(x + 0.01))
        
        # Make value in a [0, 1] range.
        for r in range(self.datastructure_gene.shape[0]):
            for c in feature_to_save:
                self.datastructure_gene['values'].loc[r][c] =    (self.datastructure_gene['values'].loc[r][c] - self.datastructure_gene['values'].loc[r][c].min()) / \
                                                                (self.datastructure_gene['values'].loc[r][c].max() - self.datastructure_gene['values'].loc[r][c].min())
    
    def convert_methylation_to_gene(self, methylation_id, conversion_dict):
        return conversion_dict.get(methylation_id, None)

    def preprocessing_methylation(self):
        # Load the file path dictionary
        with open(self.case_id_json_path, 'r') as file:
            file_parsed = json.load(file)

        # Create dictionaries for case_id and os
        file_to_case_id = {file_parsed[k]['files']['methylation']: k for k in file_parsed.keys()}
        file_to_os = {file_parsed[k]['files']['methylation']: file_parsed[k]['os'] for k in file_parsed.keys()}

        # Initialize the DataFrame
        self.datastructure_methylation = pd.DataFrame(columns=['case_id', 'os','values'])
        feature_to_save = self.feature_to_save_dict['methylation']
        index = 0
        for root, dirs, files in os.walk(self.folder_methylation_path):
            for dir in dirs:
                for root, dirs, files in os.walk(os.path.join(self.folder_methylation_path, dir)):
                    for file in files:
                        if file in file_to_case_id.keys():
                            parsed_file = pd.read_csv(os.path.join(self.folder_methylation_path, dir, file),
                                                      sep='\t', header=None, names=["id", "methylation"])

                            convert_dict = dict([(k, float) for k in feature_to_save])
                            convert_dict['id'] = str
                            parsed_file = parsed_file.astype(convert_dict)
                            parsed_file = parsed_file.dropna()
                            # Add the data to the DataFrame
                            self.datastructure_methylation.loc[index] = [
                                file_to_case_id[file],
                                file_to_os[file],
                                parsed_file
                            ]
                            index += 1

        # Carica il file di conversione
        conversion_df = pd.read_csv(self.methylation_converter_file_path, dtype = {'gene_id': str, 'gene_chr': str, 'gene_strand': str, 'gene_start': str, 'gene_end': str, 'cpg_island': str, 'cpg_IlmnID': str, 'cpg_chr': str})
        # Crea un dizionario per la conversione rapida
        conversion_dict = pd.Series(conversion_df.gene_id.values, index=conversion_df.cpg_IlmnID).to_dict()
        # Crea una nuova colonna 'gene_id' nel DataFrame
        number_of_duplicate_list = []
        for i in range(self.datastructure_methylation.shape[0]):
            self.datastructure_methylation['values'].iloc[i]['gene_id'] = self.datastructure_methylation['values'].iloc[i]['id'].apply(lambda x: self.convert_methylation_to_gene(x, conversion_dict))
            self.datastructure_methylation.at[i, 'values'] = self.datastructure_methylation.at[i, 'values'].drop(columns=['id'])
            self.datastructure_methylation.at[i, 'values'] = self.datastructure_methylation.at[i, 'values'][
                self.datastructure_methylation.at[i, 'values']['gene_id'].isin(self.pc_set)
            ]
            
            number_of_duplicate_list.append(len([v for v in self.datastructure_methylation['values'].loc[i]['gene_id'].duplicated() if v == True]))
            self.datastructure_methylation.at[i, 'values'] = self.datastructure_methylation.at[i, 'values'].drop_duplicates(subset=['gene_id'])
            assert self.datastructure_methylation['values'].loc[i]['gene_id'].duplicated().any() == False
        print("")
        print("\t\tNumber of duplicate gene:")
        print(f"\t\t\tmin: {min(number_of_duplicate_list)}")
        print(f"\t\t\tmax: {max(number_of_duplicate_list)}")
        print(f"\t\t\tavg: {0 if len(number_of_duplicate_list) == 0 else sum(number_of_duplicate_list)/len(number_of_duplicate_list)}")

        # Make value in a [0, 1] range.
        for r in range(self.datastructure_methylation.shape[0]):
            for c in feature_to_save:
                self.datastructure_methylation['values'].loc[r][c] =   (self.datastructure_methylation['values'].loc[r][c] - self.datastructure_methylation['values'].loc[r][c].min()) / \
                                                                (self.datastructure_methylation['values'].loc[r][c].max() - self.datastructure_methylation['values'].loc[r][c].min())        

    @measure_time
    def preprocessing_copy_number(self):
        with open(self.case_id_json_path, 'r') as file:
            file_parsed = json.load(file)
        file_to_case_id = dict((file_parsed[k]['files']['copy_number'], k) for k in file_parsed.keys())
        file_to_os = dict((file_parsed[k]['files']['copy_number'], file_parsed[k]['os']) for k in file_parsed.keys())
        self.datastructure_copy_number = pd.DataFrame(columns=['case_id', 'os', 'values'])
        feature_to_save = self.feature_to_save_dict['copy_number']
        index = 0
        # Now explore data path to get the right files
        for root, dirs, files in os.walk(self.folder_copy_number_path):
            for dir in dirs:
                for root, dirs, files in os.walk(self.folder_copy_number_path + "/" + dir):
                    for file in files:
                        if file in file_to_case_id.keys():
                            parsed_file = pd.read_csv(self.folder_copy_number_path + "/" + dir + "/" + file, sep='\t')
                            parsed_file = parsed_file[['gene_id'] + feature_to_save]
                            convert_dict = dict([(k, float) for k in feature_to_save])
                            convert_dict['gene_id'] = str
                            parsed_file = parsed_file.astype(convert_dict)
                            parsed_file['gene_id'] = parsed_file['gene_id'].apply(self.remove_version)
                            parsed_file = parsed_file[parsed_file['gene_id'].isin(self.pc_set)].fillna(0)
                            self.datastructure_copy_number.loc[index] = [
                                file_to_case_id[file],
                                file_to_os[file],
                                parsed_file
                            ]
                            index += 1
        # Make value in a [0, 1] range.
        for r in range(self.datastructure_copy_number.shape[0]):
            for c in feature_to_save:
                self.datastructure_copy_number['values'].loc[r][c] =    (self.datastructure_copy_number['values'].loc[r][c] - self.datastructure_copy_number['values'].loc[r][c].min()) / \
                                                                        (self.datastructure_copy_number['values'].loc[r][c].max() - self.datastructure_copy_number['values'].loc[r][c].min())

    @measure_time
    def datastructure_merge_func(self):
        self.datastructure_merge = pd.DataFrame(columns=['case_id', 'os', 'values'])
        Number_of_miss_case_id_methylation = 0
        Number_of_miss_case_id_copy_number = 0
        Number_of_miss_on_both = 0
        merge_index = 0
        final_number_of_node = []
        for index in range(self.datastructure_gene.shape[0]):
            curr_case_id = self.datastructure_gene['case_id'].loc[index]
            curr_gene_datastructure = self.datastructure_gene[self.datastructure_gene['case_id'] == curr_case_id]
            curr_methylation_datastructure = self.datastructure_methylation[self.datastructure_methylation['case_id'] == curr_case_id]
            curr_copy_number_datastructure = self.datastructure_copy_number[self.datastructure_copy_number['case_id'] == curr_case_id]

            assert curr_gene_datastructure.shape[0] == 1
            assert curr_methylation_datastructure.shape[0] <= 1
            assert curr_copy_number_datastructure.shape[0] <= 1

            if curr_methylation_datastructure.shape[0] == 0:
                if curr_copy_number_datastructure.shape[0] == 0:
                    Number_of_miss_on_both += 1
                    continue
                else:
                    Number_of_miss_case_id_methylation += 1
            if curr_copy_number_datastructure.shape[0] == 0:
                Number_of_miss_case_id_copy_number += 1

            merged_value =  curr_gene_datastructure['values'].iloc[0].merge(curr_copy_number_datastructure['values'].iloc[0], on='gene_id', how='inner')
            merged_value = merged_value.merge(curr_methylation_datastructure['values'].iloc[0], on='gene_id', how='inner')
            final_number_of_node.append(merged_value.shape[0])
                                                                    
            self.datastructure_merge.loc[merge_index] = [
                curr_case_id,
                curr_gene_datastructure['os'],
                merged_value
            ]
            merge_index += 1

        print("")
        print("\t\tNumber of final node:")
        print(f"\t\t\tmin: {min(final_number_of_node)}")
        print(f"\t\t\tmax: {max(final_number_of_node)}")
        print(f"\t\t\tavg: {0 if len(final_number_of_node) == 0 else sum(final_number_of_node)/len(final_number_of_node)}")
        print(f"\t\tNumber of case_id miss due to methylation: {Number_of_miss_case_id_methylation}")
        print(f"\t\tNumber of case_id miss due to copy number: {Number_of_miss_case_id_copy_number}")
        print(f"\t\tNumber of case_id miss due to both: {Number_of_miss_case_id_copy_number}")
        
        return self.datastructure_merge

    def get_data(self):
        print("Read GTF file\t", end="")
        self.read_gtf_file()
        print("Start preprocessing Methylation", end="")
        self.preprocessing_methylation()
        print("Start preprocessing Copy Number", end="")
        self.preprocessing_copy_number()
        print("Start preprocessing Gene", end="")
        self.preprocessing_gene()
        print("Start merge", end="")
        data_merged = self.datastructure_merge_func()

        return data_merged

In [5]:
# Load data path
PATH_GTF_FILE = "/work/h2020deciderficarra_shared/TCGA/OV/project_n16_data/gencode.v47.annotation.gtf"
PATH_FOLDER_GENE = "/work/h2020deciderficarra_shared/TCGA/OV/project_n16_data/GeneExpression"
PATH_FOLDER_COPY_NUMBER = "/work/h2020deciderficarra_shared/TCGA/OV/project_n16_data/CopyNumber"
PATH_FOLDER_METHYLATION = "/work/h2020deciderficarra_shared/TCGA/OV/project_n16_data/Methylation"
# PATH_CASE_ID_STRUCTURE = "/work/h2020deciderficarra_shared/TCGA/OV/project_n16_data/case_id_and_structure.json"
PATH_CASE_ID_STRUCTURE = "/homes/dlupo/Progetto_BioInformatics/AI_for_Bioinformatics_Project/GGNN_main/case_id_and_structure_ALL.json"


PATH_METHYLATION_CONVERTER = "/work/h2020deciderficarra_shared/TCGA/OV/project_n16_data/matched_cpg_genes.csv"

# For edge similarity files.
PATH_EDGE_FILE = "/work/h2020deciderficarra_shared/TCGA/OV/project_n16_data/GeneProcessedData/9606.protein.links.v12.0.ENSG.txt"
# Order of nodes files.
PATH_ORDER_GENE = "/work/h2020deciderficarra_shared/TCGA/OV/project_n16_data/GeneProcessedData/gene_variance_order_tpm_unstranded.json"
# Test and Train separation file.
PATH_TEST_CLASS = "/work/h2020deciderficarra_shared/TCGA/OV/project_n16_data/GeneProcessedData/test_separation_2_classes.json"
PATH_TRAIN_CLASS = "/work/h2020deciderficarra_shared/TCGA/OV/project_n16_data/GeneProcessedData/train_separation_2_classes.json"
hyperparameter = {
    'feature_to_save':{
        'gene': ['unstranded', 'tpm_unstranded', 'fpkm_unstranded', 'fpkm_uq_unstranded'],
        'methylation': ['methylation'],
        'copy_number': ['copy_number']
        },
    'num_nodes': 6500
}
lpd = LPDEdgeKnowledgeBased(PATH_GTF_FILE, PATH_FOLDER_GENE, PATH_FOLDER_METHYLATION,
                                PATH_FOLDER_COPY_NUMBER, PATH_CASE_ID_STRUCTURE, PATH_METHYLATION_CONVERTER,
                                PATH_TEST_CLASS, PATH_TRAIN_CLASS, PATH_EDGE_FILE, 
                                PATH_ORDER_GENE, 
                                hyperparameter['feature_to_save'], hyperparameter['num_nodes'])
data_merged = lpd.get_data()  # List of Data.

Read GTF file	
	Protein coding dim: 20092
	Accepted gene dim: 19179
	Intersection with accepted gene dim: 19158
	Intersection dim: 19158
		Execution time: 		55.0s
Start preprocessing Methylation
		Number of duplicate gene:
			min: 345
			max: 420
			avg: 405.7308641975309
Start preprocessing Copy Number		103.0s
Start preprocessing Gene

  self.datastructure_gene['values'].loc[i][feature_to_save] = self.datastructure_gene['values'].loc[i][feature_to_save].applymap(lambda x: np.log10(x + 0.01))
  self.datastructure_gene['values'].loc[i][feature_to_save] = self.datastructure_gene['values'].loc[i][feature_to_save].applymap(lambda x: np.log10(x + 0.01))
  self.datastructure_gene['values'].loc[i][feature_to_save] = self.datastructure_gene['values'].loc[i][feature_to_save].applymap(lambda x: np.log10(x + 0.01))
  self.datastructure_gene['values'].loc[i][feature_to_save] = self.datastructure_gene['values'].loc[i][feature_to_save].applymap(lambda x: np.log10(x + 0.01))
  self.datastructure_gene['values'].loc[i][feature_to_save] = self.datastructure_gene['values'].loc[i][feature_to_save].applymap(lambda x: np.log10(x + 0.01))
  self.datastructure_gene['values'].loc[i][feature_to_save] = self.datastructure_gene['values'].loc[i][feature_to_save].applymap(lambda x: np.log10(x + 0.01))
  self.datastructure_gene['values'].loc[i][fea

		178.0s
Start merge
		Number of final node:
			min: 5237
			max: 6024
			avg: 5868.103703703704
		Number of case_id miss due to methylation: 0
		Number of case_id miss due to copy number: 0
		Number of case_id miss due to both: 0
		7.0s


In [6]:
gtf_mapping = gtf_pc[['gene_id', 'gene_name']]

In [7]:
for i in range(len(data_merged['values'])):
    data_merged['values'][i] = data_merged['values'][i].merge(gtf_mapping[['gene_id', 'gene_name']], on='gene_id', how='left')

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  data_merged['values'][i] = data_merged['values'][i].merge(gtf_mapping[['gene_id', 'gene_name']], on='gene_id', how='left')
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the defaul

In [8]:
hprd = pd.read_csv('data/hprd.txt', header=None)
hprd_gene_names = set(hprd[0])

for i in range(len(data_merged['values'])):
    data_merged['values'][i] = data_merged['values'][i][data_merged['values'][i]['gene_name'].isin(hprd_gene_names)]

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  data_merged['values'][i] = data_merged['values'][i][data_merged['values'][i]['gene_name'].isin(hprd_gene_names)]


In [14]:
data_merged['values'][404]

Unnamed: 0,gene_id,unstranded,tpm_unstranded,fpkm_unstranded,fpkm_uq_unstranded,copy_number,methylation,gene_name
1,ENSG00000001167,0.574480,0.420676,0.389704,0.404265,0.222222,0.019195,NFYA
4,ENSG00000001617,0.659780,0.507610,0.481208,0.493627,0.222222,0.005535,SEMA3F
6,ENSG00000002330,0.644718,0.557628,0.533893,0.545059,0.444444,0.012597,BAD
8,ENSG00000002822,0.311535,0.083839,0.054180,0.067004,0.222222,0.006820,MAD1L1
9,ENSG00000002834,0.675468,0.501501,0.474775,0.487346,0.222222,0.006680,LASP1
...,...,...,...,...,...,...,...,...
5896,ENSG00000277443,0.627132,0.476027,0.447951,0.461155,0.222222,0.026005,MARCKS
5897,ENSG00000277462,0.557988,0.394582,0.362275,0.377457,0.666667,0.024045,ZNF670
5899,ENSG00000277791,0.730625,0.685775,0.668906,0.676843,0.222222,0.014449,PSMB3
5902,ENSG00000278195,0.000000,0.000000,0.000000,0.000000,0.222222,0.281866,SSTR3


In [15]:
import pandas as pd
from functools import reduce
import csv

# --- CONFIGURAZIONE ---
# Colonne di espressione da estrarre
expression_cols = ['unstranded', 'tpm_unstranded', 'fpkm_unstranded', 'fpkm_uq_unstranded']
# Qui scelgo 'unstranded', puoi cambiarlo
chosen_expression_col = 'unstranded'

# --- LISTE DI DATAFRAME PER PAZIENTE ---
expression_dfs = []
copy_number_dfs = []
methylation_dfs = []

for _, row in data_merged.iterrows():
    case_id = row['case_id']
    values_df = row['values']
    
    # Gene Expression (esempio con 'unstranded')
    expr_df = values_df[['gene_name', chosen_expression_col]].rename(columns={chosen_expression_col: case_id})
    expression_dfs.append(expr_df)
    
    # Copy Number
    copy_df = values_df[['gene_name', 'copy_number']].rename(columns={'copy_number': case_id})
    copy_number_dfs.append(copy_df)
    
    # Methylation
    meth_df = values_df[['gene_name', 'methylation']].rename(columns={'methylation': case_id})
    methylation_dfs.append(meth_df)

# --- MERGE SU gene_name ---
expr_merged = reduce(lambda l, r: pd.merge(l, r, on='gene_name', how='outer'), expression_dfs)
copy_merged = reduce(lambda l, r: pd.merge(l, r, on='gene_name', how='outer'), copy_number_dfs)
meth_merged = reduce(lambda l, r: pd.merge(l, r, on='gene_name', how='outer'), methylation_dfs)

# --- ORDINA ---
expr_merged = expr_merged.sort_values('gene_name').reset_index(drop=True)
copy_merged = copy_merged.sort_values('gene_name').reset_index(drop=True)
meth_merged = meth_merged.sort_values('gene_name').reset_index(drop=True)

# --- OPZIONALE: riempi NaN con 0 ---
expr_merged = expr_merged.fillna(0)
copy_merged = copy_merged.fillna(0)
meth_merged = meth_merged.fillna(0)

# --- SALVA CSV con virgolette su header/gene_id ma NON sui numeri ---
expr_merged.to_csv('data/ov_tcga4/out/RNA.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)
copy_merged.to_csv('data/ov_tcga4/out/CNA.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)
meth_merged.to_csv('data/ov_tcga4/out/Methyl.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)
