# In out file. 
On this file I have to put all the methods related to the data processing. 
* Add metadata 
* Label stuff
* Outlier treatment !!! TODO
* Missings 
* Split

I'll make it in the jupyter and then I'll move it into a py file, once it is tested and working. 

````
Input: data_path/allfiles  + data_path/metadatos_v2.0.txt
Output: name.csv or name_train.csv, name_train_target.csv, name_test.csv, name_test_target.csv

````

In [1]:
import os
import sys
sys.path.insert(1, '../../src')

import warnings
warnings.simplefilter('ignore')
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")


from datetime import timedelta
import time
import numpy as np
import pandas as pd
import networkx as nx

from fancyimpute import IterativeImputer
from sklearn.model_selection import train_test_split
from natsort import natsorted
from matplotlib import pyplot as plt
import gc

Using TensorFlow backend.


In [2]:
# Data paths:
DATA_PATH = '../definitive_data_folder'
PATIENTS_PATH = DATA_PATH + '/allfiles'
# The prgram will try to load the csv, if the csv does not exist it will generate it ussing the txt. 
METADATA_PATH = DATA_PATH + '/metadatos_v2.0.csv'

if not os.path.exists(METADATA_PATH):
    generate_metadata_csv()
    
OUTPUT_PATH = DATA_PATH + '/datasets'

try: os.mkdir(DATA_PATH)
except: pass
try: os.mkdir(OUTPUT_PATH)
except: pass

# Globals
labels=['ECTODERM', 'NEURAL_CREST', 'MESODERM', 'ENDODERM']
hist2 = np.array(['Biliary', 'Bladder', 'Bone/SoftTissue', 'Breast', 'CNS', 'Cervix',
       'Colon/Rectum', 'Esophagus', 'Head/Neck', 'Kidney', 'Liver',
       'Lung', 'Lymphoid', 'Myeloid', 'Ovary', 'Pancreas', 'Prostate',
       'Skin', 'Stomach', 'Thyroid', 'Uterus'])
chromosomes = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18',
               '19', '20', '21', '22', 'X', 'Y']
svclass = ['DEL', 'DUP', 'TRA', 'h2hINV', 't2tINV']
k = 300
TOMMY = '43dadc68-c623-11e3-bf01-24c6515278c0'

In [3]:
def generate_metadata_csv():
    """
    This function generates a real dataset using the txt given and saves it as a csv.
    :return:
    """
    data = pd.DataFrame(
        columns=['sampleID', 'donor_sex', 'donor_age_at_diagnosis', 'histology_tier1', 'histology_tier2',
                 'tumor_stage1', 'tumor_stage2'])

    with open(METADATA_PATH.replace('.csv','.txt')) as f:
        for l in f:
            words = l.split()
            id = words[0]
            sex = words[1]
            age = words[2]
            tier1 = words[3]
            tier2 = words[4]
            tumor_stage1 = '_'.join(words[5:7])
            tumor_stage2 = '_'.join(words[8:])
            data = data.append({'sampleID': id, 'donor_sex': sex, 'donor_age_at_diagnosis': age,
                                'histology_tier1': tier1, 'histology_tier2': tier2,
                                'tumor_stage1': tumor_stage1, 'tumor_stage2': tumor_stage2}, ignore_index=True)

    data = data.drop(data.index[0])
    data.to_csv(METADATA_PATH, index=False)

In [4]:
def generateTRAGraph(patient):
    '''
    This function generates a graph per patient representing the traslocations of this patient.
    
    vertex: Chromosomes
    edge: the number of traslocations between each chromosome

    Input:
        patient(string):  The patient id.
    Output:
        graph: networkx format
        edge_list: List with the format:
                    node1 node2 weight    (edge between node1 and node2 with weight weight)
    '''
    patient_path = PATIENTS_PATH + '/'+ patient + '.vcf.tsv'
    
    # Load the patient breaks, and select only the traslocations
    patient_breaks = pd.read_csv(patient_path, sep='\t', index_col=None)
    
    # patient_breaks['chrom2'] = patient_breaks['chrom2'].map(str)

    only_TRA = patient_breaks.loc[patient_breaks['svclass'] == 'TRA']

    # The crosstab is equivalent to the adjacency matrix, so we use this to calculate it
    ct_tra = pd.crosstab(only_TRA['#chrom1'], only_TRA['chrom2'])

    ct_tra.index = ct_tra.index.map(str)
    adjacency_matrix_connected_only = ct_tra

    aux = pd.DataFrame(0,columns=chromosomes, index=chromosomes)
    aux.index = aux.index.map(str)

    ct_tra = aux.add(ct_tra,fill_value=0)
    aux = None
    # Reorder
    ct_tra = ct_tra.reindex(index=natsorted(ct_tra.index))
    ct_tra = ct_tra[chromosomes]
    # change the values to int
    ct_tra = ct_tra.astype(int)

    # Generate the adjacency matrix
    adjacency_matrix = pd.DataFrame(data=ct_tra.values,
                                columns=chromosomes, index=chromosomes)
    # print(adjacency_matrix)
    graph = nx.from_pandas_adjacency(adjacency_matrix)
    graph.to_undirected()
    
    # Remove isolated vertices 
    graph.remove_nodes_from(list(nx.isolates(graph)))

    edge_list = nx.generate_edgelist(graph,data=['weight'])
    return graph, edge_list

In [5]:
def nan_imputing(df):
    """
    There is only one feature with nans. Donor age at diagnosis. 
    We impute it using the KNN strategy
    :param df:
    :return:
    """
    # Imput missing data with mice
    fancy_imputed = df
    dummies = pd.get_dummies(df)
    imputed = pd.DataFrame(data=IterativeImputer().fit_transform(dummies), columns=dummies.columns, index=dummies.index)
    fancy_imputed.donor_age_at_diagnosis = imputed.donor_age_at_diagnosis
    fancy_imputed['donor_age_at_diagnosis'] = fancy_imputed['donor_age_at_diagnosis'].astype(np.int)
    return fancy_imputed

In [1]:
def preprocessing_without_split(X):
    # this function is only ment for data analysis
    X['donor_sex'] = X['donor_sex'].str.replace('female','1')
    X['donor_sex'] = X['donor_sex'].str.replace('male','0')

    X['female'] = pd.to_numeric(X['donor_sex'])
    X = X.drop('donor_sex',axis=1)
    # X['number_of_breaks'] = X['DUP'] + X['DEL'] + X['TRA'] + X['h2hINV'] + X['t2tINV']
    for column in X.columns:
        if 'chr' in column:
            X['proportion_' + column] = 0
            X[['proportion_' + column]] = np.true_divide(np.float32(X[[column]]),
                                                               np.float32(X[['number_of_breaks']]))

        if 'DUP' in column or 'DEL' in column or 'TRA' in column or 'h2hINV' in column or 't2tINV' in column:
            X['proportion_' + column] = 0
            X[['proportion_' + column]] = np.true_divide(np.float32(X[[column]]),
                                                               np.float32(X[['number_of_breaks']]))
    X = nan_imputing(X)
    X = pd.get_dummies(X,columns=['tumor_stage1', 'tumor_stage2'])
    return X
            
def preprocessing(df,hist1=True):
    if hist1:
        y = df.pop('histology_tier1')
        X = df.drop('histology_tier2', axis=1)
    else:
        y = df.pop('histology_tier2')
        X = df.drop('histology_tier1', axis=1)

    X['donor_sex'] = X['donor_sex'].str.replace('female','1')
    X['donor_sex'] = X['donor_sex'].str.replace('male','0')

    X['female'] = pd.to_numeric(X['donor_sex'])
    
    X = X.drop('donor_sex',axis=1)
    X_train, X_test, Y_train, Y_test = \
        train_test_split(pd.get_dummies(X), y, stratify=y, test_size=.2)
    X_train = nan_imputing(X_train)
    X_test = nan_imputing(X_test)

    for column in X_train.columns:
        if 'chr' in column:
            X_train['proportion_' + column] = 0
            X_train[['proportion_' + column]] = np.true_divide(np.float32(X_train[[column]]),
                                                               np.float32(X_train[['number_of_breaks']]))
            X_test['proportion_' + column] = 0
            X_test[['proportion_' + column]] = np.true_divide(np.float32(X_test[[column]]),
                                                              np.float32(X_test[['number_of_breaks']]))

        if 'DUP' in column or 'DEL' in column or 'TRA' in column or 'h2hINV' in column or 't2tINV' in column:
            X_train['proportion_' + column] = 0
            X_train[['proportion_' + column]] = np.true_divide(np.float32(X_train[[column]]),
                                                               np.float32(X_train[['number_of_breaks']]))
            X_test['proportion_' + column] = 0
            X_test[['proportion_' + column]] = np.true_divide(np.float32(X_test[[column]]),
                                                              np.float32(X_test[['number_of_breaks']]))
    return X_train, Y_train, X_test, Y_test

def generate_one_vs_all_datasets(Y, class_name):
    to_replace = [c for c in labels if c != class_name]
    Y_class = Y.replace(to_replace=to_replace, value='OTHER')
    return Y_class

In [25]:
def generate_dataset(name, split=True, hist1=True):
    """
    slow but u only need to run it once.
    
    connected_components
    connected_components_max_size
    """
    print 'Generating csv..'
    # load the metadata 
    metadata = pd.read_csv(METADATA_PATH)
    metadata = metadata.set_index('sampleID')
    
    # load the patient ids and remove the ones that don't have metadata.
    patients = os.listdir(PATIENTS_PATH)
    patients = [p.replace('.vcf.tsv','') for p in patients if p in list(metadata.index)]
    
    # The initial dataset is the metadata one. 
    dataset = metadata
    
    for i, patient in enumerate(metadata.index):
        # Generate the traslocation graph of the patient and the edge_list
        g, edge_list = generateTRAGraph(patient=patient)
        
        dataset.loc[patient, 'connected_components'] = len(list(nx.connected_component_subgraphs(g)))
        
        # add the max of the number of vertex of the connected components of the graph
        if len(list(nx.connected_component_subgraphs(g))) > 0:
            dataset.loc[patient, 'connected_components_max_size'] = np.max(
                [len(list(component.nodes())) for component in nx.connected_component_subgraphs(g)])
        else:
            dataset.loc[patient, 'connected_components_max_size'] = 0
        
        # add the translocations
        for edge in edge_list:
            edge = edge.split(' ')
            if edge[0]  in ['X', 'Y'] and edge[1] in ['X','Y']:
                edge_column = '(' + 'X' + ',' + 'Y' + ')'
            elif edge[0] in ['X', 'Y']:
                edge_column = '(' + edge[1] + ',' + edge[0] + ')'
            elif edge[1] in ['X', 'Y']:
                edge_column = '(' + edge[0] + ',' + edge[1] + ')'
            elif int(edge[0]) < int(edge[1]):
                edge_column = '(' + edge[0] + ',' + edge[1] + ')'
            else:
                edge_column = '(' + edge[1] + ',' + edge[0] + ')'
            edge_weight = int(edge[2])
            dataset.loc[patient, edge_column] = edge_weight
        
        # now we load the breaks
        patient_path = PATIENTS_PATH + '/'+ patient + '.vcf.tsv'
        patient_breaks = pd.read_csv(patient_path, sep='\t', index_col=None)
        
        # load the chromosomes as strings
        patient_breaks['chrom2'] = patient_breaks['chrom2'].map(str)
        
        # generate a crosstab of the svclass with the chromosomes and add this info to the dataset
        ct = pd.crosstab(patient_breaks['chrom2'], patient_breaks['svclass'])
        ct.index = ct.index.map(str)
        
        for chrom in ct.index:
            for svc in ct.columns:
                dataset.loc[patient, svc + '_' + str(chrom)]= ct.loc[chrom, svc]
        
        # add the number of breaks
        number_of_breaks = len(patient_breaks)
        dataset.loc[patient, 'number_of_breaks'] = number_of_breaks
        
        # I count how many times appears on the breaks each of the chromosomes.
        contained_chromosomes = patient_breaks[['#chrom1', 'chrom2']].apply(pd.Series.value_counts)
        contained_chromosomes = contained_chromosomes.fillna(0)
        contained_chromosomes[['#chrom1', 'chrom2']] = contained_chromosomes[['#chrom1', 'chrom2']].astype(int)
        contained_chromosomes['chromosome'] = contained_chromosomes.index
        contained_chromosomes['count'] = contained_chromosomes['#chrom1'] + contained_chromosomes['chrom2']
        # Then saves it on the chromosome feature.
        for chrom in contained_chromosomes.index:
            dataset.loc[patient, 'chr_' + str(chrom)] = contained_chromosomes.loc[chrom, 'count']

        # Counts how many breaks of each class there are on the breaks and saves it.
        count_svclass = patient_breaks[['svclass', ]].apply(pd.Series.value_counts)
        for svclass in count_svclass.index:
            dataset.loc[patient, svclass] = count_svclass.loc[svclass, 'svclass']
    
    # fill with zeros the false nans generated now
    dataset.loc[:, dataset.columns != 'donor_age_at_diagnosis'] = dataset.loc[:, dataset.columns != 'donor_age_at_diagnosis'].fillna(0)
    
    if split:
        X_train, Y_train, X_test, Y_test = preprocessing(dataset, hist1)
        # and save
        X_train.to_csv(OUTPUT_PATH + '/' + name + '_train.csv')
        Y_train.to_csv(OUTPUT_PATH + '/' + name + '_train_target.csv')
        X_test.to_csv(OUTPUT_PATH + '/' + name + '_test.csv')
        Y_test.to_csv(OUTPUT_PATH + '/' + name + '_test_target.csv')
        return X_train, Y_train, X_test, Y_test
    else:
        dataset = preprocessing_without_split(dataset)
        dataset.to_csv(OUTPUT_PATH +'/'+ name + '.csv')
        return dataset

init = time.time()
name = 'dataset'
data = generate_dataset(name,split=True)
print 'Total time:', timedelta(seconds=time.time() - init)
data[0]

Generating csv..
Total time: 0:04:14.621338


(                                      donor_age_at_diagnosis  \
 sampleID                                                       
 f8a165b0-51df-413c-b7c6-c7fc80547db4                      71   
 17a66d24-de77-4f2a-a8ad-d2c6a69e7671                      37   
 e2b5c926-491d-430d-8d9c-5c73d7752d1b                      39   
 f87eb1b5-1712-ca1f-e040-11ac0c483848                      57   
 82b8cda8-fbff-455e-b0db-7ff6528bd6c8                      74   
 e2b09705-c5c8-48ee-a90e-19648a7bf2cb                      69   
 1127b561-ea40-4d5e-95df-daa0a5ebc1e4                      61   
 228fb827-c05e-494c-8a21-e1d925e100cb                      73   
 ab923db7-54e8-4a50-b7fd-c2b4b300041e                      56   
 cd0c3b1f-c73a-4991-8724-4e74381900cb                      55   
 4c755527-3b5d-4d36-822d-990aa1003d6a                      50   
 b7a7d93b-38a7-4fc3-a433-3bb0a8cb7c42                      73   
 10136472-c623-11e3-bf01-24c6515278c0                      54   
 d3aff5d3-23c0-43ae-9c01-

In [26]:
def load_data(name):
    # todo reformat 
    try:
        X_train =pd.read_csv(OUTPUT_PATH + '/' + name + '_train.csv',index_col=0)
        Y_train=pd.read_csv(OUTPUT_PATH + '/' + name + '_train_target.csv',index_col=0, names = ['SampleID','histology'])
        X_test=pd.read_csv(OUTPUT_PATH + '/' + name + '_test.csv',index_col=0)
        Y_test=pd.read_csv(OUTPUT_PATH + '/' + name + '_test_target.csv',index_col=0,names = ['SampleID','histology'])
        print 'Loaded' 
    except Exception as e:
        print 'peta', e
        return
    return X_train, Y_train, X_test, Y_test

X_train, Y_train, X_test, Y_test = load_data('dataset')
X_train.head()

Loaded


Unnamed: 0_level_0,donor_age_at_diagnosis,connected_components,connected_components_max_size,DEL_16,DUP_16,TRA_16,h2hINV_16,t2tINV_16,DEL_20,DUP_20,...,proportion_chr_7,proportion_chr_8,proportion_chr_9,proportion_chr_X,proportion_DEL_Y,proportion_DUP_Y,proportion_TRA_Y,proportion_h2hINV_Y,proportion_t2tINV_Y,proportion_chr_Y
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
f8a165b0-51df-413c-b7c6-c7fc80547db4,71,2.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.045455,0.0,0.068182,0.090909,0.0,0.0,0.0,0.0,0.0,0.0
17a66d24-de77-4f2a-a8ad-d2c6a69e7671,37,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
e2b5c926-491d-430d-8d9c-5c73d7752d1b,39,3.0,15.0,3.0,4.0,0.0,2.0,1.0,0.0,2.0,...,0.10219,0.036496,0.029197,0.0,0.0,0.0,0.007299,0.0,0.0,0.007299
f87eb1b5-1712-ca1f-e040-11ac0c483848,57,3.0,5.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.082192,0.246575,0.369863,0.027397,0.0,0.0,0.0,0.0,0.0,0.0
82b8cda8-fbff-455e-b0db-7ff6528bd6c8,74,3.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.072464,0.0,0.0,0.057971,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
a = set(data.columns)

In [22]:
a.difference(b)

{'histology_tier1', 'histology_tier2'}

In [23]:
b.difference(a)

set()

In [11]:
b = set(X_train.columns)