# GO Ontology


## Setup

In [1]:
import sys

sys.path.append("../working")

In [2]:
import logging

logging.basicConfig(
    # filename=__file__.replace('.py', '.log'),
    level=logging.getLevelName("INFO"),
    format="%(asctime)s [%(levelname)s] [%(module)s] %(message)s",
)

log = logging.getLogger(__name__)

In [94]:
import glob
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
from omegaconf import OmegaConf
from progressbar import progressbar
from src.get_score import get_score
from src.load_data import LoadData, PostprocessData, PreprocessData
from src.preprocesses.cache import fit_instance, transform_data
from src.preprocesses.p010_pca import CustomPCA
from src.utils import choice_seed, df_stats, fix_seed

# from src.make_dataset import BaseDataset, get_transforms
# from src.make_model import ImageBaseModel
# from torch.utils.data import DataLoader

# pd.set_option("display.max_rows", None)
# pd.set_option("display.max_columns", None)
# pd.set_option("display.max_colwidth", None)

In [4]:
# Competition specific library
import math

import scanpy as sc
import scipy.stats as stats
import umap
from anndata import AnnData
from ivis import Ivis
from sklearn.preprocessing import StandardScaler

In [5]:
c_main = OmegaConf.load("../working/config/main.yaml")
c_preprocess_params = OmegaConf.load("../working/config/preprocess_params.yaml")
c = OmegaConf.merge(c_main, c_preprocess_params)

c.global_params.data = "cite"

fix_seed(choice_seed(c))

2022-10-30 09:55:43,093 [INFO] [utils] Fix seed: 39


In [6]:
input = PreprocessData(c, do_preprocess=False)

2022-10-30 09:55:43,105 [INFO] [load_data] Load pickle file. path: ../input/evaluation_ids.pickle
2022-10-30 09:55:47,613 [INFO] [load_data] Load pickle file. path: ../input/metadata.pickle
2022-10-30 09:55:47,671 [INFO] [load_data] Load pickle file. path: ../input/sample_submission.pickle
2022-10-30 09:55:48,278 [INFO] [load_data] Load pickle file. path: ../input/test_cite_inputs.pickle
2022-10-30 09:55:55,981 [INFO] [load_data] Load pickle file. path: ../input/test_cite_inputs_day_2_donor_27678.pickle
2022-10-30 09:55:57,943 [INFO] [load_data] Load pickle file. path: ../input/train_cite_inputs.pickle
2022-10-30 09:56:08,936 [INFO] [load_data] Load pickle file. path: ../input/train_cite_targets.pickle


In [7]:
# input = LoadData(c, do_preprocess=False, use_fold=True)

In [8]:
# input = PostprocessData(c)

In [9]:
[col for col in dir(input) if not col.startswith("__")]

['c',
 'evaluation_ids',
 'metadata',
 'sample_submission',
 'test_cite_inputs',
 'test_cite_inputs_day_2_donor_27678',
 'train_cite_inputs',
 'train_cite_targets']

## Read Go Ontology

In [10]:
# http://geneontology.org/docs/go-annotation-file-gaf-format-2.2/
goa_header = [
    "DB",
    "DB Object ID",
    "DB Object Symbol",
    "Qualifier",
    "GO ID",
    "DB:Reference",
    "Evidence Code",
    "With_From",
    "Aspect",
    "DB Object Name",
    "DB Object Synonym",
    "DB Object Type",
    "Taxon",
    "Date",
    "Assigned By",
    "Annotation Extension",
    "Gene Product Form ID",
]

In [11]:
goa = pd.read_table(os.path.join(c.settings.dirs.input, "goa_human.gaf"), names=goa_header, skiprows=41)
goa.dropna(axis=1, how="all", inplace=True)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [12]:
goa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 635753 entries, 0 to 635752
Data columns (total 16 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   DB                    635753 non-null  object
 1   DB Object ID          635753 non-null  object
 2   DB Object Symbol      635586 non-null  object
 3   Qualifier             635753 non-null  object
 4   GO ID                 635753 non-null  object
 5   DB:Reference          635753 non-null  object
 6   Evidence Code         635753 non-null  object
 7   With_From             395523 non-null  object
 8   Aspect                635753 non-null  object
 9   DB Object Name        635753 non-null  object
 10  DB Object Synonym     635450 non-null  object
 11  DB Object Type        635753 non-null  object
 12  Taxon                 635753 non-null  object
 13  Date                  635753 non-null  int64 
 14  Assigned By           635753 non-null  object
 15  Annotation Extens

In [13]:
goa.head()

Unnamed: 0,DB,DB Object ID,DB Object Symbol,Qualifier,GO ID,DB:Reference,Evidence Code,With_From,Aspect,DB Object Name,DB Object Synonym,DB Object Type,Taxon,Date,Assigned By,Annotation Extension
0,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0003723,GO_REF:0000043,IEA,UniProtKB-KW:KW-0694,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20220907,UniProt,
1,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0046872,GO_REF:0000043,IEA,UniProtKB-KW:KW-0479,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20220907,UniProt,
2,UniProtKB,A0A024RBG1,NUDT4B,located_in,GO:0005829,GO_REF:0000052,IDA,,C,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20161204,HPA,
3,UniProtKB,A0A075B6H7,IGKV3-7,involved_in,GO:0002250,GO_REF:0000043,IEA,UniProtKB-KW:KW-1064,P,Probable non-functional immunoglobulin kappa v...,IGKV3-7,protein,taxon:9606,20220907,UniProt,
4,UniProtKB,A0A075B6H7,IGKV3-7,located_in,GO:0005886,GO_REF:0000044,IEA,UniProtKB-SubCell:SL-0039,C,Probable non-functional immunoglobulin kappa v...,IGKV3-7,protein,taxon:9606,20220907,UniProt,


In [14]:
df_stats(goa)

Unnamed: 0,カラム名,ユニーク値数,最頻値,最頻値の出現回数,最頻値の割合,欠損値の数,欠損値の割合,タイプ
0,DB,1,UniProtKB,635753,100.0,0,0.0,object
1,DB Object ID,19861,P42858,1098,0.172709,0,0.0,object
2,DB Object Symbol,19791,HTT,1098,0.172709,167,0.026268,object
3,Qualifier,22,enables,289710,45.569584,0,0.0,object
4,GO ID,18892,GO:0005515,206265,32.444204,0,0.0,object
5,DB:Reference,54365,PMID:32296183,81758,12.860026,0,0.0,object
6,Evidence Code,21,IPI,217316,34.182458,0,0.0,object
7,With_From,62787,UniProtKB-KW:KW-0479,2299,37.786688,240230,37.786688,object
8,Aspect,3,F,291327,45.823928,0,0.0,object
9,DB Object Name,19638,Huntingtin,1098,0.172709,0,0.0,object


In [15]:
symbols = goa["DB Object Symbol"].unique()
symbols[:10]

array(['NUDT4B', 'IGKV3-7', 'IGKV1D-42', 'IGLV4-69', 'IGLV8-61',
       'IGLV4-60', 'IGLV11-55', 'IGLV10-54', 'IGLV1-50', 'IGLV5-48'],
      dtype=object)

In [40]:
goa[goa["DB Object Symbol"] == "ELOC"]

Unnamed: 0,DB,DB Object ID,DB Object Symbol,Qualifier,GO ID,DB:Reference,Evidence Code,With_From,Aspect,DB Object Name,DB Object Synonym,DB Object Type,Taxon,Date,Assigned By,Annotation Extension
315967,UniProtKB,Q15369,ELOC,enables,GO:0001222,PMID:7660122,IPI,UniProtKB:P40337,F,Elongin-C,ELOC|TCEB1,protein,taxon:9606,20210712,UniProt,
315968,UniProtKB,Q15369,ELOC,enables,GO:0005515,PMID:10205047,IPI,UniProtKB:Q15370,F,Elongin-C,ELOC|TCEB1,protein,taxon:9606,20220910,IntAct,
315969,UniProtKB,Q15369,ELOC,enables,GO:0005515,PMID:10851083,IPI,UniProtKB:Q9V3C1,F,Elongin-C,ELOC|TCEB1,protein,taxon:9606,20060208,UniProt,
315970,UniProtKB,Q15369,ELOC,enables,GO:0005515,PMID:12004076,IPI,UniProtKB:Q15370,F,Elongin-C,ELOC|TCEB1,protein,taxon:9606,20220910,IntAct,
315971,UniProtKB,Q15369,ELOC,enables,GO:0005515,PMID:12050673,IPI,UniProtKB:Q15370,F,Elongin-C,ELOC|TCEB1,protein,taxon:9606,20220910,IntAct,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316052,UniProtKB,Q15369,ELOC,part_of,GO:0031466,PMID:30166453,IDA,,C,Elongin-C,ELOC|TCEB1,protein,taxon:9606,20210630,UniProt,
316053,UniProtKB,Q15369,ELOC,part_of,GO:0070449,PMID:7660122,IDA,,C,Elongin-C,ELOC|TCEB1,protein,taxon:9606,20210712,UniProt,
583240,UniProtKB,Q15369,ELOC,involved_in,GO:0006511,PMID:21873635,IBA,PANTHER:PTN000464223|SGD:S000005967,P,Elongin-C,ELOC|TCEB1,protein,taxon:9606,20170228,GO_Central,
605080,UniProtKB,Q15369,ELOC,part_of,GO:0070449,PMID:21873635,IBA,PANTHER:PTN000464223|UniProtKB:Q15369|FB:FBgn0...,C,Elongin-C,ELOC|TCEB1,protein,taxon:9606,20211216,GO_Central,


## CITEseq Data

In [16]:
num_train = len(input.train_cite_inputs)

In [17]:
df = pd.concat([input.train_cite_inputs, input.test_cite_inputs])

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 119651 entries, 45006fe3e4c8 to ad5a949989b2
Columns: 22050 entries, ENSG00000121410_A1BG to ENSG00000074755_ZZEF1
dtypes: float32(22050)
memory usage: 9.8+ GB


In [29]:
object_to_col = {}
for col in df.columns:
    object_id = col.split("_")[1]
    if object_id in object_to_col:
        object_to_col[object_id].append(col)
    else:
        object_to_col[object_id] = [col]

In [39]:
object_to_col["ELOC"]

['ENSG00000154582_ELOC']

In [32]:
len(object_to_col.keys())

21967

In [33]:
no_symbol = set(object_to_col.keys()) - set(symbols)
len(no_symbol)

8039

In [35]:
found_symbol = set(object_to_col.keys()) & set(symbols)
len(found_symbol)

13928

In [38]:
list(found_symbol)[:10]

['ELOC',
 'DOK6',
 'BUB1B',
 'TGIF2',
 'SDHD',
 'MEA1',
 'RNASE3',
 'RYBP',
 'ERGIC3',
 'HERPUD1']

In [41]:
# 最初に見つからなかった中で、ハイフンやドットで区切ると見つかるものがありそう
cols_2 = [re.split("[-.]", col)[0] for col in no_symbol]
cols_2[:10]

['CARMN',
 'HTATSF1P2',
 'CCT4P2',
 'VTRNA2',
 'AC095055',
 'C9orf24',
 'AC011447',
 'AL354710',
 'RPS20P15',
 'AL512604']

In [42]:
no_symbol_2 = set(cols_2) - set(symbols)
len(no_symbol_2)

6063

In [43]:
found_symbol_2 = set(cols_2) & set(symbols)
len(found_symbol_2)

641

In [46]:
found_symbols = found_symbol | found_symbol_2
len(found_symbols)

13958

## Mapping GO Ontology and Citeseq Data

In [77]:
cite_go_ontology = pd.DataFrame(index=df.index)
num_go_ontology = pd.DataFrame(index=df.index)

for symbol in progressbar(found_symbol):
    go_ids = goa[goa["DB Object Symbol"] == symbol]["GO ID"].unique()
    cite_data = df[object_to_col[symbol]].sum(axis=1)

    for go_id in go_ids:
        if go_id in cite_go_ontology.columns:
            cite_go_ontology[go_id] = cite_go_ontology[go_id] + cite_data
            num_go_ontology[go_id] = num_go_ontology[go_id] + 1
        else:
            cite_go_ontology[go_id] = cite_data
            num_go_ontology[go_id] = 1

  cite_go_ontology[go_id] = cite_data
  num_go_ontology[go_id] = 1
100% (13928 of 13928) |##################| Elapsed Time: 0:12:22 Time:  0:12:22


In [78]:
cite_go_ontology.info()

<class 'pandas.core.frame.DataFrame'>
Index: 119651 entries, 45006fe3e4c8 to ad5a949989b2
Columns: 17483 entries, GO:0001222 to GO:0060658
dtypes: float32(17483)
memory usage: 7.8+ GB


In [79]:
cite_go_ontology.head()

Unnamed: 0_level_0,GO:0001222,GO:0005515,GO:0006357,GO:0006367,GO:0016567,GO:0005654,GO:0005829,GO:0031462,GO:0031466,GO:0070449,...,GO:0045851,GO:0097188,GO:0043914,GO:0070123,GO:0034699,GO:0060939,GO:0034875,GO:0036446,GO:0060427,GO:0060658
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45006fe3e4c8,71.340088,13763.910156,1091.343628,77.638832,411.860107,6019.30957,7309.241211,35.437744,21.108782,10.863722,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
d02759a80ba2,64.441284,13363.129883,1022.782654,74.700699,475.832916,5668.521973,7213.695312,34.341713,28.039604,11.857376,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
c016c6b0efa5,98.262314,15772.179688,1456.634888,90.676903,601.000061,7168.018555,8491.908203,50.755211,21.374836,15.521999,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ba7f733a4f75,97.958565,18864.982422,1603.502686,109.611298,732.496826,8162.648438,9916.801758,46.039417,24.905481,13.84063,...,3.436846,3.436846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fbcf2443ffb2,103.5793,19407.390625,1609.439575,102.596039,835.555664,8547.460938,10255.789062,52.123711,24.380133,15.289749,...,3.51861,3.51861,0.0,0.0,0.0,0.0,0.0,3.51861,3.51861,3.51861


In [80]:
num_go_ontology.head()

Unnamed: 0_level_0,GO:0001222,GO:0005515,GO:0006357,GO:0006367,GO:0016567,GO:0005654,GO:0005829,GO:0031462,GO:0031466,GO:0070449,...,GO:0045851,GO:0097188,GO:0043914,GO:0070123,GO:0034699,GO:0060939,GO:0034875,GO:0036446,GO:0060427,GO:0060658
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45006fe3e4c8,44,10043,1269,39,412,3445,4630,20,7,3,...,1,1,1,1,1,1,1,1,1,1
d02759a80ba2,44,10043,1269,39,412,3445,4630,20,7,3,...,1,1,1,1,1,1,1,1,1,1
c016c6b0efa5,44,10043,1269,39,412,3445,4630,20,7,3,...,1,1,1,1,1,1,1,1,1,1
ba7f733a4f75,44,10043,1269,39,412,3445,4630,20,7,3,...,1,1,1,1,1,1,1,1,1,1
fbcf2443ffb2,44,10043,1269,39,412,3445,4630,20,7,3,...,1,1,1,1,1,1,1,1,1,1


In [81]:
cite_go_ontology = cite_go_ontology / num_go_ontology

In [82]:
cite_go_ontology.head()

Unnamed: 0_level_0,GO:0001222,GO:0005515,GO:0006357,GO:0006367,GO:0016567,GO:0005654,GO:0005829,GO:0031462,GO:0031466,GO:0070449,...,GO:0045851,GO:0097188,GO:0043914,GO:0070123,GO:0034699,GO:0060939,GO:0034875,GO:0036446,GO:0060427,GO:0060658
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45006fe3e4c8,1.621366,1.370498,0.860003,1.990739,0.99966,1.74726,1.57867,1.771887,3.01554,3.621241,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
d02759a80ba2,1.464575,1.330591,0.805975,1.915403,1.154934,1.645435,1.558034,1.717086,4.005658,3.952459,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
c016c6b0efa5,2.233234,1.570465,1.14786,2.325049,1.458738,2.080702,1.834105,2.537761,3.053548,5.174,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ba7f733a4f75,2.226331,1.878421,1.263595,2.810546,1.777905,2.369419,2.141858,2.301971,3.557926,4.613543,...,3.436846,3.436846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fbcf2443ffb2,2.354075,1.93243,1.268274,2.630668,2.028048,2.481121,2.215073,2.606186,3.482876,5.096583,...,3.51861,3.51861,0.0,0.0,0.0,0.0,0.0,3.51861,3.51861,3.51861


In [85]:
cite_go_ontology = cite_go_ontology.loc[:, cite_go_ontology.nunique() != 1]

In [86]:
cite_go_ontology.info()

<class 'pandas.core.frame.DataFrame'>
Index: 119651 entries, 45006fe3e4c8 to ad5a949989b2
Columns: 17483 entries, GO:0001222 to GO:0060658
dtypes: float64(17483)
memory usage: 15.6+ GB


In [88]:
cite_go_ontology.head()

Unnamed: 0_level_0,GO:0001222,GO:0005515,GO:0006357,GO:0006367,GO:0016567,GO:0005654,GO:0005829,GO:0031462,GO:0031466,GO:0070449,...,GO:0045851,GO:0097188,GO:0043914,GO:0070123,GO:0034699,GO:0060939,GO:0034875,GO:0036446,GO:0060427,GO:0060658
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45006fe3e4c8,1.621366,1.370498,0.860003,1.990739,0.99966,1.74726,1.57867,1.771887,3.01554,3.621241,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
d02759a80ba2,1.464575,1.330591,0.805975,1.915403,1.154934,1.645435,1.558034,1.717086,4.005658,3.952459,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
c016c6b0efa5,2.233234,1.570465,1.14786,2.325049,1.458738,2.080702,1.834105,2.537761,3.053548,5.174,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ba7f733a4f75,2.226331,1.878421,1.263595,2.810546,1.777905,2.369419,2.141858,2.301971,3.557926,4.613543,...,3.436846,3.436846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fbcf2443ffb2,2.354075,1.93243,1.268274,2.630668,2.028048,2.481121,2.215073,2.606186,3.482876,5.096583,...,3.51861,3.51861,0.0,0.0,0.0,0.0,0.0,3.51861,3.51861,3.51861


In [89]:
train = cite_go_ontology.iloc[:num_train, :]
test = cite_go_ontology.iloc[num_train:, :]

In [90]:
train.to_pickle(os.path.join(c.settings.dirs.preprocess, "train_cite_ontology.pickle"))
test.to_pickle(os.path.join(c.settings.dirs.preprocess, "test_cite_ontology.pickle"))

## Preprocess Ontology

In [93]:
preprocessor = CustomPCA(c)

In [95]:
df = transform_data(
    c,
    f"cite_ontology_pca_240.pickle",
    cite_go_ontology,
    preprocessor,
)

2022-10-30 11:45:02,105 [INFO] [cache] Fit preprocess. -> cite_ontology_pca_240.pkl
2022-10-30 11:45:12,301 [INFO] [cache] Transform data. -> cite_ontology_pca_240.pickle, shape: (119651, 240)


In [96]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119651 entries, 0 to 119650
Columns: 240 entries, pca_0 to pca_239
dtypes: float64(240)
memory usage: 219.1 MB


In [97]:
df.columns = [f"ontology_{col}" for col in df.columns]

In [108]:
df.index = cite_go_ontology.index

In [109]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 119651 entries, 45006fe3e4c8 to ad5a949989b2
Columns: 240 entries, ontology_pca_0 to ontology_pca_239
dtypes: float64(240)
memory usage: 224.0+ MB


In [110]:
df.head()

Unnamed: 0_level_0,ontology_pca_0,ontology_pca_1,ontology_pca_2,ontology_pca_3,ontology_pca_4,ontology_pca_5,ontology_pca_6,ontology_pca_7,ontology_pca_8,ontology_pca_9,...,ontology_pca_230,ontology_pca_231,ontology_pca_232,ontology_pca_233,ontology_pca_234,ontology_pca_235,ontology_pca_236,ontology_pca_237,ontology_pca_238,ontology_pca_239
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45006fe3e4c8,249.981062,-35.857283,17.475949,-1.579466,-22.398928,-7.164503,-16.207378,-38.427828,-23.152317,11.839326,...,3.720902,-5.872042,-2.277169,0.604545,0.56816,3.504771,0.256094,-5.540092,1.004745,-4.425052
d02759a80ba2,256.114003,-41.640885,16.905084,-11.56575,-27.462387,-10.0408,-7.650317,-28.703206,-22.312725,17.574842,...,3.174291,0.436437,-6.255381,2.106266,-1.24205,7.215686,1.462406,-1.78318,4.024222,6.374542
c016c6b0efa5,210.770019,-19.960447,53.153581,23.570778,-46.135397,-32.379101,-14.765692,-20.948803,-10.556714,29.92583,...,-0.20578,3.803088,-2.133962,-1.709004,4.769062,4.368081,-2.107958,0.001966,4.230699,-3.088538
ba7f733a4f75,175.842695,-10.074689,8.140918,1.095981,-43.266045,2.152704,-8.641681,-21.5122,-24.307419,7.722463,...,4.671872,-5.057515,2.15156,-0.815111,2.885576,2.397089,-1.945524,-5.437296,4.138482,2.95085
fbcf2443ffb2,159.832536,0.256617,46.126131,5.561793,-36.04247,-17.146846,-10.727227,-43.058065,-5.063246,8.571701,...,4.403537,-2.55614,-2.620641,-4.96821,2.656473,2.621489,-2.426246,-6.290363,0.357448,-5.089422


In [111]:
train = df.iloc[:num_train, :]
test = df.iloc[num_train:, :]

In [116]:
train.to_csv(os.path.join(c.settings.dirs.preprocess, "train_cite_ontology_pca_240.csv"), index=True, header=True)
test.to_csv(os.path.join(c.settings.dirs.preprocess, "test_cite_ontology_pca_240.csv"), index=True, header=True)

In [113]:
df.isnull().sum().sum()

0

In [114]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 70988 entries, 45006fe3e4c8 to c91b6b2ccd3d
Columns: 240 entries, ontology_pca_0 to ontology_pca_239
dtypes: float64(240)
memory usage: 130.5+ MB


In [115]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48663 entries, c2150f55becb to ad5a949989b2
Columns: 240 entries, ontology_pca_0 to ontology_pca_239
dtypes: float64(240)
memory usage: 89.5+ MB


In [118]:
leak_df = pd.DataFrame(index=input.test_cite_inputs_day_2_donor_27678.index, columns=test.columns)

In [119]:
leak_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7016 entries, 83d6659a6a32 to 397bef68ded6
Columns: 240 entries, ontology_pca_0 to ontology_pca_239
dtypes: object(240)
memory usage: 12.9+ MB


In [120]:
leak_df.head()

Unnamed: 0,ontology_pca_0,ontology_pca_1,ontology_pca_2,ontology_pca_3,ontology_pca_4,ontology_pca_5,ontology_pca_6,ontology_pca_7,ontology_pca_8,ontology_pca_9,...,ontology_pca_230,ontology_pca_231,ontology_pca_232,ontology_pca_233,ontology_pca_234,ontology_pca_235,ontology_pca_236,ontology_pca_237,ontology_pca_238,ontology_pca_239
83d6659a6a32,,,,,,,,,,,...,,,,,,,,,,
d98594f13d2e,,,,,,,,,,,...,,,,,,,,,,
5f93d8ffc72f,,,,,,,,,,,...,,,,,,,,,,
7dfa2699d351,,,,,,,,,,,...,,,,,,,,,,
6d2533edd0e0,,,,,,,,,,,...,,,,,,,,,,


In [121]:
leak_df = leak_df.fillna(0)

In [122]:
leak_df.head()

Unnamed: 0,ontology_pca_0,ontology_pca_1,ontology_pca_2,ontology_pca_3,ontology_pca_4,ontology_pca_5,ontology_pca_6,ontology_pca_7,ontology_pca_8,ontology_pca_9,...,ontology_pca_230,ontology_pca_231,ontology_pca_232,ontology_pca_233,ontology_pca_234,ontology_pca_235,ontology_pca_236,ontology_pca_237,ontology_pca_238,ontology_pca_239
83d6659a6a32,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
d98594f13d2e,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5f93d8ffc72f,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7dfa2699d351,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6d2533edd0e0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [123]:
leak_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7016 entries, 83d6659a6a32 to 397bef68ded6
Columns: 240 entries, ontology_pca_0 to ontology_pca_239
dtypes: int64(240)
memory usage: 12.9+ MB


In [124]:
test = pd.concat([test, leak_df])

In [125]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 55679 entries, c2150f55becb to 397bef68ded6
Columns: 240 entries, ontology_pca_0 to ontology_pca_239
dtypes: float64(240)
memory usage: 102.4+ MB


In [126]:
test.to_csv(os.path.join(c.settings.dirs.preprocess, "test_cite_ontology_pca_240.csv"), index=True, header=True)