# GO Ontology


## Setup

In [1]:
import sys

sys.path.append("../working")

In [2]:
import logging

logging.basicConfig(
    # filename=__file__.replace('.py', '.log'),
    level=logging.getLevelName("INFO"),
    format="%(asctime)s [%(levelname)s] [%(module)s] %(message)s",
)

log = logging.getLogger(__name__)

In [3]:
import glob
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
from omegaconf import OmegaConf
from progressbar import progressbar
from src.get_score import get_score
from src.load_data import LoadData, PostprocessData, PreprocessData
from src.preprocesses.cache import fit_instance, transform_data
from src.preprocesses.p010_pca import CustomPCA
from src.utils import choice_seed, df_stats, fix_seed

# from src.make_dataset import BaseDataset, get_transforms
# from src.make_model import ImageBaseModel
# from torch.utils.data import DataLoader

# pd.set_option("display.max_rows", None)
# pd.set_option("display.max_columns", None)
# pd.set_option("display.max_colwidth", None)

In [4]:
# Competition specific library
import math

import scanpy as sc
import scipy.stats as stats
import umap
from anndata import AnnData
from ivis import Ivis
from sklearn.preprocessing import StandardScaler

In [5]:
c_main = OmegaConf.load("../working/config/main.yaml")
c_preprocess_params = OmegaConf.load("../working/config/preprocess_params.yaml")
c = OmegaConf.merge(c_main, c_preprocess_params)

c.global_params.data = "cite"

fix_seed(choice_seed(c))

2022-10-30 22:31:29,051 [INFO] [utils] Fix seed: 3086


In [6]:
input = PreprocessData(c, do_preprocess=False)

2022-10-30 22:31:29,064 [INFO] [load_data] Load pickle file. path: ../input/evaluation_ids.pickle
2022-10-30 22:31:33,346 [INFO] [load_data] Load pickle file. path: ../input/metadata.pickle
2022-10-30 22:31:33,402 [INFO] [load_data] Load pickle file. path: ../input/sample_submission.pickle
2022-10-30 22:31:34,022 [INFO] [load_data] Load pickle file. path: ../input/test_cite_inputs.pickle
2022-10-30 22:31:36,485 [INFO] [load_data] Load pickle file. path: ../input/test_cite_inputs_day_2_donor_27678.pickle
2022-10-30 22:31:37,153 [INFO] [load_data] Load pickle file. path: ../input/train_cite_inputs.pickle
2022-10-30 22:31:40,783 [INFO] [load_data] Load pickle file. path: ../input/train_cite_targets.pickle


In [7]:
# input = LoadData(c, do_preprocess=False, use_fold=True)

In [8]:
# input = PostprocessData(c)

In [9]:
[col for col in dir(input) if not col.startswith("__")]

['c',
 'evaluation_ids',
 'metadata',
 'sample_submission',
 'test_cite_inputs',
 'test_cite_inputs_day_2_donor_27678',
 'train_cite_inputs',
 'train_cite_targets']

## Read Go Ontology

In [10]:
# http://geneontology.org/docs/go-annotation-file-gaf-format-2.2/
goa_header = [
    "DB",
    "DB Object ID",
    "DB Object Symbol",
    "Qualifier",
    "GO ID",
    "DB:Reference",
    "Evidence Code",
    "With_From",
    "Aspect",
    "DB Object Name",
    "DB Object Synonym",
    "DB Object Type",
    "Taxon",
    "Date",
    "Assigned By",
    "Annotation Extension",
    "Gene Product Form ID",
]

In [11]:
goa = pd.read_table(os.path.join(c.settings.dirs.input, "goa_human.gaf"), names=goa_header, skiprows=41)
goa.dropna(axis=1, how="all", inplace=True)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [12]:
goa = goa[goa["Aspect"] == "P"]

In [13]:
goa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 165338 entries, 3 to 635748
Data columns (total 16 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   DB                    165338 non-null  object
 1   DB Object ID          165338 non-null  object
 2   DB Object Symbol      165272 non-null  object
 3   Qualifier             165338 non-null  object
 4   GO ID                 165338 non-null  object
 5   DB:Reference          165338 non-null  object
 6   Evidence Code         165338 non-null  object
 7   With_From             94486 non-null   object
 8   Aspect                165338 non-null  object
 9   DB Object Name        165338 non-null  object
 10  DB Object Synonym     165227 non-null  object
 11  DB Object Type        165338 non-null  object
 12  Taxon                 165338 non-null  object
 13  Date                  165338 non-null  int64 
 14  Assigned By           165338 non-null  object
 15  Annotation Extens

In [14]:
goa.head()

Unnamed: 0,DB,DB Object ID,DB Object Symbol,Qualifier,GO ID,DB:Reference,Evidence Code,With_From,Aspect,DB Object Name,DB Object Synonym,DB Object Type,Taxon,Date,Assigned By,Annotation Extension
3,UniProtKB,A0A075B6H7,IGKV3-7,involved_in,GO:0002250,GO_REF:0000043,IEA,UniProtKB-KW:KW-1064,P,Probable non-functional immunoglobulin kappa v...,IGKV3-7,protein,taxon:9606,20220907,UniProt,
6,UniProtKB,A0A075B6H8,IGKV1D-42,involved_in,GO:0002250,GO_REF:0000043,IEA,UniProtKB-KW:KW-1064,P,Probable non-functional immunoglobulin kappa v...,IGKV1D-42,protein,taxon:9606,20220907,UniProt,
9,UniProtKB,A0A075B6H9,IGLV4-69,involved_in,GO:0002250,GO_REF:0000043,IEA,UniProtKB-KW:KW-1064,P,Immunoglobulin lambda variable 4-69,IGLV4-69,protein,taxon:9606,20220907,UniProt,
12,UniProtKB,A0A075B6I0,IGLV8-61,involved_in,GO:0002250,GO_REF:0000043,IEA,UniProtKB-KW:KW-1064,P,Immunoglobulin lambda variable 8-61,IGLV8-61,protein,taxon:9606,20220907,UniProt,
15,UniProtKB,A0A075B6I1,IGLV4-60,involved_in,GO:0002250,GO_REF:0000043,IEA,UniProtKB-KW:KW-1064,P,Immunoglobulin lambda variable 4-60,IGLV4-60,protein,taxon:9606,20220907,UniProt,


In [15]:
df_stats(goa)

Unnamed: 0,カラム名,ユニーク値数,最頻値,最頻値の出現回数,最頻値の割合,欠損値の数,欠損値の割合,タイプ
0,DB,1,UniProtKB,165338,100.0,0,0.0,object
1,DB Object ID,18040,P01375,279,0.168745,0,0.0,object
2,DB Object Symbol,17992,TNF,279,0.168745,66,0.039918,object
3,Qualifier,10,involved_in,161777,97.84623,0,0.0,object
4,GO ID,12550,GO:0006357,1908,1.154,0,0.0,object
5,DB:Reference,26018,GO_REF:0000107,29522,17.855544,0,0.0,object
6,Evidence Code,20,IEA,43567,26.350264,0,0.0,object
7,With_From,25934,UniProtKB-KW:KW-0297,461,42.852823,70852,42.852823,object
8,Aspect,1,P,165338,100.0,0,0.0,object
9,DB Object Name,17862,Tumor necrosis factor,279,0.168745,0,0.0,object


In [16]:
symbols = goa["DB Object Symbol"].unique()
symbols[:10]

array(['IGKV3-7', 'IGKV1D-42', 'IGLV4-69', 'IGLV8-61', 'IGLV4-60',
       'IGLV11-55', 'IGLV10-54', 'IGLV1-50', 'IGLV5-48', 'IGLV7-46'],
      dtype=object)

## CITEseq Data

In [17]:
num_train = len(input.train_cite_inputs)

In [18]:
df = pd.concat([input.train_cite_inputs, input.test_cite_inputs])

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 119651 entries, 45006fe3e4c8 to ad5a949989b2
Columns: 22050 entries, ENSG00000121410_A1BG to ENSG00000074755_ZZEF1
dtypes: float32(22050)
memory usage: 9.8+ GB


In [20]:
object_to_col = {}
for col in df.columns:
    object_id = col.split("_")[1]
    if object_id in object_to_col:
        object_to_col[object_id].append(col)
    else:
        object_to_col[object_id] = [col]

In [21]:
object_to_col["ELOC"]

['ENSG00000154582_ELOC']

In [22]:
len(object_to_col.keys())

21967

In [23]:
no_symbol = set(object_to_col.keys()) - set(symbols)
len(no_symbol)

9041

In [24]:
found_symbol = set(object_to_col.keys()) & set(symbols)
len(found_symbol)

12926

In [25]:
list(found_symbol)[:10]

['LRRC59',
 'CRABP1',
 'EVA1C',
 'STOML2',
 'RPL4',
 'RCC2',
 'RBM28',
 'WTIP',
 'PLPP2',
 'ZNHIT3']

In [26]:
# 最初に見つからなかった中で、ハイフンやドットで区切ると見つかるものがありそう
cols_2 = [re.split("[-.]", col)[0] for col in no_symbol]
cols_2[:10]

['Z98742',
 'TP53TG3D',
 'LINC01224',
 'DRC3',
 'AC010976',
 'AC093673',
 'AC022400',
 'AC016542',
 'AL137145',
 'AC096667']

In [27]:
no_symbol_2 = set(cols_2) - set(symbols)
len(no_symbol_2)

7060

In [28]:
found_symbol_2 = set(cols_2) & set(symbols)
len(found_symbol_2)

617

In [29]:
found_symbols = found_symbol | found_symbol_2
len(found_symbols)

12953

## Mapping GO Ontology and Citeseq Data

In [30]:
cite_go_ontology = pd.DataFrame(index=df.index)
num_go_ontology = pd.DataFrame(index=df.index)

for symbol in progressbar(found_symbol):
    go_ids = goa[goa["DB Object Symbol"] == symbol]["GO ID"].unique()
    cite_data = df[object_to_col[symbol]].sum(axis=1)

    for go_id in go_ids:
        if go_id in cite_go_ontology.columns:
            cite_go_ontology[go_id] = cite_go_ontology[go_id] + cite_data
            num_go_ontology[go_id] = num_go_ontology[go_id] + 1
        else:
            cite_go_ontology[go_id] = cite_data
            num_go_ontology[go_id] = 1

  cite_go_ontology[go_id] = cite_data
  num_go_ontology[go_id] = 1
100% (12926 of 12926) |##################| Elapsed Time: 0:04:07 Time:  0:04:07


In [31]:
cite_go_ontology.info()

<class 'pandas.core.frame.DataFrame'>
Index: 119651 entries, 45006fe3e4c8 to ad5a949989b2
Columns: 11590 entries, GO:0046579 to GO:0038170
dtypes: float32(11590)
memory usage: 5.2+ GB


In [32]:
cite_go_ontology.head()

Unnamed: 0_level_0,GO:0046579,GO:0007165,GO:0034653,GO:0015908,GO:0008150,GO:0006851,GO:0006874,GO:0007005,GO:0010876,GO:0010918,...,GO:0070781,GO:0051695,GO:0032776,GO:0071040,GO:0046168,GO:1902380,GO:0006152,GO:0045761,GO:1904274,GO:0038170
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45006fe3e4c8,14.328962,758.758911,0.0,11.160076,327.405762,36.778496,74.183853,125.919083,5.463848,23.882996,...,0.0,0.0,0.0,0.0,4.090185,0.0,5.8679,5.177577,0.0,0.0
d02759a80ba2,4.72385,774.736938,0.0,20.87694,277.097198,19.224245,62.797939,133.98233,6.421305,35.319893,...,0.0,0.0,0.0,0.0,0.0,4.039545,6.421305,4.039545,0.0,0.0
c016c6b0efa5,3.847321,833.125671,0.0,17.567362,348.868042,38.824333,71.506027,147.847809,9.064805,28.098894,...,0.0,0.0,0.0,0.0,0.0,0.0,5.217484,0.0,0.0,0.0
ba7f733a4f75,15.665339,1060.752563,0.0,22.687859,423.744476,26.411469,83.380821,180.319901,5.605062,36.309273,...,0.0,0.0,0.0,0.0,0.0,3.436846,4.11378,4.79872,0.0,0.0
fbcf2443ffb2,17.073271,1096.9646,0.0,20.231289,472.877014,26.963724,85.547974,179.950974,5.571774,33.829514,...,0.0,0.0,3.51861,0.0,0.0,5.10405,5.689135,5.10405,0.0,0.0


In [33]:
num_go_ontology.head()

Unnamed: 0_level_0,GO:0046579,GO:0007165,GO:0034653,GO:0015908,GO:0008150,GO:0006851,GO:0006874,GO:0007005,GO:0010876,GO:0010918,...,GO:0070781,GO:0051695,GO:0032776,GO:0071040,GO:0046168,GO:1902380,GO:0006152,GO:0045761,GO:1904274,GO:0038170
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45006fe3e4c8,17,844,2,16,372,12,71,75,3,10,...,1,1,1,1,1,1,1,1,1,1
d02759a80ba2,17,844,2,16,372,12,71,75,3,10,...,1,1,1,1,1,1,1,1,1,1
c016c6b0efa5,17,844,2,16,372,12,71,75,3,10,...,1,1,1,1,1,1,1,1,1,1
ba7f733a4f75,17,844,2,16,372,12,71,75,3,10,...,1,1,1,1,1,1,1,1,1,1
fbcf2443ffb2,17,844,2,16,372,12,71,75,3,10,...,1,1,1,1,1,1,1,1,1,1


In [34]:
cite_go_ontology = cite_go_ontology / num_go_ontology

In [35]:
cite_go_ontology.head()

Unnamed: 0_level_0,GO:0046579,GO:0007165,GO:0034653,GO:0015908,GO:0008150,GO:0006851,GO:0006874,GO:0007005,GO:0010876,GO:0010918,...,GO:0070781,GO:0051695,GO:0032776,GO:0071040,GO:0046168,GO:1902380,GO:0006152,GO:0045761,GO:1904274,GO:0038170
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45006fe3e4c8,0.84288,0.899003,0.0,0.697505,0.880123,3.064875,1.044843,1.678921,1.821283,2.3883,...,0.0,0.0,0.0,0.0,4.090185,0.0,5.8679,5.177577,0.0,0.0
d02759a80ba2,0.277874,0.917935,0.0,1.304809,0.744885,1.60202,0.884478,1.786431,2.140435,3.531989,...,0.0,0.0,0.0,0.0,0.0,4.039545,6.421305,4.039545,0.0,0.0
c016c6b0efa5,0.226313,0.987116,0.0,1.09796,0.937817,3.235361,1.007127,1.971304,3.021602,2.809889,...,0.0,0.0,0.0,0.0,0.0,0.0,5.217484,0.0,0.0,0.0
ba7f733a4f75,0.921491,1.256816,0.0,1.417991,1.139098,2.200956,1.174378,2.404265,1.868354,3.630927,...,0.0,0.0,0.0,0.0,0.0,3.436846,4.11378,4.79872,0.0,0.0
fbcf2443ffb2,1.00431,1.299721,0.0,1.264456,1.271175,2.246977,1.204901,2.399346,1.857258,3.382951,...,0.0,0.0,3.51861,0.0,0.0,5.10405,5.689135,5.10405,0.0,0.0


In [36]:
cite_go_ontology = cite_go_ontology.loc[:, cite_go_ontology.nunique() != 1]

In [37]:
cite_go_ontology.info()

<class 'pandas.core.frame.DataFrame'>
Index: 119651 entries, 45006fe3e4c8 to ad5a949989b2
Columns: 11590 entries, GO:0046579 to GO:0038170
dtypes: float64(11590)
memory usage: 10.3+ GB


In [38]:
cite_go_ontology.head()

Unnamed: 0_level_0,GO:0046579,GO:0007165,GO:0034653,GO:0015908,GO:0008150,GO:0006851,GO:0006874,GO:0007005,GO:0010876,GO:0010918,...,GO:0070781,GO:0051695,GO:0032776,GO:0071040,GO:0046168,GO:1902380,GO:0006152,GO:0045761,GO:1904274,GO:0038170
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45006fe3e4c8,0.84288,0.899003,0.0,0.697505,0.880123,3.064875,1.044843,1.678921,1.821283,2.3883,...,0.0,0.0,0.0,0.0,4.090185,0.0,5.8679,5.177577,0.0,0.0
d02759a80ba2,0.277874,0.917935,0.0,1.304809,0.744885,1.60202,0.884478,1.786431,2.140435,3.531989,...,0.0,0.0,0.0,0.0,0.0,4.039545,6.421305,4.039545,0.0,0.0
c016c6b0efa5,0.226313,0.987116,0.0,1.09796,0.937817,3.235361,1.007127,1.971304,3.021602,2.809889,...,0.0,0.0,0.0,0.0,0.0,0.0,5.217484,0.0,0.0,0.0
ba7f733a4f75,0.921491,1.256816,0.0,1.417991,1.139098,2.200956,1.174378,2.404265,1.868354,3.630927,...,0.0,0.0,0.0,0.0,0.0,3.436846,4.11378,4.79872,0.0,0.0
fbcf2443ffb2,1.00431,1.299721,0.0,1.264456,1.271175,2.246977,1.204901,2.399346,1.857258,3.382951,...,0.0,0.0,3.51861,0.0,0.0,5.10405,5.689135,5.10405,0.0,0.0


In [39]:
train = cite_go_ontology.iloc[:num_train, :]
test = cite_go_ontology.iloc[num_train:, :]

In [40]:
# train.to_pickle(os.path.join(c.settings.dirs.preprocess, "train_cite_ontology.pickle"))
# test.to_pickle(os.path.join(c.settings.dirs.preprocess, "test_cite_ontology.pickle"))

## Preprocess Ontology

In [41]:
preprocessor = CustomPCA(c)

In [42]:
df = transform_data(
    c,
    f"cite_ontology_pca_240_p.pickle",
    cite_go_ontology,
    preprocessor,
)

2022-10-30 22:37:26,343 [INFO] [cache] Fit preprocess. -> cite_ontology_pca_240_p.pkl
2022-10-30 22:37:32,301 [INFO] [cache] Transform data. -> cite_ontology_pca_240_p.pickle, shape: (119651, 240)


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119651 entries, 0 to 119650
Columns: 240 entries, pca_0 to pca_239
dtypes: float64(240)
memory usage: 219.1 MB


In [44]:
df.columns = [f"ontology_p_{col}" for col in df.columns]

In [45]:
df.index = cite_go_ontology.index

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 119651 entries, 45006fe3e4c8 to ad5a949989b2
Columns: 240 entries, ontology_p_pca_0 to ontology_p_pca_239
dtypes: float64(240)
memory usage: 220.0+ MB


In [47]:
df.head()

Unnamed: 0_level_0,ontology_p_pca_0,ontology_p_pca_1,ontology_p_pca_2,ontology_p_pca_3,ontology_p_pca_4,ontology_p_pca_5,ontology_p_pca_6,ontology_p_pca_7,ontology_p_pca_8,ontology_p_pca_9,...,ontology_p_pca_230,ontology_p_pca_231,ontology_p_pca_232,ontology_p_pca_233,ontology_p_pca_234,ontology_p_pca_235,ontology_p_pca_236,ontology_p_pca_237,ontology_p_pca_238,ontology_p_pca_239
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45006fe3e4c8,199.878832,-27.307297,14.313883,-5.885423,17.603279,-6.258463,4.43297,-1.588889,-32.022738,23.828702,...,-9.566534,2.574488,-0.439621,-2.287016,-1.760439,-1.55416,-3.798081,-4.094535,4.513131,6.929143
d02759a80ba2,207.220903,-31.541241,10.514626,-14.345443,20.025015,-7.891855,0.345156,3.490145,-23.676943,17.097703,...,2.627405,0.413142,-3.001637,0.833088,-0.113433,-1.512154,-3.945969,4.607315,-4.344857,9.565384
c016c6b0efa5,164.256194,-19.810477,42.598382,7.463793,44.586438,-28.155507,10.606919,5.561516,-8.951865,28.333241,...,1.869462,3.914407,-3.523777,-3.814722,-0.349887,-3.910911,-2.167811,-3.053174,1.686517,3.592622
ba7f733a4f75,142.040693,-9.459668,4.325146,-8.938186,35.859596,-0.186231,-7.977301,-2.51129,-18.479455,17.417336,...,5.603799,5.696946,-1.979835,3.951405,-1.91476,-2.226889,-5.536681,-2.426527,5.792479,4.682651
fbcf2443ffb2,124.862478,-3.733909,37.194395,-4.85674,31.546649,-15.145315,13.028,10.016005,-25.786684,23.680938,...,3.310603,4.405058,2.440917,-0.430432,3.428117,-1.115155,-3.139567,-5.252854,-0.613433,3.430299


In [48]:
train = df.iloc[:num_train, :]
test = df.iloc[num_train:, :]

In [49]:
train.to_csv(os.path.join(c.settings.dirs.preprocess, "train_cite_ontology_pca_240_p.csv"), index=True, header=True)

In [50]:
df.isnull().sum().sum()

0

In [51]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 70988 entries, 45006fe3e4c8 to c91b6b2ccd3d
Columns: 240 entries, ontology_p_pca_0 to ontology_p_pca_239
dtypes: float64(240)
memory usage: 130.5+ MB


In [52]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48663 entries, c2150f55becb to ad5a949989b2
Columns: 240 entries, ontology_p_pca_0 to ontology_p_pca_239
dtypes: float64(240)
memory usage: 89.5+ MB


In [53]:
leak_df = pd.DataFrame(index=input.test_cite_inputs_day_2_donor_27678.index, columns=test.columns)

In [54]:
leak_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7016 entries, 83d6659a6a32 to 397bef68ded6
Columns: 240 entries, ontology_p_pca_0 to ontology_p_pca_239
dtypes: object(240)
memory usage: 12.9+ MB


In [55]:
leak_df.head()

Unnamed: 0,ontology_p_pca_0,ontology_p_pca_1,ontology_p_pca_2,ontology_p_pca_3,ontology_p_pca_4,ontology_p_pca_5,ontology_p_pca_6,ontology_p_pca_7,ontology_p_pca_8,ontology_p_pca_9,...,ontology_p_pca_230,ontology_p_pca_231,ontology_p_pca_232,ontology_p_pca_233,ontology_p_pca_234,ontology_p_pca_235,ontology_p_pca_236,ontology_p_pca_237,ontology_p_pca_238,ontology_p_pca_239
83d6659a6a32,,,,,,,,,,,...,,,,,,,,,,
d98594f13d2e,,,,,,,,,,,...,,,,,,,,,,
5f93d8ffc72f,,,,,,,,,,,...,,,,,,,,,,
7dfa2699d351,,,,,,,,,,,...,,,,,,,,,,
6d2533edd0e0,,,,,,,,,,,...,,,,,,,,,,


In [56]:
leak_df = leak_df.fillna(0)

In [57]:
leak_df.head()

Unnamed: 0,ontology_p_pca_0,ontology_p_pca_1,ontology_p_pca_2,ontology_p_pca_3,ontology_p_pca_4,ontology_p_pca_5,ontology_p_pca_6,ontology_p_pca_7,ontology_p_pca_8,ontology_p_pca_9,...,ontology_p_pca_230,ontology_p_pca_231,ontology_p_pca_232,ontology_p_pca_233,ontology_p_pca_234,ontology_p_pca_235,ontology_p_pca_236,ontology_p_pca_237,ontology_p_pca_238,ontology_p_pca_239
83d6659a6a32,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
d98594f13d2e,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5f93d8ffc72f,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7dfa2699d351,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6d2533edd0e0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
leak_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7016 entries, 83d6659a6a32 to 397bef68ded6
Columns: 240 entries, ontology_p_pca_0 to ontology_p_pca_239
dtypes: int64(240)
memory usage: 12.9+ MB


In [59]:
test = pd.concat([test, leak_df])

In [60]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 55679 entries, c2150f55becb to 397bef68ded6
Columns: 240 entries, ontology_p_pca_0 to ontology_p_pca_239
dtypes: float64(240)
memory usage: 102.4+ MB


In [61]:
test.to_csv(os.path.join(c.settings.dirs.preprocess, "test_cite_ontology_pca_240_p.csv"), index=True, header=True)