# Title

## Setup

In [1]:
import sys

sys.path.append("../working")

In [2]:
import logging

logging.basicConfig(
    # filename=__file__.replace('.py', '.log'),
    level=logging.getLevelName("INFO"),
    format="%(asctime)s [%(levelname)s] [%(module)s] %(message)s",
)

log = logging.getLogger(__name__)

In [37]:
import glob
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
from omegaconf import OmegaConf
from progressbar import progressbar
from src.get_score import get_score
from src.load_data import LoadData, PostprocessData, PreprocessData
from src.utils import choice_seed, df_stats, fix_seed

# from src.make_dataset import BaseDataset, get_transforms
# from src.make_model import ImageBaseModel
# from torch.utils.data import DataLoader

# pd.set_option("display.max_rows", None)
# pd.set_option("display.max_columns", None)
# pd.set_option("display.max_colwidth", None)

In [4]:
# Competition specific library
import math

import scanpy as sc
import scipy.stats as stats
import umap
from anndata import AnnData
from ivis import Ivis
from sklearn.preprocessing import StandardScaler

In [38]:
c_main = OmegaConf.load("../working/config/main.yaml")
c_preprocess_params = OmegaConf.load("../working/config/preprocess_params.yaml")
c = OmegaConf.merge(c_main, c_preprocess_params)

fix_seed(choice_seed(c))

2022-10-28 22:52:38,782 [INFO] [utils] Fix seed: 3227


In [39]:
input = PreprocessData(c, do_preprocess=False)

2022-10-28 22:52:41,671 [INFO] [load_data] Load pickle file. path: ../input/evaluation_ids.pickle
2022-10-28 22:52:45,981 [INFO] [load_data] Load pickle file. path: ../input/metadata.pickle
2022-10-28 22:52:46,025 [INFO] [load_data] Load pickle file. path: ../input/sample_submission.pickle
2022-10-28 22:52:46,569 [INFO] [load_data] Load pickle file. path: ../input/test_cite_inputs.pickle
2022-10-28 22:52:53,757 [INFO] [load_data] Load pickle file. path: ../input/test_cite_inputs_day_2_donor_27678.pickle
2022-10-28 22:52:55,687 [INFO] [load_data] Load pickle file. path: ../input/train_cite_inputs.pickle
2022-10-28 22:53:06,323 [INFO] [load_data] Load pickle file. path: ../input/train_cite_targets.pickle


In [None]:
# input = LoadData(c, do_preprocess=False, use_fold=True)

In [None]:
# input = PostprocessData(c)

In [40]:
[col for col in dir(input) if not col.startswith("__")]

['c',
 'evaluation_ids',
 'metadata',
 'sample_submission',
 'test_cite_inputs',
 'test_cite_inputs_day_2_donor_27678',
 'train_cite_inputs',
 'train_cite_targets']

## Read Go Ontology

In [17]:
# http://geneontology.org/docs/go-annotation-file-gaf-format-2.2/
goa_header = [
    "DB",
    "DB Object ID",
    "DB Object Symbol",
    "Qualifier",
    "GO ID",
    "DB:Reference",
    "Evidence Code",
    "With_From",
    "Aspect",
    "DB Object Name",
    "DB Object Synonym",
    "DB Object Type",
    "Taxon",
    "Date",
    "Assigned By",
    "Annotation Extension",
    "Gene Product Form ID",
]

In [44]:
goa = pd.read_table(os.path.join(c.settings.dirs.input, "goa_human.gaf"), names=goa_header, skiprows=41)
goa.dropna(axis=1, how="all", inplace=True)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [42]:
goa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 635794 entries, 0 to 635793
Data columns (total 16 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   DB                    635794 non-null  object 
 1   DB Object ID          635753 non-null  object 
 2   DB Object Symbol      635586 non-null  object 
 3   Qualifier             635753 non-null  object 
 4   GO ID                 635753 non-null  object 
 5   DB:Reference          635753 non-null  object 
 6   Evidence Code         635753 non-null  object 
 7   With_From             395523 non-null  object 
 8   Aspect                635753 non-null  object 
 9   DB Object Name        635753 non-null  object 
 10  DB Object Synonym     635450 non-null  object 
 11  DB Object Type        635753 non-null  object 
 12  Taxon                 635753 non-null  object 
 13  Date                  635753 non-null  float64
 14  Assigned By           635753 non-null  object 
 15  

In [45]:
goa.head()

Unnamed: 0,DB,DB Object ID,DB Object Symbol,Qualifier,GO ID,DB:Reference,Evidence Code,With_From,Aspect,DB Object Name,DB Object Synonym,DB Object Type,Taxon,Date,Assigned By,Annotation Extension
0,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0003723,GO_REF:0000043,IEA,UniProtKB-KW:KW-0694,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20220907,UniProt,
1,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0046872,GO_REF:0000043,IEA,UniProtKB-KW:KW-0479,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20220907,UniProt,
2,UniProtKB,A0A024RBG1,NUDT4B,located_in,GO:0005829,GO_REF:0000052,IDA,,C,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20161204,HPA,
3,UniProtKB,A0A075B6H7,IGKV3-7,involved_in,GO:0002250,GO_REF:0000043,IEA,UniProtKB-KW:KW-1064,P,Probable non-functional immunoglobulin kappa v...,IGKV3-7,protein,taxon:9606,20220907,UniProt,
4,UniProtKB,A0A075B6H7,IGKV3-7,located_in,GO:0005886,GO_REF:0000044,IEA,UniProtKB-SubCell:SL-0039,C,Probable non-functional immunoglobulin kappa v...,IGKV3-7,protein,taxon:9606,20220907,UniProt,


In [46]:
df_stats(goa)

Unnamed: 0,カラム名,ユニーク値数,最頻値,最頻値の出現回数,最頻値の割合,欠損値の数,欠損値の割合,タイプ
0,DB,1,UniProtKB,635753,100.0,0,0.0,object
1,DB Object ID,19861,P42858,1098,0.172709,0,0.0,object
2,DB Object Symbol,19791,HTT,1098,0.172709,167,0.026268,object
3,Qualifier,22,enables,289710,45.569584,0,0.0,object
4,GO ID,18892,GO:0005515,206265,32.444204,0,0.0,object
5,DB:Reference,54365,PMID:32296183,81758,12.860026,0,0.0,object
6,Evidence Code,21,IPI,217316,34.182458,0,0.0,object
7,With_From,62787,UniProtKB-KW:KW-0479,2299,37.786688,240230,37.786688,object
8,Aspect,3,F,291327,45.823928,0,0.0,object
9,DB Object Name,19638,Huntingtin,1098,0.172709,0,0.0,object


In [50]:
symbols = goa["DB Object Symbol"].unique()
symbols[:10]

array(['NUDT4B', 'IGKV3-7', 'IGKV1D-42', 'IGLV4-69', 'IGLV8-61',
       'IGLV4-60', 'IGLV11-55', 'IGLV10-54', 'IGLV1-50', 'IGLV5-48'],
      dtype=object)

## CITEseq Data

In [47]:
num_train = len(input.train_cite_inputs)

In [48]:
df = pd.concat([input.train_cite_inputs, input.test_cite_inputs])

In [49]:
cols = [col.split("_")[1] for col in df.columns]
cols[:10]

['A1BG',
 'A1BG-AS1',
 'A2M',
 'A2M-AS1',
 'A2ML1',
 'A4GALT',
 'AAAS',
 'AACS',
 'AADAT',
 'AAGAB']

In [51]:
len(cols)

22050

In [75]:
# _ で切ったとき 同じシンボルの列が複数ある
len(set(cols))

21967

In [52]:
no_symbol = set(cols) - set(symbols)
len(no_symbol)

8039

In [67]:
found_symbol = set(cols) & set(symbols)
len(found_symbol)

13928

In [62]:
cols_2 = [re.split("[-.]", col)[0] for col in no_symbol]
cols_2[:10]

['AC092683',
 'AC055822',
 'UMODL1',
 'AL355488',
 'FAM49A',
 'NACAP8',
 'LINC02415',
 'AC004381',
 'AC019226',
 'RN7SL566P']

In [63]:
no_symbol_2 = set(cols_2) - set(symbols)
len(no_symbol_2)

6063

In [69]:
found_symbol_2 = set(cols_2) & set(symbols)
len(found_symbol_2)

641

In [71]:
found_cols = (set(cols) & set(symbols)) | (set(cols_2) & set(symbols))
len(found_cols)

13958

TODO: symbols を column に、 cell を index に持つ df つくる
値は、 RNA の値をそのまま該当する GO の列にマッピングしてみる