# このnotebookについて
CITE-seq（`is_cite = False` の場合は Multiome）の inputs のカラムと targets のカラムを、それぞれ `num_of_inputs_columns` 個、`num_of_targets_columns` 個ずつランダムサンプリングし（行も `num_of_indices` 個だけランダムに選ぶ）、そのデータで CatBoost の学習を回す。
以下の例では、10*100 = 1000回ランダムサンプリングし、学習させている。
```
targets_seeds = range(1130, 1140)
inputs_seeds = range(100, 200)
for targets_seed in targets_seeds:
    for inputs_seed in inputs_seeds:
        model.fit(Xtr, ytr)
```
その後、それぞれのseedについて、相関係数（score）を計算し、score_dfとして表示＆保存する。

# train cite

## import, config, install, score func

In [None]:
# Runtime switch: when True the notebook mounts Google Drive and reads/writes
# under the Drive paths; when False it uses the relative '../input/' and './'.
CFG_is_Colab = True

In [None]:
import os

# Pick the input/output directories for the current runtime.
if not CFG_is_Colab:
    # Non-Colab run: relative directories next to the notebook.
    FP_INPUT_DIR = '../input/'
    FP_OUTPUT_DIR = './'
else:
    # Colab run: the data lives on Google Drive, which is mounted here.
    FP_INPUT_DIR = '/content/drive/MyDrive/kaggle/single-cell-competition/data/input/'
    FP_OUTPUT_DIR = '/content/drive/MyDrive/kaggle/single-cell-competition/data/output/'

    from google.colab import drive
    drive.mount('/content/drive')

# Dataset folders below the input directory.
FP_SPARSE_DATA = os.path.join(FP_INPUT_DIR, "multimodal-single-cell-as-sparse-matrix/")
FP_PCA_DATA = os.path.join(FP_INPUT_DIR, "single-cell-multi-svd-pkl/")

FP_CELL_METADATA = os.path.join(FP_SPARSE_DATA, "metadata.parquet")

# CITE-seq files: "*_values" hold the sparse matrices, "*_idxcol" the
# matching "index" / "columns" arrays.
FP_CITE_TRAIN_INPUTS_VALUES = os.path.join(FP_SPARSE_DATA, "train_cite_inputs_values.sparse.npz")
FP_CITE_TRAIN_INPUTS_IDXCOL = os.path.join(FP_SPARSE_DATA, "train_cite_inputs_idxcol.npz")
FP_CITE_TRAIN_TARGETS_VALUES = os.path.join(FP_SPARSE_DATA, "train_cite_targets_values.sparse.npz")
FP_CITE_TRAIN_TARGETS_IDXCOL = os.path.join(FP_SPARSE_DATA, "train_cite_targets_idxcol.npz")
FP_CITE_TEST_INPUTS_VALUES = os.path.join(FP_SPARSE_DATA, "test_cite_inputs_values.sparse.npz")
FP_CITE_TEST_INPUTS_IDXCOL = os.path.join(FP_SPARSE_DATA, "test_cite_inputs_idxcol.npz")

# Multiome files, same naming scheme as the CITE-seq ones.
FP_MULTI_TRAIN_INPUTS_VALUES = os.path.join(FP_SPARSE_DATA, "train_multi_inputs_values.sparse.npz")
FP_MULTI_TRAIN_INPUTS_IDXCOL = os.path.join(FP_SPARSE_DATA, "train_multi_inputs_idxcol.npz")
FP_MULTI_TRAIN_TARGETS_VALUES = os.path.join(FP_SPARSE_DATA, "train_multi_targets_values.sparse.npz")
FP_MULTI_TRAIN_TARGETS_IDXCOL = os.path.join(FP_SPARSE_DATA, "train_multi_targets_idxcol.npz")
FP_MULTI_TEST_INPUTS_VALUES = os.path.join(FP_SPARSE_DATA, "test_multi_inputs_values.sparse.npz")
FP_MULTI_TEST_INPUTS_IDXCOL = os.path.join(FP_SPARSE_DATA, "test_multi_inputs_idxcol.npz")

# Submission template and evaluation table.
FP_SUBMISSION = os.path.join(FP_SPARSE_DATA, "sample_submission.parquet")
FP_EVALUATION = os.path.join(FP_SPARSE_DATA, "evaluation.parquet")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install the third-party packages this notebook imports:
# colorama (Fore/Back/Style) and catboost (CatBoostRegressor).
!pip install colorama
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os, gc, pickle, warnings
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from colorama import Fore, Back, Style
# from matplotlib.ticker import MaxNLocator
import seaborn as sns
import os,gc,time,random
from tqdm.notebook import tqdm

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler, scale
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.dummy import DummyRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.metrics import mean_squared_error

import scipy
import scipy.sparse
from scipy.sparse import csc_matrix, csr_matrix, coo_matrix, lil_matrix

from catboost import CatBoostRegressor

warnings.filterwarnings('ignore')

In [None]:
def correlation_score(y_true, y_pred):
    """Return the mean row-wise Pearson correlation between two 2-D tables.

    Row ``i`` of ``y_true`` is correlated with row ``i`` of ``y_pred`` and the
    per-row coefficients are averaged.

    It is assumed that no row is constant; a constant row makes
    ``np.corrcoef`` return NaN, which propagates into the result.

    Parameters
    ----------
    y_true, y_pred : pandas.DataFrame, numpy.ndarray or nested sequence
        Tables of identical shape, one sample per row.

    Returns
    -------
    float
        Average of the per-row correlation coefficients.

    Raises
    ------
    ValueError
        If the shapes differ or if there are no rows to score.
    """
    # isinstance instead of an exact type comparison, so DataFrame subclasses
    # are unwrapped as well.
    if isinstance(y_true, pd.DataFrame):
        y_true = y_true.values
    if isinstance(y_pred, pd.DataFrame):
        y_pred = y_pred.values

    # Accept plain nested lists too; ndarrays pass through unchanged.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if y_true.shape != y_pred.shape:
        raise ValueError("Shapes are different.")
    if len(y_true) == 0:
        # Fail with a clear message instead of a bare ZeroDivisionError.
        raise ValueError("Cannot compute a correlation score for zero rows.")

    # Sequential sum keeps the same accumulation order as a plain loop.
    corrsum = sum(
        np.corrcoef(true_row, pred_row)[1, 0]
        for true_row, pred_row in zip(y_true, y_pred)
    )
    return corrsum / len(y_true)


## column selection, row selection, train models

In [None]:
# Read the training data.
# `is_cite` chooses which dataset is loaded: True -> CITE-seq files,
# False -> Multiome files.
# NOTE(review): the section heading says "train cite" but the flag is False,
# so the Multiome files are the ones loaded here.

is_cite = False

# `%time` is an IPython magic that reports how long each load takes.
# Each branch loads the sparse input/target matrices plus the "columns" arrays
# of both tables and the "index" array stored with the inputs.
if is_cite:
    %time train_inputs = scipy.sparse.load_npz(FP_CITE_TRAIN_INPUTS_VALUES)
    %time train_targets = scipy.sparse.load_npz(FP_CITE_TRAIN_TARGETS_VALUES)
    inputs_cols = np.load(FP_CITE_TRAIN_INPUTS_IDXCOL,allow_pickle=True)["columns"]
    targets_cols = np.load(FP_CITE_TRAIN_TARGETS_IDXCOL,allow_pickle=True)["columns"]
    indices = np.load(FP_CITE_TRAIN_INPUTS_IDXCOL,allow_pickle=True)["index"]
else:
    %time train_inputs = scipy.sparse.load_npz(FP_MULTI_TRAIN_INPUTS_VALUES)
    %time train_targets = scipy.sparse.load_npz(FP_MULTI_TRAIN_TARGETS_VALUES)
    inputs_cols = np.load(FP_MULTI_TRAIN_INPUTS_IDXCOL,allow_pickle=True)["columns"]
    targets_cols = np.load(FP_MULTI_TRAIN_TARGETS_IDXCOL,allow_pickle=True)["columns"]
    indices = np.load(FP_MULTI_TRAIN_INPUTS_IDXCOL,allow_pickle=True)["index"]

# Model config: hyperparameters passed verbatim to CatBoostRegressor.
# A single `model` object is created here.
from catboost import CatBoostRegressor
params = {'learning_rate': 0.1, 
          'depth': 7, 
          'l2_leaf_reg': 4, 
          'loss_function': 'MultiRMSE', 
          'eval_metric': 'MultiRMSE', 
          'task_type': 'CPU', 
          'iterations': 200,
          'od_type': 'Iter', 
          'boosting_type': 'Plain', 
          'bootstrap_type': 'Bayesian', 
          'allow_const_label': True, 
          'random_state': 1
         }
model = CatBoostRegressor(**params)

CPU times: user 43.7 s, sys: 5.51 s, total: 49.3 s
Wall time: 1min 10s
CPU times: user 19.8 s, sys: 3.81 s, total: 23.6 s
Wall time: 28 s


In [None]:

# inputs_random, _ ,selected_indices = row_col_select(
#             train_inputs,train_targets,inputs_cols,targets_cols,seed=33,
#             num_of_inputs_columns=1000, num_of_indices=10000
#             )
# pcaX = TruncatedSVD(n_components=64)
# X = slice_sparse_matrix_by_columns_list(train_inputs[selected_indices],inputs_cols,inputs_random)
# inputs_pca = pcaX.fit_transform(X)
# # print(inputs_pca)
# print(np.sum(pcaX.explained_variance_ratio_))


In [None]:
def row_col_select(train_inputs, train_targets, inputs_cols,
                   targets_cols, seed, num_of_inputs_columns=None,
                   num_of_targets_columns=None, num_of_indices=None):
    """Randomly pick input columns, target columns and rows for one seed.

    The global NumPy random generator is re-seeded with ``seed`` and then
    consumed in a fixed order: input columns (if requested), target columns
    (if requested), rows. The rows chosen for a given seed therefore depend
    on which column selections were requested in the same call.

    Side effects: re-seeds ``np.random`` and prints the shapes of both
    matrices and the selected column names.

    Parameters
    ----------
    train_inputs, train_targets : objects exposing ``.shape``
        Input and target matrices; only their shapes are read.
    inputs_cols, targets_cols : array-like indexable by an integer array
        Column names of the two matrices.
    seed : int
        Seed passed to ``np.random.seed``.
    num_of_inputs_columns : int, optional
        Number of input columns to draw; falsy (None or 0) skips the draw.
    num_of_targets_columns : int, optional
        Number of target columns to draw; falsy skips the draw.
    num_of_indices : int, optional
        Number of row positions to keep; falsy keeps all rows (shuffled).

    Returns
    -------
    tuple
        ``(inputs_random, targets_random, selected_indices)``. The first two
        are lists of column names, or None when not requested; the last is
        a list of shuffled row positions.

    Example
    -------
    ::

        train_inputs = scipy.sparse.load_npz(FP_CITE_TRAIN_INPUTS_VALUES)
        train_targets = scipy.sparse.load_npz(FP_CITE_TRAIN_TARGETS_VALUES)
        inputs_cols = np.load(FP_CITE_TRAIN_INPUTS_IDXCOL, allow_pickle=True)["columns"]
        targets_cols = np.load(FP_CITE_TRAIN_TARGETS_IDXCOL, allow_pickle=True)["columns"]
        inputs_random, targets_random, selected_indices = row_col_select(
            train_inputs, train_targets, inputs_cols, targets_cols, seed=40,
            num_of_inputs_columns=50, num_of_targets_columns=3)
    """
    def _shuffled_positions(count):
        # 0..count-1 in random order, drawn from the global generator.
        order = np.arange(count)
        np.random.shuffle(order)
        return order

    # Everything below draws from the global generator seeded here.
    np.random.seed(seed=seed)

    print(f'train_inputs shape: {train_inputs.shape}')
    print(f'train_targets shape: {train_targets.shape}')

    # Input columns: first draw from the generator.
    inputs_random = None
    if num_of_inputs_columns:
        picked = _shuffled_positions(train_inputs.shape[1])[:num_of_inputs_columns]
        inputs_random = list(inputs_cols[picked])
        print(f'inputs_random: {inputs_random}')

    # Target columns: second draw.
    targets_random = None
    if num_of_targets_columns:
        picked = _shuffled_positions(train_targets.shape[1])[:num_of_targets_columns]
        targets_random = list(targets_cols[picked])
        print(f'targets_random: {targets_random}')

    # Rows: always shuffled; a falsy limit keeps the whole permutation.
    row_order = _shuffled_positions(train_inputs.shape[0])
    row_limit = num_of_indices if num_of_indices else None
    selected_indices = list(row_order[:row_limit])

    return inputs_random, targets_random, selected_indices

def slice_sparse_matrix_by_columns_list(sparse_matrix, all_columns, sub_columns):
    """Return the columns of ``sparse_matrix`` named in ``sub_columns``.

    Parameters
    ----------
    sparse_matrix : matrix supporting ``matrix[:, list_of_ints]``
        Matrix whose columns are labelled by ``all_columns``.
    all_columns : sequence of hashable
        Name of every column of ``sparse_matrix``, in column order. If a
        name occurs more than once, its last position is used.
    sub_columns : iterable of hashable
        Names of the columns to extract; the result follows this order.

    Returns
    -------
    The result of ``sparse_matrix[:, positions]``.

    Raises
    ------
    AssertionError
        If ``all_columns`` does not have one entry per matrix column.
    KeyError
        If a requested name is not present in ``all_columns``.

    Example
    -------
    >>> sparse_matrix = csc_matrix(scipy.sparse.load_npz(FP_MULTI_TRAIN_INPUTS_VALUES))
    >>> all_columns = np.load(FP_MULTI_TRAIN_INPUTS_IDXCOL,allow_pickle=True)["columns"]
    >>> sub_columns = ['chr6:53258500-53259336', 'chr6:53287338-53288266',]
    >>> chr6_data = slice_sparse_matrix_by_columns_list(sparse_matrix, all_columns, sub_columns)
    """
    assert sparse_matrix.shape[1] == len(all_columns), 'column list size is not appropriate'

    # Column name -> column position.
    position_of = {name: pos for pos, name in enumerate(all_columns)}

    # Materialise once so a generator argument can be scanned twice.
    sub_columns = list(sub_columns)

    # An unknown name used to become a None index and fail inside the matrix
    # indexing with an obscure error; report it explicitly instead.
    missing = [col for col in sub_columns if col not in position_of]
    if missing:
        raise KeyError(f'columns not found in all_columns: {missing}')

    sub_columns_indices = [position_of[col] for col in sub_columns]
    return sparse_matrix[:, sub_columns_indices]



# this is not used in this notebook
# def drop_columns_of_sparse(sparse_matrix, all_columns, drop_columns):
#     assert sparse_matrix.shape[1] == len(all_columns), 'column list size is not appropriate'
#     dic = dict((k,v) for v,k in enumerate(all_columns))
#     remaining_columns = [col for col in all_columns if col not in drop_columns]
#     remaining_columns_indices = [dic.get(col) for col in remaining_columns]
#     return sparse_matrix[:,remaining_columns_indices]

In [None]:
# Seed sweep: the outer loop draws a random set of target columns per
# targets seed, the inner loop draws random input columns and rows per inputs
# seed, fits the model and records the validation correlation score.
targets_seeds = range(1000,1010)
inputs_seeds = range(0,10)
# One row per inputs seed; one column is appended per targets seed below.
score_df = pd.DataFrame(index=[f'inputs seed{col}' for col in inputs_seeds])

if is_cite:
    # citeseq config
    num_of_targets_columns = 5
    num_of_inputs_columns = 200
    num_of_indices = 10000
else:
    # multiome config
    num_of_targets_columns = 20
    num_of_inputs_columns = 1000
    num_of_indices = 100
    n_components = 5  # only read in the `not is_cite` branch below

for targets_seed in tqdm(targets_seeds):
    # scores of the current targets seed, one per inputs seed
    score_list = []

    # targets columns selection (only the middle return value is needed)
    _, targets_random, _ = row_col_select(
        train_inputs, train_targets, inputs_cols, targets_cols, seed=targets_seed,
        num_of_targets_columns=num_of_targets_columns,
    )

    # inputs seeds loop
    for inputs_seed in tqdm(inputs_seeds):
        # inputs columns & row selection
        inputs_random, _, selected_indices = row_col_select(
            train_inputs, train_targets, inputs_cols, targets_cols, seed=inputs_seed,
            num_of_inputs_columns=num_of_inputs_columns, num_of_indices=num_of_indices,
        )

        # restrict both matrices to the selected rows, then to the selected columns
        X = slice_sparse_matrix_by_columns_list(train_inputs[selected_indices], inputs_cols, inputs_random)
        y = slice_sparse_matrix_by_columns_list(train_targets[selected_indices], targets_cols, targets_random)

        if not is_cite:
            # NOTE(review): the SVD is fitted on all selected rows before the
            # train/validation split, so validation rows influence the
            # projection - confirm this is acceptable for the comparison.
            pcaX = TruncatedSVD(n_components=n_components)
            X = pcaX.fit_transform(X)
            print(np.sum(pcaX.explained_variance_ratio_))

        # densify whatever is still sparse
        if not isinstance(X, np.ndarray):
            X = X.toarray()
        if not isinstance(y, np.ndarray):
            y = y.toarray()

        # Split X and y in one call: this yields the same partition as two
        # calls with an identical random_state, but guarantees the rows stay
        # paired and validates that both have the same length.
        Xtr, Xva, ytr, yva = train_test_split(X, y, shuffle=True, random_state=42)

        model.fit(Xtr, ytr)
        del Xtr, ytr
        gc.collect()
        print(f'X shape: {X.shape}')
        print(f'y shape: {y.shape}')

        preds = model.predict(Xva)
        r = correlation_score(yva, preds)
        score_list.append(r)
        print(f'r: {r}')
        print(f'score_list: {score_list}')

    score_df[f'protein seed{targets_seed}'] = score_list

display(score_df)
# NOTE(review): index=False drops the 'inputs seed<N>' row labels from the
# saved file; rows are then identifiable only by position. Kept as-is so the
# CSV layout does not change - confirm whether the labels should be saved.
score_df.to_csv(os.path.join(FP_OUTPUT_DIR, 'score_df.csv'), index=False)

Output hidden; open in https://colab.research.google.com to view.