## Import

In [None]:
# Mount Google Drive at /content/drive; the CSV paths used later live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.5.1.post0-py2.py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 726 kB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.5.1.post0


In [None]:
import pandas as pd
import random
import os
import numpy as np

# Encoding
import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder

# Scaling
from sklearn.preprocessing import StandardScaler

# Clustering
from sklearn.cluster import KMeans

# Visualization
import matplotlib.pyplot  as plt
import seaborn as sns

# Count
from collections import Counter

In [None]:
class CFG:
    """Notebook-wide configuration constants."""
    # Seed passed to seed_everything() and to the KMeans models in this notebook.
    SEED = 42

In [None]:
def seed_everything(seed):
    """Seed Python's `random`, NumPy's global RNG and PYTHONHASHSEED with `seed`.

    Args:
        seed: Integer seed; it is stored as a string in the environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
seed_everything(CFG.SEED) # Fix the random seeds

## Data Load

In [None]:
# Read the train / test tables and the per-SNP metadata table from Google Drive.
train = pd.read_csv('/content/drive/MyDrive/유전체공모전/data/train.csv')
test = pd.read_csv('/content/drive/MyDrive/유전체공모전/data/test.csv')
snp_info = pd.read_csv('/content/drive/MyDrive/유전체공모전/data/snp_info.csv')

In [None]:
def get_x_y(df):
    """Separate the feature columns (and the target, when present) from `df`.

    Args:
        df: DataFrame with an 'id' column and optionally a 'class' column.

    Returns:
        `(features, target)` when `df` has a 'class' column, where `features`
        is `df` without 'id' and 'class' and `target` is the 'class' column;
        otherwise only `df` without 'id'.
    """
    # Unlabelled frame: there is no target to return.
    if 'class' not in df.columns:
        return df.drop(columns=['id'])

    features = df.drop(columns=['id', 'class'])
    target = df['class']
    return features, target

In [None]:
# train has a 'class' column, so get_x_y returns (features, target); test yields features only.
train_x, y_train = get_x_y(train)
test_x = get_x_y(test)

In [None]:
# Keep everything from the fourth column onward (the first three columns are dropped).
# .copy() makes X_train / X_test independent frames: later cells add and overwrite
# columns on them, which on a bare slice triggers SettingWithCopyWarning.
X_train = train_x.iloc[:,3:].copy()
X_test = test_x.iloc[:,3:].copy()

### 파생변수생성

#### SNP 분할

In [None]:
# Distinct values found across every column except the first one.
np.unique(X_train.iloc[:,1:].values)

array(['A A', 'A G', 'C A', 'C C', 'G A', 'G G'], dtype=object)

In [None]:
# Names of all columns except the first; these are the columns split into two parts below.
X_list = X_train.iloc[:,1:].columns.to_list()

In [None]:
# Split each space-separated value into its two parts.
# All "<column>_0" columns are appended first, then all "<column>_1" columns.
for part_idx in (0, 1):
    for snp in X_list:
        parts = X_train[snp].astype('str').str.split(' ')
        X_train[f'{snp}_{part_idx}'] = parts.str[part_idx]

In [None]:
# Same split as for the training frame: "<column>_0" columns first, then "<column>_1".
for part_idx in (0, 1):
    for snp in X_list:
        parts = X_test[snp].astype('str').str.split(' ')
        X_test[f'{snp}_{part_idx}'] = parts.str[part_idx]

In [None]:
# Preview the columns from position 16 onward (the newly added split columns).
X_train.iloc[:,16:].head()

Unnamed: 0,SNP_01_0,SNP_02_0,SNP_03_0,SNP_04_0,SNP_05_0,SNP_06_0,SNP_07_0,SNP_08_0,SNP_09_0,SNP_10_0,...,SNP_06_1,SNP_07_1,SNP_08_1,SNP_09_1,SNP_10_1,SNP_11_1,SNP_12_1,SNP_13_1,SNP_14_1,SNP_15_1
0,G,A,A,G,C,A,A,G,A,G,...,A,A,G,A,G,G,A,A,A,A
1,A,A,C,A,A,A,A,G,A,A,...,G,A,A,A,G,A,A,G,A,A
2,G,G,A,G,C,G,A,G,G,A,...,G,A,A,A,G,A,A,A,A,A
3,A,G,A,G,A,G,G,A,G,A,...,G,G,A,G,G,G,G,G,A,G
4,G,G,C,A,C,A,A,A,A,G,...,A,A,A,A,G,A,A,G,A,A


#### SNP 합

In [None]:
# Columns at positions 2, 3 and 4 of snp_info, each kept as a one-column frame.
# NOTE(review): presumably chromosome, genetic distance (cM) and physical
# position, going by the variable names — confirm against snp_info.csv.
chrom = pd.DataFrame(snp_info.iloc[:,2])
cm = pd.DataFrame(snp_info.iloc[:,3])
pos = pd.DataFrame(snp_info.iloc[:,4])
# Pairwise and three-way combinations, each clustered separately further down.
chorm_cm = pd.concat([chrom,cm],axis=1)
chorm_pos = pd.concat([chrom,pos],axis=1)
# Fixed: this used to be built from [chrom, cm], which made it an exact duplicate
# of chorm_cm and therefore produced duplicated cm_pos_clust* features.
cm_pos = pd.concat([cm,pos],axis=1)
chrom_cm_pos = pd.concat([chrom,cm,pos],axis=1)
# Order matters: it must match the cluster column names assigned after clustering.
info_list = [chrom,cm,pos,chorm_cm,chorm_pos,cm_pos,chrom_cm_pos]

In [None]:
# One KMeans (6 clusters) per entry of info_list; the labels of each run become
# one column of `predict`, named in the same order as info_list.
cluster_columns = ['chrom_clust','cm_clust','pos_clust','chorm_cm_clust','chorm_pos_clust','cm_pos_clust','chrom_cm_pos_clust']
labels_by_column = {}
for column_name, coords in zip(cluster_columns, info_list):
  model = KMeans(n_clusters=6, random_state=CFG.SEED)
  model.fit(coords)
  labels_by_column[column_name] = model.predict(coords)
predict = pd.DataFrame(labels_by_column)
# Attach the cluster labels to the SNP metadata, row by row.
snp_info = pd.concat([snp_info, predict], axis=1)

In [None]:
def _snp_ids_per_cluster(label_column):
  """Return, for cluster ids 0 to 5, the SNP_id values whose `label_column` equals that id."""
  return [
    snp_info.loc[snp_info[label_column] == cluster_id, 'SNP_id'].to_list()
    for cluster_id in range(6)
  ]

# One list of six SNP-id lists per clustering.
chorm_list = _snp_ids_per_cluster('chrom_clust')
cm_list = _snp_ids_per_cluster('cm_clust')
pos_list = _snp_ids_per_cluster('pos_clust')
chrom_cm_list = _snp_ids_per_cluster('chorm_cm_clust')
chrom_pos_list = _snp_ids_per_cluster('chorm_pos_clust')
cm_pos_list = _snp_ids_per_cluster('cm_pos_clust')
chrom_cm_pos_list = _snp_ids_per_cluster('chrom_cm_pos_clust')

In [None]:
# all_list[j] holds the per-cluster SNP-id lists for the clustering named all_columns[j];
# both sequences follow the column order of `predict`.
all_list = [chorm_list,cm_list,pos_list,chrom_cm_list,chrom_pos_list,cm_pos_list,chrom_cm_pos_list]
all_columns = predict.columns.to_list()

In [None]:
# For every clustering (7) and cluster id (6), join the values of the member
# SNP columns with single spaces into one new column named "<clustering><id>".
for group_idx in range(7):
  for cluster_idx in range(6):
    members = all_list[group_idx][cluster_idx]
    # Only clusters holding 2 to 8 SNPs produce a combined feature.
    if not 2 <= len(members) <= 8:
      continue
    combined = X_train[members[0]]
    for snp in members[1:]:
      combined = combined + ' ' + X_train[snp]
    X_train[f'{all_columns[group_idx]}{cluster_idx}'] = combined

In [None]:
# Same combined-SNP features as for the training frame, built on X_test.
for group_idx in range(7):
  for cluster_idx in range(6):
    members = all_list[group_idx][cluster_idx]
    # Only clusters holding 2 to 8 SNPs produce a combined feature.
    if not 2 <= len(members) <= 8:
      continue
    combined = X_test[members[0]]
    for snp in members[1:]:
      combined = combined + ' ' + X_test[snp]
    X_test[f'{all_columns[group_idx]}{cluster_idx}'] = combined

In [None]:
# Preview the columns from position 46 onward (the combined-cluster columns).
X_train.iloc[:,46:].head()

Unnamed: 0,chrom_clust0,chrom_clust4,cm_clust0,cm_clust4,pos_clust0,pos_clust2,pos_clust4,chorm_cm_clust0,chorm_cm_clust3,chorm_cm_clust5,chorm_pos_clust0,chorm_pos_clust2,chorm_pos_clust4,cm_pos_clust0,cm_pos_clust3,cm_pos_clust5,chrom_cm_pos_clust0,chrom_cm_pos_clust1,chrom_cm_pos_clust3
0,A G A A G A C A A A A A G G A A,A A A A A A,G G A A G G A A A A A A,G A C A A A A A G G,G A C A A A A A A A A A,G G A A G G,G G A A A A,G G A A A A A A,G A C A A A A A G G A A,G G A A,G A C A A A A A A A A A,G G A A G G,G G A A A A,G G A A A A A A,G A C A A A A A G G A A,G G A A,G G A A G G,G G A A A A,G A C A A A A A A A A A
1,A G C A A A A A A G A A G A A A,G A G G A A,A G C A A G G A G G A A,A A A A A G A A G A,A A A A A G A A G A G G,A G C A A G,G A A A A A,A G G A G G A A,A A A A A G A A G A A A,A G C A,A A A A A G A A G A G G,A G C A A G,G A A A A A,A G G A G G A A,A A A A A G A A G A A A,A G C A,A G C A A G,G A A A A A,A A A A A G A A G A G G
2,G G A A G A C C G G A A G A G A,A A A A A A,G G A A A G A A A A A A,G A C C G G A A G A,G A C C G G A A A A A A,G G A A A G,G A G A A A,A G A A A A A A,G A C C G G A A G A G A,G G A A,G A C C G G A A A A A A,G G A A A G,G A G A A A,A G A A A A A A,G A C C G G A A G A G A,G G A A,G G A A A G,G A G A A A,G A C C G G A A A A A A
3,G G A A G A A A G G G G A A G G,G G G G A A,A A A A A G G G G G A A,G A A A G G G G A A,G A A A G G G G G G G G,A A A A A G,A A G G A A,A G G G G G A A,G A A A G G G G A A G G,A A A A,G A A A G G G G G G G G,A A A A A G,A A G G A A,A G G G G G A A,G A A A G G G G A A G G,A A A A,A A A A A G,A A G G A A,G A A A G G G G G G G G
4,G G C C A A C C A A A A A A A A,A A A G A A,G G C C G G A A A G A A,A A C C A A A A A A,A A C C A A A A A A A G,G G C C G G,A A A A A A,G G A A A G A A,A A C C A A A A A A A A,G G C C,A A C C A A A A A A A G,G G C C G G,A A A A A A,G G A A A G A A,A A C C A A A A A A A A,G G C C,G G C C G G,A A A A A A,A A C C A A A A A A A G


#### SNP A,C,G 개수

In [None]:
# Create the three count columns, filled for now with the counts of the FIRST
# row only (columns 16:46); per-row values are written in a later cell.
first_row_letters = Counter(X_train.iloc[0, 16:46])
for base in ('A', 'C', 'G'):
    X_train[f'{base}_count'] = first_row_letters[base]

In [None]:
# Create the three count columns on X_test, filled for now with the counts of
# its first row only (columns 16:46); per-row values are written in a later cell.
first_row_letters = Counter(X_test.iloc[0, 16:46])
for base in ('A', 'C', 'G'):
    X_test[f'{base}_count'] = first_row_letters[base]

In [None]:
# Per-row number of 'A', 'C' and 'G' values among columns 16:46.
# Vectorized replacement for the row-by-row `X_train[col].iloc[i] = ...` loops:
# that chained assignment raises SettingWithCopyWarning and writes nothing when
# pandas Copy-on-Write is active.
allele_columns = X_train.iloc[:, 16:46]
for base in ('A', 'C', 'G'):
    X_train[f'{base}_count'] = (allele_columns == base).sum(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [None]:
# Per-row number of 'A', 'C' and 'G' values among columns 16:46 of X_test.
# Vectorized replacement for the row-by-row `X_test[col].iloc[i] = ...` loops:
# that chained assignment raises SettingWithCopyWarning and writes nothing when
# pandas Copy-on-Write is active.
allele_columns = X_test.iloc[:, 16:46]
for base in ('A', 'C', 'G'):
    X_test[f'{base}_count'] = (allele_columns == base).sum(axis=1)

In [None]:
# Pairwise sums of the per-letter counts: A+C, A+G, C+G (in that column order).
for left, right in (('A', 'C'), ('A', 'G'), ('C', 'G')):
    X_train[f'{left}+{right}_count'] = X_train[f'{left}_count'] + X_train[f'{right}_count']

In [None]:
# Pairwise sums of the per-letter counts on X_test: A+C, A+G, C+G (in that column order).
for left, right in (('A', 'C'), ('A', 'G'), ('C', 'G')):
    X_test[f'{left}+{right}_count'] = X_test[f'{left}_count'] + X_test[f'{right}_count']

In [None]:
# Preview the columns from position 65 onward (the count columns).
X_train.iloc[:,65:].head()

Unnamed: 0,A_count,C_count,G_count,A+C_count,A+G_count,C+G_count
0,20,1,9,21,29,10
1,21,1,8,22,29,9
2,18,2,10,20,28,12
3,12,0,18,12,30,18
4,18,4,8,22,26,12


#### trait 문자형 변환

In [None]:
# Cast 'trait' to string in both frames so the dtype-based column split below
# picks it up as an object column.
for frame in (X_train, X_test):
    frame['trait'] = frame['trait'].astype(str)

In [None]:
# Column names split by dtype: object columns vs. everything else.
# Num_list is the set of columns passed to the scaler below.
Cat_list = X_train.select_dtypes(include='object').columns.to_list()
Num_list = X_train.select_dtypes(exclude='object').columns.to_list()

In [None]:
# Shapes of the feature frames and the target before scaling.
X_train.shape, X_test.shape, y_train.shape

((262, 71), (175, 71), (262,))

### Scaling

In [None]:
# Standardize the non-object columns: the scaler is fitted on the training
# frame only and the same fitted transform is then applied to the test frame.
stscaler = StandardScaler()
X_train[Num_list] = stscaler.fit_transform(X_train[Num_list])
X_test[Num_list] = stscaler.transform(X_test[Num_list])

In [None]:
# Preview the training frame after scaling.
X_train.head()

Unnamed: 0,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,...,cm_pos_clust5,chrom_cm_pos_clust0,chrom_cm_pos_clust1,chrom_cm_pos_clust3,A_count,C_count,G_count,A+C_count,A+G_count,C+G_count
0,2,G G,A G,A A,G A,C A,A A,A A,G G,A A,...,G G A A,G G A A G G,G G A A A A,G A C A A A A A A A A A,0.986104,-1.002583,-0.663529,0.663529,1.002583,-0.986104
1,2,A G,A G,C A,A A,A A,A G,A A,G A,A A,...,A G C A,A G C A A G,G A A A A A,A A A A A G A A G A G G,1.270327,-1.002583,-0.938165,0.938165,1.002583,-1.270327
2,2,G G,G G,A A,G A,C C,G G,A A,G A,G A,...,G G A A,G G A A A G,G A G A A A,G A C C G G A A A A A A,0.417657,-0.050856,-0.388893,0.388893,0.050856,-0.417657
3,1,A A,G G,A A,G A,A A,G G,G G,A A,G G,...,A A A A,A A A A A G,A A G G A A,G A A A G G G G G G G G,-1.287685,-1.95431,1.808195,-1.808195,1.95431,1.287685
4,2,G G,G G,C C,A A,C C,A A,A A,A A,A A,...,G G C C,G G C C G G,A A A A A A,A A C C A A A A A A A G,0.417657,1.852599,-0.938165,0.938165,-1.852599,-0.417657


### Encoding

In [None]:
def catboost_encode_multiclass(X,X_t,y):
    """Target-encode the object columns of X and X_t once per target class.

    The target is one-hot encoded; for each resulting class column a
    CatBoostEncoder is fitted on the object columns of `X` against that binary
    column, and the encoded columns (suffixed with "_<class column name>") are
    appended to the non-object columns.

    Args:
        X: Training features (DataFrame).
        X_t: Test features with the same columns as `X` (DataFrame).
        y: Training target, row-aligned with `X`; it is cast to str.

    Returns:
        Tuple `(X_encoded, X_t_encoded)`: the non-object columns followed by
        one block of encoded columns per class.
    """
    y = y.astype(str)
    enc = ce.OneHotEncoder().fit(y)
    y_onehot = enc.transform(y)

    X_obj = X.select_dtypes('object')
    X_t_obj = X_t.select_dtypes('object')
    train_parts = [X.select_dtypes(exclude='object')]
    test_parts = [X_t.select_dtypes(exclude='object')]

    for class_ in y_onehot.columns:
        class_target = y_onehot[class_]
        encoder = ce.CatBoostEncoder()
        encoder.fit(X_obj, class_target)
        # Training rows are transformed WITH the target: per the category_encoders
        # docs, supervised encoders treat a call without y as test data, which
        # would encode each training row with statistics that include its own
        # label (target leakage).
        temp = encoder.transform(X_obj, class_target)
        # Test rows have no target and use the statistics learned in fit().
        temp_t = encoder.transform(X_t_obj)
        temp.columns = [str(x)+'_'+str(class_) for x in temp.columns]
        temp_t.columns = [str(x)+'_'+str(class_) for x in temp_t.columns]
        train_parts.append(temp)
        test_parts.append(temp_t)

    return pd.concat(train_parts, axis=1), pd.concat(test_parts, axis=1)

In [None]:
# Replace both frames with their encoded versions (object columns become numeric).
X_train, X_test = catboost_encode_multiclass(X_train,X_test,y_train)

In [None]:
# Shapes after encoding.
X_train.shape, X_test.shape

((262, 201), (175, 201))

### Clustering

In [None]:
# Fix the feature set by name before any cluster label is added, so that the
# clust_* columns never feed into a later KMeans fit. This replaces the
# hard-coded `.iloc[:, :201]` slice, which was only correct for exactly 201
# feature columns, and keeps the cell safe to re-run.
feature_cols = [col for col in X_train.columns if not str(col).startswith('clust_')]

# One KMeans per cluster count (3 to 6), fitted on the training features; the
# predicted label is stored as a new column in both frames.
for i in range(3,7):
  clust = KMeans(n_clusters=i, random_state=CFG.SEED)
  clust.fit(X_train[feature_cols])
  X_train[f'clust_{i}'] = clust.predict(X_train[feature_cols])
  X_test[f'clust_{i}'] = clust.predict(X_test[feature_cols])


In [None]:
# Final shapes after adding the cluster-label columns.
X_train.shape, X_test.shape, y_train.shape

((262, 205), (175, 205), (262,))

## Save engineered features

In [None]:
# Write the engineered train / test features and the training target to Google Drive
# (index=False: the row index is not written).
X_train.to_csv('/content/drive/MyDrive/유전체공모전/data/X_train_new.csv', index=False)
X_test.to_csv('/content/drive/MyDrive/유전체공모전/data/X_test_new.csv', index=False)
y_train.to_csv('/content/drive/MyDrive/유전체공모전/data/y_train_new.csv', index=False)