## Import

In [None]:
# Colab only: mount Google Drive at /content/drive so the CSV paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the third-party packages imported further down (category_encoders, imblearn).
# NOTE(review): versions are not pinned, so a fresh run may pick up different releases.
!pip install category_encoders
!pip install imblearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.5.1.post0-py2.py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.4/72.4 KB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.5.1.post0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import random
import os
import numpy as np

# Encoding
import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder

# Scaling
from sklearn.preprocessing import StandardScaler

# Clustering
from sklearn.cluster import KMeans

# Visualization
import matplotlib.pyplot  as plt
import seaborn as sns

# Count
from collections import Counter

#Oversampling
from imblearn.over_sampling import SMOTE, ADASYN

In [None]:
class CFG:
    """Notebook-wide configuration constants."""
    # Seed passed to seed_everything, KMeans and SMOTE in this notebook.
    SEED = 42

In [None]:
def seed_everything(seed):
    """Seed the global random number generators used by this notebook.

    Sets the PYTHONHASHSEED environment variable to ``str(seed)`` and seeds both
    Python's ``random`` module and NumPy's global generator with ``seed``.
    Returns None.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)
seed_everything(CFG.SEED) # Fix the random seeds

## Data Load

In [None]:
# Load the training table, the test table and the per-SNP metadata from Drive.
train = pd.read_csv('/content/drive/MyDrive/유전체공모전/data/train.csv')
test = pd.read_csv('/content/drive/MyDrive/유전체공모전/data/test.csv')
snp_info = pd.read_csv('/content/drive/MyDrive/유전체공모전/data/snp_info.csv')

In [None]:
def get_x_y(df):
    """Split a raw table into features and, when available, the target.

    If ``df`` has a 'class' column, returns ``(features, target)`` where
    ``features`` is ``df`` without 'id' and 'class' and ``target`` is the
    'class' column. Otherwise returns only ``df`` without 'id'.
    """
    if 'class' not in df.columns:
        return df.drop(columns=['id'])
    features = df.drop(columns=['id', 'class'])
    target = df['class']
    return features, target

In [None]:
# train has a 'class' column, so it yields (features, target); test yields features only.
train_x, y_train = get_x_y(train)
test_x = get_x_y(test)

In [None]:
# Drop the first three feature columns by position (presumably father/mother/gender,
# which later sections drop by name — confirm against the CSV header).
# .copy() makes X_train/X_test independent frames: the cells below add many columns to
# them, and assigning into a bare iloc slice triggers SettingWithCopyWarning.
X_train = train_x.iloc[:,3:].copy()
X_test = test_x.iloc[:,3:].copy()

## Feature_set_01

### 파생변수생성

#### SNP 분할

In [None]:
# Inspect the distinct values found in every column after the first one.
np.unique(X_train.iloc[:,1:].values)

array(['A A', 'A G', 'C A', 'C C', 'G A', 'G G'], dtype=object)

In [None]:
# Names of every column except the first one; these are the columns split per allele below.
X_list = X_train.columns[1:].to_list()

In [None]:
# Split each "X Y" genotype string on the space into two columns: <name>_0 (first
# token) and <name>_1 (second token). All _0 columns are created before any _1
# column so the resulting column order matches the positional slices used later.
for allele_idx in (0, 1):
    for snp_col in X_list:
        tokens = X_train[snp_col].astype('str').str.split(' ')
        X_train[f'{snp_col}_{allele_idx}'] = tokens.str[allele_idx]

In [None]:
# Same allele split as for the training frame: <name>_0 columns first, then <name>_1.
for allele_idx in (0, 1):
    for snp_col in X_list:
        tokens = X_test[snp_col].astype('str').str.split(' ')
        X_test[f'{snp_col}_{allele_idx}'] = tokens.str[allele_idx]

In [None]:
# Preview the columns from position 16 onward (the per-allele columns created above).
X_train.iloc[:,16:].head()

Unnamed: 0,SNP_01_0,SNP_02_0,SNP_03_0,SNP_04_0,SNP_05_0,SNP_06_0,SNP_07_0,SNP_08_0,SNP_09_0,SNP_10_0,...,SNP_06_1,SNP_07_1,SNP_08_1,SNP_09_1,SNP_10_1,SNP_11_1,SNP_12_1,SNP_13_1,SNP_14_1,SNP_15_1
0,G,A,A,G,C,A,A,G,A,G,...,A,A,G,A,G,G,A,A,A,A
1,A,A,C,A,A,A,A,G,A,A,...,G,A,A,A,G,A,A,G,A,A
2,G,G,A,G,C,G,A,G,G,A,...,G,A,A,A,G,A,A,A,A,A
3,A,G,A,G,A,G,G,A,G,A,...,G,G,A,G,G,G,G,G,A,G
4,G,G,C,A,C,A,A,A,A,G,...,A,A,A,A,G,A,A,G,A,A


#### SNP 합

In [None]:
# Pick three SNP attributes out of snp_info by position (columns 2, 3 and 4 —
# presumably chromosome, cM and position, as the variable names suggest; confirm
# against the CSV header) and build every combination used for clustering.
chrom = pd.DataFrame(snp_info.iloc[:,2])
cm = pd.DataFrame(snp_info.iloc[:,3])
pos = pd.DataFrame(snp_info.iloc[:,4])
chorm_cm = pd.concat([chrom,cm],axis=1)
chorm_pos = pd.concat([chrom,pos],axis=1)
# Fixed: this was built from [chrom, cm], which made it an exact duplicate of
# chorm_cm, so the cm_pos_clust* features merely repeated chorm_cm_clust*.
# NOTE(review): the cm_pos_clust* features change as a result; re-check the
# hard-coded column positions used later in the notebook.
cm_pos = pd.concat([cm,pos],axis=1)
chrom_cm_pos = pd.concat([chrom,cm,pos],axis=1)
# Order must match the cluster column names assigned after the KMeans loop.
info_list = [chrom,cm,pos,chorm_cm,chorm_pos,cm_pos,chrom_cm_pos]

In [None]:
# Cluster the SNPs into 6 groups once per attribute combination in info_list and
# attach the resulting labels to snp_info, one column per combination.
cluster_columns = ['chrom_clust','cm_clust','pos_clust','chorm_cm_clust','chorm_pos_clust','cm_pos_clust','chrom_cm_pos_clust']
label_frames = []
for features in info_list:
  model = KMeans(n_clusters=6, random_state=CFG.SEED)
  model.fit(features)
  label_frames.append(pd.DataFrame(model.predict(features)))
# Concatenate once instead of growing a frame inside the loop.
predict = pd.concat(label_frames, axis=1)
predict.columns = cluster_columns
# Drop any cluster columns left over from a previous run of this cell so that
# re-running it does not append duplicate columns to snp_info.
snp_info = pd.concat([snp_info.drop(columns=cluster_columns, errors='ignore'), predict], axis=1)

In [None]:
# For each clustering, collect the SNP ids belonging to cluster 0..5.
# Each variable is a list of six lists of SNP_id values.
chorm_list = [snp_info.query(f'chrom_clust == {k}').SNP_id.to_list() for k in range(6)]
cm_list = [snp_info.query(f'cm_clust == {k}').SNP_id.to_list() for k in range(6)]
pos_list = [snp_info.query(f'pos_clust == {k}').SNP_id.to_list() for k in range(6)]
chrom_cm_list = [snp_info.query(f'chorm_cm_clust == {k}').SNP_id.to_list() for k in range(6)]
chrom_pos_list = [snp_info.query(f'chorm_pos_clust == {k}').SNP_id.to_list() for k in range(6)]
cm_pos_list = [snp_info.query(f'cm_pos_clust == {k}').SNP_id.to_list() for k in range(6)]
chrom_cm_pos_list = [snp_info.query(f'chrom_cm_pos_clust == {k}').SNP_id.to_list() for k in range(6)]

In [None]:
# Per-clustering member lists, in the same order as the cluster column names in `predict`.
all_list = [chorm_list,cm_list,pos_list,chrom_cm_list,chrom_pos_list,cm_pos_list,chrom_cm_pos_list]
all_columns = predict.columns.to_list()

In [None]:
# For every clustering j and cluster i, join the genotype strings of the member
# SNPs with single spaces into one column named "<clustering><i>".
# Only clusters with 2 to 8 members produce a column; other sizes are skipped.
for j in range(7):
  for i in range(6):
    members = all_list[j][i]
    if not 2 <= len(members) <= 8:
      continue
    joined = X_train[members[0]]
    for snp_id in members[1:]:
      joined = joined + ' ' + X_train[snp_id]
    X_train[f'{all_columns[j]}{i}'] = joined

In [None]:
# Same cluster-wise genotype concatenation as for the training frame:
# one space-joined column per cluster that has between 2 and 8 member SNPs.
for j in range(7):
  for i in range(6):
    members = all_list[j][i]
    if not 2 <= len(members) <= 8:
      continue
    joined = X_test[members[0]]
    for snp_id in members[1:]:
      joined = joined + ' ' + X_test[snp_id]
    X_test[f'{all_columns[j]}{i}'] = joined

In [None]:
# Preview the columns from position 46 onward (the cluster-combination columns).
X_train.iloc[:,46:].head()

Unnamed: 0,chrom_clust0,chrom_clust4,cm_clust0,cm_clust4,pos_clust0,pos_clust2,pos_clust4,chorm_cm_clust0,chorm_cm_clust3,chorm_cm_clust5,chorm_pos_clust0,chorm_pos_clust2,chorm_pos_clust4,cm_pos_clust0,cm_pos_clust3,cm_pos_clust5,chrom_cm_pos_clust0,chrom_cm_pos_clust1,chrom_cm_pos_clust3
0,A G A A G A C A A A A A G G A A,A A A A A A,G G A A G G A A A A A A,G A C A A A A A G G,G A C A A A A A A A A A,G G A A G G,G G A A A A,G G A A A A A A,G A C A A A A A G G A A,G G A A,G A C A A A A A A A A A,G G A A G G,G G A A A A,G G A A A A A A,G A C A A A A A G G A A,G G A A,G G A A G G,G G A A A A,G A C A A A A A A A A A
1,A G C A A A A A A G A A G A A A,G A G G A A,A G C A A G G A G G A A,A A A A A G A A G A,A A A A A G A A G A G G,A G C A A G,G A A A A A,A G G A G G A A,A A A A A G A A G A A A,A G C A,A A A A A G A A G A G G,A G C A A G,G A A A A A,A G G A G G A A,A A A A A G A A G A A A,A G C A,A G C A A G,G A A A A A,A A A A A G A A G A G G
2,G G A A G A C C G G A A G A G A,A A A A A A,G G A A A G A A A A A A,G A C C G G A A G A,G A C C G G A A A A A A,G G A A A G,G A G A A A,A G A A A A A A,G A C C G G A A G A G A,G G A A,G A C C G G A A A A A A,G G A A A G,G A G A A A,A G A A A A A A,G A C C G G A A G A G A,G G A A,G G A A A G,G A G A A A,G A C C G G A A A A A A
3,G G A A G A A A G G G G A A G G,G G G G A A,A A A A A G G G G G A A,G A A A G G G G A A,G A A A G G G G G G G G,A A A A A G,A A G G A A,A G G G G G A A,G A A A G G G G A A G G,A A A A,G A A A G G G G G G G G,A A A A A G,A A G G A A,A G G G G G A A,G A A A G G G G A A G G,A A A A,A A A A A G,A A G G A A,G A A A G G G G G G G G
4,G G C C A A C C A A A A A A A A,A A A G A A,G G C C G G A A A G A A,A A C C A A A A A A,A A C C A A A A A A A G,G G C C G G,A A A A A A,G G A A A G A A,A A C C A A A A A A A A,G G C C,A A C C A A A A A A A G,G G C C G G,A A A A A A,G G A A A G A A,A A C C A A A A A A A A,G G C C,G G C C G G,A A A A A A,A A C C A A A A A A A G


#### SNP A,C,G 개수

In [None]:
# Create the count columns, filled for now with the counts of the first row only
# (columns at positions 16-45); the per-row values are written in a later cell.
first_row_counts = Counter(X_train.iloc[:,16:46].iloc[0,:])
for count_col, base in (('A_count', 'A'), ('C_count', 'C'), ('G_count', 'G')):
    X_train[count_col] = first_row_counts[base]

In [None]:
# Create the count columns on the test frame, initialised from its first row
# (columns at positions 16-45); the per-row values are written in a later cell.
first_row_counts = Counter(X_test.iloc[:,16:46].iloc[0,:])
for count_col, base in (('A_count', 'A'), ('C_count', 'C'), ('G_count', 'G')):
    X_test[count_col] = first_row_counts[base]

In [None]:
# Per-row number of 'A', 'C' and 'G' values across the 30 single-allele columns
# (positions 16-45).
# The previous row-by-row `X_train[col].iloc[i] = ...` loops used chained
# assignment, which raises SettingWithCopyWarning and is not guaranteed to write
# back into X_train. A vectorised comparison assigns each whole column at once.
allele_block = X_train.iloc[:,16:46]
for base in ('A', 'C', 'G'):
    X_train[f'{base}_count'] = (allele_block == base).sum(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [None]:
# Per-row number of 'A', 'C' and 'G' values across the 30 single-allele columns
# (positions 16-45) of the test frame.
# Replaces the chained `X_test[col].iloc[i] = ...` loops, which are not guaranteed
# to write back into X_test, with one vectorised assignment per column.
allele_block = X_test.iloc[:,16:46]
for base in ('A', 'C', 'G'):
    X_test[f'{base}_count'] = (allele_block == base).sum(axis=1)

In [None]:
# Pairwise sums of the base counts.
for total_col, left_col, right_col in (
    ('A+C_count', 'A_count', 'C_count'),
    ('A+G_count', 'A_count', 'G_count'),
    ('C+G_count', 'C_count', 'G_count'),
):
    X_train[total_col] = X_train[left_col] + X_train[right_col]

In [None]:
# Pairwise sums of the base counts on the test frame.
for total_col, left_col, right_col in (
    ('A+C_count', 'A_count', 'C_count'),
    ('A+G_count', 'A_count', 'G_count'),
    ('C+G_count', 'C_count', 'G_count'),
):
    X_test[total_col] = X_test[left_col] + X_test[right_col]

In [None]:
# Preview the columns from position 65 onward (the base-count columns).
X_train.iloc[:,65:].head()

Unnamed: 0,A_count,C_count,G_count,A+C_count,A+G_count,C+G_count
0,20,1,9,21,29,10
1,21,1,8,22,29,9
2,18,2,10,20,28,12
3,12,0,18,12,30,18
4,18,4,8,22,26,12


#### trait 문자형 변환

In [None]:
# Cast `trait` to string in both frames so it is picked up as an object
# (categorical) column by the dtype-based selection below.
for frame in (X_train, X_test):
    frame['trait'] = frame['trait'].astype(str)

In [None]:
# Split the column names by dtype: object columns vs. everything else.
Cat_list = X_train.select_dtypes(include='object').columns.to_list()
Num_list = X_train.select_dtypes(exclude='object').columns.to_list()

In [None]:
# Sanity check: feature frames must have the same number of columns; rows must match y_train.
X_train.shape, X_test.shape, y_train.shape

((262, 71), (175, 71), (262,))

In [None]:
# Snapshot feature set 01; X_train / X_test are reassigned further down.
X_train_01 = X_train.copy()
X_test_01 = X_test.copy()

In [None]:
# Display the assembled feature-set-01 training frame.
X_train

Unnamed: 0,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,...,cm_pos_clust5,chrom_cm_pos_clust0,chrom_cm_pos_clust1,chrom_cm_pos_clust3,A_count,C_count,G_count,A+C_count,A+G_count,C+G_count
0,2,G G,A G,A A,G A,C A,A A,A A,G G,A A,...,G G A A,G G A A G G,G G A A A A,G A C A A A A A A A A A,20,1,9,21,29,10
1,2,A G,A G,C A,A A,A A,A G,A A,G A,A A,...,A G C A,A G C A A G,G A A A A A,A A A A A G A A G A G G,21,1,8,22,29,9
2,2,G G,G G,A A,G A,C C,G G,A A,G A,G A,...,G G A A,G G A A A G,G A G A A A,G A C C G G A A A A A A,18,2,10,20,28,12
3,1,A A,G G,A A,G A,A A,G G,G G,A A,G G,...,A A A A,A A A A A G,A A G G A A,G A A A G G G G G G G G,12,0,18,12,30,18
4,2,G G,G G,C C,A A,C C,A A,A A,A A,A A,...,G G C C,G G C C G G,A A A A A A,A A C C A A A A A A A G,18,4,8,22,26,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,2,A G,A G,A A,G A,C C,A G,A A,G A,A A,...,A G A A,A G A A G G,G A A A A A,G A C C A G A A G A A A,19,2,9,21,28,11
258,2,G G,A A,C A,A A,A A,A G,G A,G A,A A,...,G G C A,G G C A A G,G A A A A A,A A A A A G G A A A A G,20,1,9,21,29,10
259,1,A G,G G,A A,G A,A A,A G,G G,G A,G A,...,A G A A,A G A A A A,G A G A C A,G A A A A G G G G G G G,12,1,17,13,29,18
260,1,A A,G G,A A,G A,A A,G G,G G,A A,G A,...,A A A A,A A A A A G,A A G A C A,G A A A G G G G G A G G,14,1,15,15,29,16


In [None]:
# The 30 columns with the most distinct values.
X_train.nunique().sort_values(ascending=False).head(30)

chrom_clust0           179
chrom_cm_pos_clust3    116
pos_clust0             116
chorm_pos_clust0       116
cm_clust0              106
cm_pos_clust3          101
chorm_cm_clust3        101
cm_clust4               78
chorm_cm_clust0         39
cm_pos_clust0           39
pos_clust2              21
chorm_pos_clust2        21
chrom_cm_pos_clust0     21
chrom_clust4            20
A+C_count               19
G_count                 19
pos_clust4              18
A_count                 18
chorm_pos_clust4        18
chrom_cm_pos_clust1     18
C+G_count               18
cm_pos_clust5            8
chorm_cm_clust5          8
A+G_count                6
C_count                  6
SNP_09                   3
SNP_08                   3
SNP_03                   3
SNP_04                   3
SNP_05                   3
dtype: int64

## Feature_set_02

In [None]:
# Reload the raw tables so feature set 02 starts from unmodified data.
train = pd.read_csv('/content/drive/MyDrive/유전체공모전/data/train.csv')
test = pd.read_csv('/content/drive/MyDrive/유전체공모전/data/test.csv')
snp = pd.read_csv('/content/drive/MyDrive/유전체공모전/data/snp_info.csv')

In [None]:
def get_x_y(df):
    """Split a raw table into features and, when available, the target.

    If ``df`` has a 'class' column, returns ``(features, target)`` where
    ``features`` is ``df`` without 'id' and 'class' and ``target`` is the
    'class' column. Otherwise returns only ``df`` without 'id'.
    """
    if 'class' not in df.columns:
        return df.drop(columns=['id'])
    features = df.drop(columns=['id', 'class'])
    target = df['class']
    return features, target

In [None]:
# Features/target for the training table, features only for the test table.
train_x, train_y = get_x_y(train)
test_x = get_x_y(test)

In [None]:
# Remove the father / mother / gender columns from both frames.
unused_columns = ['father','mother','gender']
train_x = train_x.drop(columns=unused_columns)
test_x = test_x.drop(columns=unused_columns)

In [None]:
# New column name -> SNP columns whose genotype strings are joined with '-'.
# A single-member group is a plain copy of that SNP column.
# Dict order is the order in which the columns are appended to train_x.
snp_groups = {
    '2_BTA': ['SNP_01'],
    'chrom_6': ['SNP_02', 'SNP_03', 'SNP_04', 'SNP_05', 'SNP_06', 'SNP_07', 'SNP_08', 'SNP_09'],
    '6_ARS_Parent': ['SNP_02'],
    '6_ARS_BFGL': ['SNP_03', 'SNP_04', 'SNP_09'],
    '6_BOVINE': ['SNP_05', 'SNP_06', 'SNP_08'],
    '6_HAPMAP': ['SNP_07'],
    '7_BTB': ['SNP_10'],
    '8_ARS': ['SNP_11'],
    'chrom_9': ['SNP_12', 'SNP_13', 'SNP_14'],
    '9_HAPMAP': ['SNP_12', 'SNP_14'],
    '9_BTB': ['SNP_13'],
    '10_BOVINE': ['SNP_15'],
    'SNP_total': ['SNP_01', 'SNP_02', 'SNP_03', 'SNP_04', 'SNP_05', 'SNP_06', 'SNP_07', 'SNP_08',
                  'SNP_09', 'SNP_10', 'SNP_11', 'SNP_12', 'SNP_13', 'SNP_14', 'SNP_15'],
}
for group_name, members in snp_groups.items():
    joined = train_x[members[0]]
    for snp_col in members[1:]:
        joined = joined + '-' + train_x[snp_col]
    train_x[group_name] = joined

In [None]:
# New column name -> SNP columns whose genotype strings are joined with '-'.
# A single-member group is a plain copy of that SNP column.
# Dict order is the order in which the columns are appended to test_x.
snp_groups = {
    '2_BTA': ['SNP_01'],
    'chrom_6': ['SNP_02', 'SNP_03', 'SNP_04', 'SNP_05', 'SNP_06', 'SNP_07', 'SNP_08', 'SNP_09'],
    '6_ARS_Parent': ['SNP_02'],
    '6_ARS_BFGL': ['SNP_03', 'SNP_04', 'SNP_09'],
    '6_BOVINE': ['SNP_05', 'SNP_06', 'SNP_08'],
    '6_HAPMAP': ['SNP_07'],
    '7_BTB': ['SNP_10'],
    '8_ARS': ['SNP_11'],
    'chrom_9': ['SNP_12', 'SNP_13', 'SNP_14'],
    '9_HAPMAP': ['SNP_12', 'SNP_14'],
    '9_BTB': ['SNP_13'],
    '10_BOVINE': ['SNP_15'],
    'SNP_total': ['SNP_01', 'SNP_02', 'SNP_03', 'SNP_04', 'SNP_05', 'SNP_06', 'SNP_07', 'SNP_08',
                  'SNP_09', 'SNP_10', 'SNP_11', 'SNP_12', 'SNP_13', 'SNP_14', 'SNP_15'],
}
for group_name, members in snp_groups.items():
    joined = test_x[members[0]]
    for snp_col in members[1:]:
        joined = joined + '-' + test_x[snp_col]
    test_x[group_name] = joined

In [None]:
# Remove the spaces inside every derived genotype string (e.g. "A G-C A" -> "AG-CA").
derived_columns = [
    '2_BTA', 'chrom_6', '6_ARS_Parent', '6_ARS_BFGL', '6_BOVINE', '6_HAPMAP', '7_BTB',
    '8_ARS', 'chrom_9', '9_HAPMAP', '9_BTB', '10_BOVINE', 'SNP_total',
]
for derived_col in derived_columns:
    train_x[derived_col] = train_x[derived_col].map(lambda value: value.replace(" ",""))

In [None]:
# Remove the spaces inside every derived genotype string of the test frame.
derived_columns = [
    '2_BTA', 'chrom_6', '6_ARS_Parent', '6_ARS_BFGL', '6_BOVINE', '6_HAPMAP', '7_BTB',
    '8_ARS', 'chrom_9', '9_HAPMAP', '9_BTB', '10_BOVINE', 'SNP_total',
]
for derived_col in derived_columns:
    test_x[derived_col] = test_x[derived_col].map(lambda value: value.replace(" ",""))

In [None]:
# Replace the column names with their integer positions 0..n-1; the following
# cells address the columns by these integer labels.
train_x.columns = list(range(len(train_x.columns)))
test_x.columns = list(range(len(test_x.columns)))

In [None]:
# Build one space-free string per row from columns 1..15 (the genotype columns
# under their integer labels).
data = []
for i in range(len(train_x)):
    genotype = ''.join(train_x[col][i] for col in range(1, 16))
    data.append(genotype.replace(" ",""))

train_x['concat'] = data

# Over the first 30 characters of each string: numA counts the 'A' characters and
# numGC counts every character that is not 'A'.
numA = [sum(seq[j] == 'A' for j in range(30)) for seq in data]
numGC = [30 - a_count for a_count in numA]
train_x['numGC'] = numGC
train_x['numA'] = numA

# sub = numGC - numA, H = 3*numGC + 2*numA.
sub = [gc_count - a_count for gc_count, a_count in zip(numGC, numA)]
H = [gc_count*3 + a_count*2 for gc_count, a_count in zip(numGC, numA)]

train_x['sub'] = sub
train_x['H'] = H

In [None]:
# Build one space-free string per row from columns 1..15 (the genotype columns
# under their integer labels) of the test frame.
data = []
for i in range(len(test_x)):
    genotype = ''.join(test_x[col][i] for col in range(1, 16))
    data.append(genotype.replace(" ",""))

test_x['concat'] = data

# Over the first 30 characters of each string: numA counts the 'A' characters and
# numGC counts every character that is not 'A'.
numA = [sum(seq[j] == 'A' for j in range(30)) for seq in data]
numGC = [30 - a_count for a_count in numA]
test_x['numGC'] = numGC
test_x['numA'] = numA

# sub = numGC - numA, H = 3*numGC + 2*numA.
sub = [gc_count - a_count for gc_count, a_count in zip(numGC, numA)]
H = [gc_count*3 + a_count*2 for gc_count, a_count in zip(numGC, numA)]

test_x['sub'] = sub
test_x['H'] = H

In [None]:
# Cast column 0 (presumably `trait` after the positional renaming — confirm) to
# object dtype so the dtype-based selection later treats it as categorical.
train_x[0] = train_x[0].astype('object')
test_x[0] = test_x[0].astype('object')

In [None]:
# Snapshot feature set 02; train_x / test_x are rebuilt in the next section.
X_train_02 = train_x.copy()
X_test_02 = test_x.copy()

## Feature_set_03

In [None]:
# Reload the raw tables so feature set 03 starts from unmodified data.
train = pd.read_csv('/content/drive/MyDrive/유전체공모전/data/train.csv')
test = pd.read_csv('/content/drive/MyDrive/유전체공모전/data/test.csv')
snp = pd.read_csv('/content/drive/MyDrive/유전체공모전/data/snp_info.csv')

In [None]:
def get_x_y(df):
    """Split a raw table into features and, when available, the target.

    If ``df`` has a 'class' column, returns ``(features, target)`` where
    ``features`` is ``df`` without 'id' and 'class' and ``target`` is the
    'class' column. Otherwise returns only ``df`` without 'id'.
    """
    if 'class' not in df.columns:
        return df.drop(columns=['id'])
    features = df.drop(columns=['id', 'class'])
    target = df['class']
    return features, target

In [None]:
# Features/target for the training table, features only for the test table.
train_x, train_y = get_x_y(train)
test_x = get_x_y(test)

In [None]:
# Remove the father / mother / gender columns from both frames.
unused_columns = ['father','mother','gender']
train_x = train_x.drop(columns=unused_columns)
test_x = test_x.drop(columns=unused_columns)

In [None]:
#SNP combine(chrom)
# New column name -> SNP columns joined with '-'; a single-member group is a
# plain copy of that SNP column. Dict order is the column insertion order.
chrom_groups = {
    'chrom_2': ['SNP_01'],
    'chrom_6': ['SNP_05', 'SNP_06', 'SNP_07', 'SNP_08', 'SNP_09'],
    'chrom_7': ['SNP_10'],
    'chrom_8': ['SNP_11'],
    'chrom_9': ['SNP_12', 'SNP_13', 'SNP_14'],
    'chrom_10': ['SNP_15'],
    'SNP_total': ['SNP_01', 'SNP_02', 'SNP_03', 'SNP_04', 'SNP_05', 'SNP_06', 'SNP_07', 'SNP_08',
                  'SNP_09', 'SNP_10', 'SNP_11', 'SNP_12', 'SNP_13', 'SNP_14', 'SNP_15'],
}
for frame in (train_x, test_x):
    for group_name, members in chrom_groups.items():
        joined = frame[members[0]]
        for snp_col in members[1:]:
            joined = joined + '-' + frame[snp_col]
        frame[group_name] = joined

In [None]:
#SNP combine(cm)
# cm_60 joins the genotypes of six SNP columns with '-' in both frames.
cm_60_members = ['SNP_01', 'SNP_03', 'SNP_10', 'SNP_12', 'SNP_13', 'SNP_14']
for frame in (train_x, test_x):
    joined = frame[cm_60_members[0]]
    for snp_col in cm_60_members[1:]:
        joined = joined + '-' + frame[snp_col]
    frame['cm_60'] = joined

In [None]:
#SNP combine(pos)
# New column name -> SNP columns joined with '-'. Dict order is the column insertion order.
pos_groups = {
    'pos_4': ['SNP_01', 'SNP_03', 'SNP_10'],
    'pos_5': ['SNP_04', 'SNP_12', 'SNP_13'],
    'pos_6': ['SNP_05', 'SNP_06', 'SNP_07', 'SNP_08'],
    'pos_7': ['SNP_09', 'SNP_14'],
}
for frame in (train_x, test_x):
    for group_name, members in pos_groups.items():
        joined = frame[members[0]]
        for snp_col in members[1:]:
            joined = joined + '-' + frame[snp_col]
        frame[group_name] = joined

In [None]:
# Remove the spaces inside the multi-SNP combination strings of both frames.
# The single-SNP copies (chrom_2, chrom_7, chrom_8, chrom_10) are left as they are.
combined_columns = ['chrom_6', 'chrom_9', 'SNP_total', 'cm_60', 'pos_4', 'pos_5', 'pos_6', 'pos_7']
for frame in (train_x, test_x):
    for combined_col in combined_columns:
        frame[combined_col] = frame[combined_col].map(lambda value: value.replace(" ",""))

In [None]:
# Drop the four SNP columns that were copied verbatim into chrom_2 / chrom_7 /
# chrom_8 / chrom_10 above.
copied_snps = ['SNP_01','SNP_10','SNP_11','SNP_15']
train_x = train_x.drop(columns=copied_snps)
test_x = test_x.drop(columns=copied_snps)

In [None]:
# Cast `trait` to string in both frames so it becomes an object (categorical) column.
for frame in (train_x, test_x):
    frame['trait'] = frame['trait'].astype(str)

In [None]:
# Snapshot feature set 03.
X_train_03 = train_x.copy()
X_test_03 = test_x.copy()

In [None]:
# Place the three feature sets side by side (column-wise); rows are aligned on the index.
X_train = pd.concat([X_train_01,X_train_02,X_train_03],axis=1)
X_test = pd.concat([X_test_01,X_test_02,X_test_03],axis=1)

In [None]:
# Stack the training rows on top of the test rows; the next cell removes duplicated
# columns from this combined frame and splits it back.
df_all = pd.concat([X_train,X_test])

In [None]:
# Number of training rows sitting at the top of df_all. Taken from the feature-set-01
# snapshot rather than hard-coding 262, so the split follows the data.
n_train_rows = len(X_train_01)

# Give every column a unique integer label (the three feature sets share names),
# drop columns whose values duplicate an earlier column across all rows, then
# renumber the surviving columns 0..n-1.
df_all.columns = list(pd.RangeIndex(len(df_all.columns)))
df_all = df_all.loc[:,~df_all.T.duplicated()]
df_all.columns = list(pd.RangeIndex(len(df_all.columns)))

# .copy() makes the two frames independent of df_all; later cells assign into them,
# which raised SettingWithCopyWarning when they were bare iloc slices.
X_train = df_all.iloc[:n_train_rows,:].copy()
X_test = df_all.iloc[n_train_rows:,:].copy()

In [None]:
# Recompute the object / non-object column lists for the merged frame.
Cat_list = X_train.select_dtypes(include='object').columns.to_list()
Num_list = X_train.select_dtypes(exclude='object').columns.to_list()

In [None]:
# Sanity check of the shapes after merging and de-duplicating the feature sets.
X_train.shape, X_test.shape, y_train.shape

((262, 85), (175, 85), (262,))

### Scaling

In [None]:
# Standardise the non-object columns: the scaler is fitted on the training rows
# only and the same fitted transform is then applied to the test rows.
stscaler = StandardScaler()
X_train[Num_list] = stscaler.fit_transform(X_train[Num_list])
X_test[Num_list] = stscaler.transform(X_test[Num_list])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


In [None]:
# Preview the merged training frame after scaling.
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,75,76,77,78,79,80,81,82,83,84
0,2,G G,A G,A A,G A,C A,A A,A A,G G,A A,...,GG-AG-AA-GA-CA-AA-AA-GG-AA-GG-AG-AA-AA-AA-AA,GGAGAAGACAAAAAGGAAGGAGAAAAAAAA,-0.986104,-0.986104,CA-AA-AA-GG-AA,GG-AA-GG-AA-AA-AA,GG-AA-GG,GA-AA-AA,CA-AA-AA-GG,AA-AA
1,2,A G,A G,C A,A A,A A,A G,A A,G A,A A,...,AG-AG-CA-AA-AA-AG-AA-GA-AA-AG-AA-GA-GG-AA-AA,AGAGCAAAAAAGAAGAAAAGAAGAGGAAAA,-1.270327,-1.270327,AA-AG-AA-GA-AA,AG-CA-AG-GA-GG-AA,AG-CA-AG,AA-GA-GG,AA-AG-AA-GA,AA-AA
2,2,G G,G G,A A,G A,C C,G G,A A,G A,G A,...,GG-GG-AA-GA-CC-GG-AA-GA-GA-AG-AA-AA-AA-AA-AA,GGGGAAGACCGGAAGAGAAGAAAAAAAAAA,-0.417657,-0.417657,CC-GG-AA-GA-GA,GG-AA-AG-AA-AA-AA,GG-AA-AG,GA-AA-AA,CC-GG-AA-GA,GA-AA
3,1,A A,G G,A A,G A,A A,G G,G G,A A,G G,...,AA-GG-AA-GA-AA-GG-GG-AA-GG-AG-GG-GG-GG-AA-GG,AAGGAAGAAAGGGGAAGGAGGGGGGGAAGG,1.287685,1.287685,AA-GG-GG-AA-GG,AA-AA-AG-GG-GG-AA,AA-AA-AG,GA-GG-GG,AA-GG-GG-AA,GG-AA
4,2,G G,G G,C C,A A,C C,A A,A A,A A,A A,...,GG-GG-CC-AA-CC-AA-AA-AA-AA-GG-AA-AA-AG-AA-GA,GGGGCCAACCAAAAAAAAGGAAAAAGAAGA,-0.417657,-0.417657,CC-AA-AA-AA-AA,GG-CC-GG-AA-AG-AA,GG-CC-GG,AA-AA-AG,CC-AA-AA-AA,AA-AA


### Encoding

In [None]:
def catboost_encode_multiclass(X,X_t,y):
    """CatBoost-encode the object columns once per target class.

    ``y`` is cast to string and one-hot encoded. For every resulting indicator
    column a fresh ``ce.CatBoostEncoder`` is fitted on the object columns of ``X``
    against that indicator and used to transform the object columns of both
    ``X`` and ``X_t``. The encoded columns are named ``"<column>_<indicator>"``.

    Returns ``(X_encoded, X_t_encoded)``: the non-object columns of each input
    followed by the encoded blocks, one block per class, in class order.

    NOTE(review): the training frame is transformed without passing ``y``;
    confirm this is the intended CatBoostEncoder usage for training data.
    """
    y = y.astype(str)
    y_onehot = ce.OneHotEncoder().fit(y).transform(y)

    cat_train = X.select_dtypes('object')
    cat_test = X_t.select_dtypes('object')
    train_parts = [X.select_dtypes(exclude='object')]
    test_parts = [X_t.select_dtypes(exclude='object')]

    for class_ in y_onehot.columns:
        encoder = ce.CatBoostEncoder()
        encoder.fit(cat_train, y_onehot[class_])
        encoded_train = encoder.transform(cat_train)
        encoded_test = encoder.transform(cat_test)
        # Suffix with the class indicator so the per-class blocks do not collide.
        encoded_train.columns = [str(x)+'_'+str(class_) for x in encoded_train.columns]
        encoded_test.columns = [str(x)+'_'+str(class_) for x in encoded_test.columns]
        train_parts.append(encoded_train)
        test_parts.append(encoded_test)

    return pd.concat(train_parts, axis=1), pd.concat(test_parts, axis=1)

In [None]:
# Replace the object columns with per-class CatBoost encodings (encoders fitted on the training frame).
X_train, X_test = catboost_encode_multiclass(X_train,X_test,y_train)

In [None]:
# Shapes after encoding.
X_train.shape, X_test.shape

((262, 239), (175, 239))

In [None]:
# Give the columns that still carry integer labels string names.
# NOTE(review): the integer labels are hard-coded positions from the
# de-duplication step; re-check them if the upstream feature set changes.
num_feature_names = {
    56:'Num_feature_1',
    57:'Num_feature_2',
    58:'Num_feature_3',
    59:'Num_feature_4',
    60:'Num_feature_5',
    61:'Num_feature_6',
    77:'Num_feature_7',
    78:'Num_feature_8',
}
X_train = X_train.rename(columns=num_feature_names)
X_test = X_test.rename(columns=num_feature_names)

### Clustering

In [None]:
# Add KMeans cluster ids for k = 3..6 as extra features (clust_3 .. clust_6).
# Each model is fitted on the first 239 columns of the training frame only, so the
# clust_* columns appended by earlier iterations are never used as inputs.
# The four repeated predict/assign pairs that followed the first pair were exact
# duplicates (same fitted model, same inputs) and have been removed.
for i in range(3,7):
  clust = KMeans(n_clusters=i, random_state=CFG.SEED)
  train_features = X_train.iloc[:,:239]
  test_features = X_test.iloc[:,:239]
  clust.fit(train_features)
  X_train[f'clust_{i}'] = clust.predict(train_features)
  X_test[f'clust_{i}'] = clust.predict(test_features)


In [None]:
# Shapes after adding the four cluster-id columns.
X_train.shape, X_test.shape, y_train.shape

((262, 243), (175, 243), (262,))

### OverSampling

In [None]:
# Oversample the training set with SMOTE (seeded); the test set is left untouched.
X_train, y_train = SMOTE(random_state=CFG.SEED).fit_resample(X_train, y_train)

In [None]:
# Shapes after oversampling: only the training rows (and y_train) grow.
X_train.shape, X_test.shape, y_train.shape

((342, 243), (175, 243), (342,))

## Submission

In [None]:
# X_train.to_csv('/content/drive/MyDrive/유전체공모전/data/X_train_fine.csv', index=False)
# X_test.to_csv('/content/drive/MyDrive/유전체공모전/data/X_test_fine.csv', index=False)
# y_train.to_csv('/content/drive/MyDrive/유전체공모전/data/y_train_fine.csv', index=False)