In [2]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# plt.rc('font', family='malgun gothic')
# plt.rc('axes', unicode_minus=False)
import seaborn as sns
sns.set_theme(style='whitegrid', font_scale=1.5)
sns.set_palette('Set2', n_colors=10)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import PowerTransformer, StandardScaler, MinMaxScaler, KBinsDiscretizer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder, BinaryEncoder
from sklearn.feature_selection import SelectFromModel, SelectPercentile, SelectKBest, RFE
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_validate
import shap
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, LeaveOneOut, ShuffleSplit
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix,f1_score, ConfusionMatrixDisplay, average_precision_score, PrecisionRecallDisplay, precision_recall_curve, roc_curve, auc, RocCurveDisplay
from sklearn import set_config
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.neural_network import MLPClassifier, MLPRegressor
import lightgbm as lgb
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor, BaggingRegressor
from sklearn.ensemble import VotingClassifier, StackingClassifier
from vecstack import stacking
import shap
from sklearn.decomposition import PCA, TruncatedSVD
import optuna
from optuna.distributions import CategoricalDistribution, IntDistribution, FloatDistribution
from optuna.integration import OptunaSearchCV, ShapleyImportanceEvaluator
from optuna.integration.lightgbm import LightGBMTunerCV
import itertools
import re
import random
import tensorflow as tf
from tensorflow import keras


In [3]:
# Random seed constant for the notebook (referenced as random_state in the
# commented-out train/validation split).
SEED = 0

In [4]:
# Training data: parse train.csv once, then split it into the feature frame
# (everything except the row id and the target) and the target column.
train_raw = pd.read_csv('train.csv', encoding='UTF-8')
X_train = train_raw.drop(columns=['id', 'class'])
y_train = train_raw['class']

# Test data: keep the ids aside for the submission file and drop them from
# the feature frame.
test_raw = pd.read_csv('test.csv', encoding='UTF-8')
test_id = test_raw.id
X_test = test_raw.drop(columns='id')

# Per-SNP metadata table (displayed further down in the notebook).
snp_info = pd.read_csv('snp_info.csv', encoding='UTF-8')

In [5]:
#X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.3, random_state=SEED)

In [6]:
# cls = {'A': 0,
#        'B': 1,
#        'C': 2}
# y_train = y_train.apply(lambda x: cls[x])

In [7]:
# Show how many training rows fall into each target class.
y_train.value_counts()

B    114
C     79
A     69
Name: class, dtype: int64

In [8]:
snp_info
# name  : SNP name
# chrom : chromosome information (chromosome number)
# cm    : genetic distance
# pos   : position of each marker on the genome

Unnamed: 0,SNP_id,name,chrom,cm,pos
0,SNP_01,BTA-19852-no-rs,2,67.0546,42986890
1,SNP_02,ARS-USMARC-Parent-DQ647190-rs29013632,6,31.1567,13897068
2,SNP_03,ARS-BFGL-NGS-117009,6,68.2892,44649549
3,SNP_04,ARS-BFGL-NGS-60567,6,77.8749,53826064
4,SNP_05,BovineHD0600017032,6,80.5015,61779512
5,SNP_06,BovineHD0600017424,6,80.5954,63048481
6,SNP_07,Hapmap49442-BTA-111073,6,80.78,64037334
7,SNP_08,BovineHD0600018638,6,82.6856,67510588
8,SNP_09,ARS-BFGL-NGS-37727,6,86.874,73092782
9,SNP_10,BTB-01558306,7,62.0692,40827112


# Feature generation

In [9]:
# Remove the father / mother / gender columns from both feature frames.
dropped_cols = ['father', 'mother', 'gender']
X_train, X_test = [frame.drop(columns=dropped_cols) for frame in (X_train, X_test)]

In [10]:
def feature_generation(x):
    """Add combined-SNP columns to ``x`` in place.

    Each new column is built by applying ``+`` left to right to a fixed group
    of the ``SNP_01`` ... ``SNP_15`` columns (for the genotype strings shown in
    the notebook output this is plain string concatenation).

    Parameters
    ----------
    x : pandas.DataFrame
        Frame holding the columns ``SNP_01`` through ``SNP_15``. It is
        modified in place; thirteen columns are appended (or overwritten if
        they already exist).

    Returns
    -------
    None
        The function only mutates ``x`` and prints a completion message.
    """
    # New column name -> SNP numbers combined into it. The mapping's insertion
    # order is the order in which the columns are written to the frame.
    snp_groups = {
        'BTA': (1, 7, 12, 14),
        'Hapmap-BTA': (7, 12, 14),
        'Hapmap-BTA_chrom9': (12, 14),
        'ARS': (2, 3, 4, 9, 11),
        'ARS-BFGL-NGS': (3, 4, 9, 11),
        'ARS-BFGL-NGS_chrom6': (3, 4, 9),
        'BovineHD': (5, 6, 8, 15),
        'BovineHD06': (5, 6, 8),
        'BTB': (10, 13),
        'chrom6': (2, 3, 4, 5, 6, 7, 8, 9),
        'chrom9': (12, 13, 14),
        'total': tuple(range(1, 16)),
        'one_chrom': (1, 10, 11, 15),
    }

    for new_col, snp_numbers in snp_groups.items():
        first_no, *other_nos = snp_numbers
        combined = x[f'SNP_{first_no:02d}']
        for snp_no in other_nos:
            # Left-to-right accumulation, same order as a chained `a + b + c`.
            combined = combined + x[f'SNP_{snp_no:02d}']
        x[new_col] = combined

    print('feature_generation 완료')

In [11]:
# Add the combined-SNP columns to the training frame first, then the test frame.
for frame in (X_train, X_test):
    feature_generation(frame)

feature_generation 완료
feature_generation 완료


In [12]:
# Display the training feature frame, including the generated columns.
X_train

Unnamed: 0,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,...,ARS,ARS-BFGL-NGS,ARS-BFGL-NGS_chrom6,BovineHD,BovineHD06,BTB,chrom6,chrom9,total,one_chrom
0,2,G G,A G,A A,G A,C A,A A,A A,G G,A A,...,A GA AG AA AA G,A AG AA AA G,A AG AA A,C AA AG GA A,C AA AG G,G GA A,A GA AG AC AA AA AG GA A,A AA AA A,G GA GA AG AC AA AA AG GA AG GA GA AA AA AA A,G GG GA GA A
1,2,A G,A G,C A,A A,A A,A G,A A,G A,A A,...,A GC AA AA AA A,C AA AA AA A,C AA AA A,A AA GG AA A,A AA GG A,A GG G,A GC AA AA AA GA AG AA A,G AG GA A,A GA GC AA AA AA GA AG AA AA GA AG AG GA AA A,A GA GA AA A
2,2,G G,G G,A A,G A,C C,G G,A A,G A,G A,...,G GA AG AG AA A,A AG AG AA A,A AG AG A,C CG GG AA A,C CG GG A,A GA A,G GA AG AC CG GA AG AG A,A AA AA A,G GG GA AG AC CG GA AG AG AA GA AA AA AA AA A,G GA GA AA A
3,1,A A,G G,A A,G A,A A,G G,G G,A A,G G,...,G GA AG AG GG G,A AG AG GG G,A AG AG G,A AG GA AG G,A AG GA A,A GG G,G GA AG AA AG GG GA AG G,G GG GA A,A AG GA AG AA AG GG GA AG GA GG GG GG GA AG G,A AA GG GG G
4,2,G G,G G,C C,A A,C C,A A,A A,A A,A A,...,G GC CA AA AA A,C CA AA AA A,C CA AA A,C CA AA AG A,C CA AA A,G GA G,G GC CA AC CA AA AA AA A,A AA GA A,G GG GC CA AC CA AA AA AA AG GA AA AA GA AG A,G GG GA AG A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,2,A G,A G,A A,G A,C C,A G,A A,G A,A A,...,A GA AG AA AA G,A AG AA AA G,A AG AA A,C CA GG AA A,C CA GG A,G GA A,A GA AG AC CA GA AG AA A,G AA AA A,A GA GA AG AC CA GA AG AA AG GA GG AA AA AA A,A GG GA GA A
258,2,G G,A A,C A,A A,A A,A G,G A,G A,A A,...,A AC AA AA AA G,C AA AA AA G,C AA AA A,A AA GG AG A,A AA GG A,A GA G,A AC AA AA AA GG AG AA A,A AA GA A,G GA AC AA AA AA GG AG AA AA GA GA AA GA AG A,G GA GA GG A
259,1,A G,G G,A A,G A,A A,A G,G G,G A,G A,...,G GA AG AG AG G,A AG AG AG G,A AG AG A,A AA GG AG G,A AA GG A,A AG G,G GA AG AA AA GG GG AG A,G GG GC A,A GG GA AG AA AA GG GG AG AA AG GG GG GC AG G,A GA AG GG G
260,1,A A,G G,A A,G A,A A,G G,G G,A A,G A,...,G GA AG AG AA G,A AG AG AA G,A AG AG A,A AG GA AG G,A AG GA A,A GG G,G GA AG AA AG GG GA AG A,G AG GC A,A AG GA AG AA AG GG GA AG AA GA GG AG GC AG G,A AA GA GG G


In [13]:
# Every column of the feature frame is passed to CatBoost as a categorical
# feature (this list is used as `cat_features` in the cross-validation cell).
cat = X_train.columns.to_list()

In [14]:
scores = []           # macro-F1 of each fold, measured on that fold's validation split
fold_test_preds = []  # test-set predictions of each fold's model, in fold order

# Shuffled 5-fold splitter; seeded from the notebook-level SEED constant.
kfold = KFold(n_splits=5, shuffle=True, random_state=SEED)
for train_index, valid_index in kfold.split(X_train, y_train):
    train_x, valid_x = X_train.iloc[train_index], X_train.iloc[valid_index]
    train_y, valid_y = y_train.iloc[train_index], y_train.iloc[valid_index]

    # CatBoost with all columns categorical: low-cardinality features are
    # one-hot encoded (one_hot_max_size=4), the remaining ones are
    # target-encoded by CatBoost.
    model = CatBoostClassifier(cat_features=cat, one_hot_max_size=4, verbose=False, random_state=SEED)
    # NOTE(review): the validation fold is used for early stopping and is also
    # the fold scored below, so the CV scores may be optimistic.
    model.fit(
        train_x, train_y,
        eval_set=[(valid_x, valid_y)],
        early_stopping_rounds=100,
    )

    # Compute and store the fold's CV score.
    y_pred = model.predict(valid_x)
    f1 = f1_score(y_true=valid_y, y_pred=y_pred, average='macro')
    scores.append(f1)

    # Store this fold model's predictions for the test set.
    fold_test_preds.append(model.predict(X_test))

# Expose the five per-fold test predictions under the names the following
# cells read. NOTE(review): despite the "oof" prefix these are predictions on
# X_test, not out-of-fold predictions on the training data.
oof_pred1, oof_pred2, oof_pred3, oof_pred4, oof_pred5 = fold_test_preds

In [15]:
# Flatten each fold's stored prediction into a plain list by chaining its
# elements together (assumes every prediction is a sequence of one-label
# rows - TODO confirm against the model's predict output).
oof_pred1, oof_pred2, oof_pred3, oof_pred4, oof_pred5 = [
    list(itertools.chain.from_iterable(fold_pred))
    for fold_pred in (oof_pred1, oof_pred2, oof_pred3, oof_pred4, oof_pred5)
]

In [16]:
from collections import Counter

In [17]:
def modefinder(lis):
    """Return the most frequent element of ``lis``.

    Parameters
    ----------
    lis : list or tuple
        Hashable items to count.

    Returns
    -------
    object
        The item with the highest count, as reported first by
        ``Counter.most_common``.

    Raises
    ------
    IndexError
        If ``lis`` is empty, since there is no most common item to return.
    """
    top_entry = Counter(lis).most_common(1)[0]  # (item, count) pair
    return top_entry[0]


In [18]:
# Majority vote across the five fold models: for each test row, keep the label
# that modefinder reports as most frequent among the five predictions.
most_common = [
    modefinder(list(votes))
    for votes in zip(oof_pred1, oof_pred2, oof_pred3, oof_pred4, oof_pred5)
]

In [19]:
# Summarise the per-fold macro-F1 scores: raw values, then mean and standard
# deviation rounded to two decimals.
scores = np.array(scores)
cv_mean, cv_std = scores.mean(), scores.std()
print("CV scores: ", scores)
print(f"CV mean = {cv_mean:.2f}", f"with std = {cv_std:.2f}")

CV scores:  [1.         0.91691996 0.91395859 0.96328502 0.98306878]
CV mean = 0.96 with std = 0.03


In [20]:
# Create the submission file: one row per test id with its majority-vote class.
filename = 'final_sub.csv'
submission = pd.DataFrame({'id': test_id, 'class': most_common})
submission.to_csv(filename, index=False)

In [None]:
# Performance: 0.98142