# Toxic Comments Classification

In [16]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [17]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

## 1. 데이터 로드

In [18]:
df = pd.read_csv('./data/train.csv')
df.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
id               159571 non-null object
comment_text     159571 non-null object
toxic            159571 non-null int64
severe_toxic     159571 non-null int64
obscene          159571 non-null int64
threat           159571 non-null int64
insult           159571 non-null int64
identity_hate    159571 non-null int64
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [20]:
df.shape

(159571, 8)

## 2. 필요한 set과 객체 선언

In [21]:
# 축약어 모음
APPOS = {
"aren't" : "are not",
"can't" : "cannot",
"couldn't" : "could not",
"didn't" : "did not",
"doesn't" : "does not",
"don't" : "do not",
"hadn't" : "had not",
"hasn't" : "has not",
"haven't" : "have not",
"he'd" : "he would",
"he'll" : "he will",
"he's" : "he is",
"i'd" : "i would",
"i'd" : "i had",
"i'll" : "i will",
"i'm" : "i am",
"isn't" : "is not",
"it's" : "it is",
"it'll":"it will",
"i've" : "i have",
"let's" : "let us",
"mightn't" : "might not",
"mustn't" : "must not",
"shan't" : "shall not",
"she'd" : "she would",
"she'll" : "she will",
"she's" : "she is",
"shouldn't" : "should not",
"that's" : "that is",
"there's" : "there is",
"they'd" : "they would",
"they'll" : "they will",
"they're" : "they are",
"they've" : "they have",
"we'd" : "we would",
"we're" : "we are",
"weren't" : "were not",
"we've" : "we have",
"what'll" : "what will",
"what're" : "what are",
"what's" : "what is",
"what've" : "what have",
"where's" : "where is",
"who'd" : "who would",
"who'll" : "who will",
"who're" : "who are",
"who's" : "who is",
"who've" : "who have",
"won't" : "will not",
"wouldn't" : "would not",
"you'd" : "you would",
"you'll" : "you will",
"you're" : "you are",
"you've" : "you have",
"'re": " are",
"wasn't": "was not",
"we'll":" will",
"didn't": "did not"
}
# 불용어 모음
STOPWORDS = set(stopwords.words("english"))

lemmatizer = WordNetLemmatizer()
tokenizer = TweetTokenizer()
analyzer = SentimentIntensityAnalyzer()

## 3. 함수 정의

In [22]:
# 여러 분류의 toxic feature를 istoxic으로 변환하는 함수
def cat_istoxic(count):
    if count < 1:
        return 0
    else:
        return 1    
    
def get_istoxic(df):
    df['sum_of_toxic'] = df['toxic'] + df['severe_toxic'] + df['obscene'] + df['threat'] + df['insult'] + df['identity_hate']
    df['is_toxic'] = df['sum_of_toxic'].apply(lambda x: cat_istoxic(x))
    df = df.drop(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'sum_of_toxic'], axis=1)

    return df

In [23]:
# 단어, 문장 등을 count하여 feature로 넣는 함수 
def get_features_count(df):
    df['count_of_sent'] = df['comment_text'].apply(lambda x: len(re.findall('\n', str(x)))+1)
    df['count_of_word'] = df['comment_text'].apply(lambda x: len(str(x).split()))
    df['count_of_unique_word'] = df['comment_text'].apply(lambda x: len(set(str(x).split())))
    df['count_of_punctuations'] = df['comment_text'].apply(lambda x: len([s for s in str(x) if s in string.punctuation]))
    df['count_of_upper_words'] = df['comment_text'].apply(lambda x: len([s for s in str(x).split() if s.isupper()]))
    df['count_of_stopwords'] = df['comment_text'].apply(lambda x: len([s for s in str(x).lower().split() if s in STOPWORDS]))

    df['unique_word_percent'] = df['count_of_unique_word'] * 100 / df['count_of_word']
    df['punct_percent'] = df['count_of_punctuations'] * 100 / df['count_of_word']

    return df

In [24]:
# 쓸모없는 정보(개행, IP주소, USERNAME)을 제거하는 함수
def clean_useless(comment):
    comment = comment.lower()
    
    # 개행 제거
    comment = re.sub('\\n', " ", comment)
    # IP주소 제거
    comment = re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}","",comment)
    # USERNAME 제거
    comment = re.sub("\[\[.*\]","",comment)
    
    # 토큰화
    words = tokenizer.tokenize(comment)
    
    # 줄임말 풀이
    words = [APPOS[word] if word in APPOS else word for word in words]
    # 줄임말 다시 분리
    sent = " ".join(words)
    words = tokenizer.tokenize(sent)
    # stemming
    words = [lemmatizer.lemmatize(word, "v") for word in words]
    # 불용어 제거
    words = [w for w in words if not w in STOPWORDS]
    
    
    clean_sent = " ".join(words)
    
    return(clean_sent)

In [25]:
# 위의 함수를 적용하여 comment를 clean하게 만드는 함수
def get_clean_comment(df):
    df['comment_text'] = df['comment_text'].apply(lambda x: clean_useless(x))
    
    return df

In [26]:
# 감정분석을 하는 함수
def vader_polarity(sentence, threshold=0.1):
    scores = analyzer.polarity_scores(sentence)
    
    # compound 값에 기반하여 threshold 입력값보다 크면 1, 그렇지 않으면 0을 반환 
    # compound 가 0.1 보다 크면 긍정 그렇지 않으면 부정
    agg_score = scores['compound']
    final_sentiment = 1 if agg_score >= threshold else 0
    
    return final_sentiment

In [27]:
# 감정분석을 하여 sent_scores라는 피처를 추가하는 함수 
def get_sent_scores(df):
    df['sent_scores'] = df['comment_text'].apply(lambda x: vader_polarity(x))
    
    return df

In [28]:
# 전처리 종합 함수
def preprocessing(df):
    get_istoxic(df)
    get_features_count(df)
    get_clean_comment(df)
    get_sent_scores(df)
    
    df = df.drop(['id', 'comment_text'], axis=1)
    
    return df

## 4. 데이터 전처리

In [29]:
df = preprocessing(df)
df

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,sum_of_toxic,is_toxic,count_of_sent,count_of_word,count_of_unique_word,count_of_punctuations,count_of_upper_words,count_of_stopwords,unique_word_percent,punct_percent,sent_scores
0,0,0,0,0,0,0,0,0,2,43,41,10,2,18,95.348837,23.255814,1
1,0,0,0,0,0,0,0,0,1,17,17,12,1,2,100.000000,70.588235,1
2,0,0,0,0,0,0,0,0,1,42,39,6,0,20,92.857143,14.285714,0
3,0,0,0,0,0,0,0,0,5,113,82,21,5,56,72.566372,18.584071,1
4,0,0,0,0,0,0,0,0,1,13,13,5,0,5,100.000000,38.461538,1
5,0,0,0,0,0,0,0,0,3,13,12,4,0,4,92.307692,30.769231,1
6,1,1,1,0,1,0,4,1,1,8,8,0,8,4,100.000000,0.000000,0
7,0,0,0,0,0,0,0,0,1,20,20,4,0,12,100.000000,20.000000,0
8,0,0,0,0,0,0,0,0,1,83,70,19,1,42,84.337349,22.891566,0
9,0,0,0,0,0,0,0,0,1,12,12,0,0,8,100.000000,0.000000,0


## 5. 모델링 및 학습

In [30]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

# feature와 label로 분리
y_labels = df['is_toxic']
X_features = df.drop(['is_toxic'], axis=1)
# train set과 validation set으로 분리
X_train, X_test, y_train, y_test = train_test_split(X_features, y_labels, test_size=0.2, random_state=0)

# 하이퍼 파라미터 설정
params = {
    'num_leaves' : [32, 64],
    'max_depth' : [128, 160],
    'min_child_samples' : [60, 100],
    'subsample' : [0.8, 1]
}

# 모델링
lgbm_clf = LGBMClassifier(n_estimators=500)
gridcv = GridSearchCV(lgbm_clf, param_grid=params)

# 학습
gridcv.fit(X_train, y_train, early_stopping_rounds=30, eval_metric="auc", eval_set=[(X_train, y_train), (X_test, y_test)])

# 검증 : 평가지표 ROC-AUC
print(gridcv.best_params_)
lgbm_roc_score = roc_auc_score(y_test, gridcv.predict_proba(X_test)[:, -1], average="macro")
print('ROC AUC : {0:4f}'.format(lgbm_roc_score))

[1]	valid_0's auc: 1	valid_0's binary_logloss: 0.234046	valid_1's auc: 1	valid_1's binary_logloss: 0.237247
Training until validation scores don't improve for 30 rounds
[2]	valid_0's auc: 1	valid_0's binary_logloss: 0.193767	valid_1's auc: 1	valid_1's binary_logloss: 0.196256
[3]	valid_0's auc: 1	valid_0's binary_logloss: 0.165421	valid_1's auc: 1	valid_1's binary_logloss: 0.167453
[4]	valid_0's auc: 1	valid_0's binary_logloss: 0.143431	valid_1's auc: 1	valid_1's binary_logloss: 0.145131
[5]	valid_0's auc: 1	valid_0's binary_logloss: 0.125556	valid_1's auc: 1	valid_1's binary_logloss: 0.127002
[6]	valid_0's auc: 1	valid_0's binary_logloss: 0.110625	valid_1's auc: 1	valid_1's binary_logloss: 0.111869
[7]	valid_0's auc: 1	valid_0's binary_logloss: 0.0979298	valid_1's auc: 1	valid_1's binary_logloss: 0.0990084
[8]	valid_0's auc: 1	valid_0's binary_logloss: 0.0870008	valid_1's auc: 1	valid_1's binary_logloss: 0.0879423
[9]	valid_0's auc: 1	valid_0's binary_logloss: 0.0775069	valid_1's auc:

[16]	valid_0's auc: 1	valid_0's binary_logloss: 0.0361054	valid_1's auc: 1	valid_1's binary_logloss: 0.0364659
[17]	valid_0's auc: 1	valid_0's binary_logloss: 0.0324969	valid_1's auc: 1	valid_1's binary_logloss: 0.0328196
[18]	valid_0's auc: 1	valid_0's binary_logloss: 0.0292664	valid_1's auc: 1	valid_1's binary_logloss: 0.0295556
[19]	valid_0's auc: 1	valid_0's binary_logloss: 0.0263706	valid_1's auc: 1	valid_1's binary_logloss: 0.02663
[20]	valid_0's auc: 1	valid_0's binary_logloss: 0.0237721	valid_1's auc: 1	valid_1's binary_logloss: 0.024005
[21]	valid_0's auc: 1	valid_0's binary_logloss: 0.0214382	valid_1's auc: 1	valid_1's binary_logloss: 0.0216475
[22]	valid_0's auc: 1	valid_0's binary_logloss: 0.0193402	valid_1's auc: 1	valid_1's binary_logloss: 0.0195284
[23]	valid_0's auc: 1	valid_0's binary_logloss: 0.017453	valid_1's auc: 1	valid_1's binary_logloss: 0.0176223
[24]	valid_0's auc: 1	valid_0's binary_logloss: 0.0157543	valid_1's auc: 1	valid_1's binary_logloss: 0.0159068
[25]	

[1]	valid_0's auc: 1	valid_0's binary_logloss: 0.234054	valid_1's auc: 1	valid_1's binary_logloss: 0.237255
Training until validation scores don't improve for 30 rounds
[2]	valid_0's auc: 1	valid_0's binary_logloss: 0.193774	valid_1's auc: 1	valid_1's binary_logloss: 0.196263
[3]	valid_0's auc: 1	valid_0's binary_logloss: 0.165428	valid_1's auc: 1	valid_1's binary_logloss: 0.167459
[4]	valid_0's auc: 1	valid_0's binary_logloss: 0.143437	valid_1's auc: 1	valid_1's binary_logloss: 0.145137
[5]	valid_0's auc: 1	valid_0's binary_logloss: 0.125562	valid_1's auc: 1	valid_1's binary_logloss: 0.127007
[6]	valid_0's auc: 1	valid_0's binary_logloss: 0.11063	valid_1's auc: 1	valid_1's binary_logloss: 0.111874
[7]	valid_0's auc: 1	valid_0's binary_logloss: 0.097934	valid_1's auc: 1	valid_1's binary_logloss: 0.0990126
[8]	valid_0's auc: 1	valid_0's binary_logloss: 0.0870046	valid_1's auc: 1	valid_1's binary_logloss: 0.0879461
[9]	valid_0's auc: 1	valid_0's binary_logloss: 0.0775103	valid_1's auc: 1

[16]	valid_0's auc: 1	valid_0's binary_logloss: 0.0361035	valid_1's auc: 1	valid_1's binary_logloss: 0.036464
[17]	valid_0's auc: 1	valid_0's binary_logloss: 0.0324952	valid_1's auc: 1	valid_1's binary_logloss: 0.0328179
[18]	valid_0's auc: 1	valid_0's binary_logloss: 0.0292648	valid_1's auc: 1	valid_1's binary_logloss: 0.029554
[19]	valid_0's auc: 1	valid_0's binary_logloss: 0.0263692	valid_1's auc: 1	valid_1's binary_logloss: 0.0266286
[20]	valid_0's auc: 1	valid_0's binary_logloss: 0.0237708	valid_1's auc: 1	valid_1's binary_logloss: 0.0240037
[21]	valid_0's auc: 1	valid_0's binary_logloss: 0.021437	valid_1's auc: 1	valid_1's binary_logloss: 0.0216463
[22]	valid_0's auc: 1	valid_0's binary_logloss: 0.0193392	valid_1's auc: 1	valid_1's binary_logloss: 0.0195274
[23]	valid_0's auc: 1	valid_0's binary_logloss: 0.0174521	valid_1's auc: 1	valid_1's binary_logloss: 0.0176214
[24]	valid_0's auc: 1	valid_0's binary_logloss: 0.0157534	valid_1's auc: 1	valid_1's binary_logloss: 0.0159059
[25]

[29]	valid_0's auc: 1	valid_0's binary_logloss: 0.00947132	valid_1's auc: 1	valid_1's binary_logloss: 0.00956212
[30]	valid_0's auc: 1	valid_0's binary_logloss: 0.00855915	valid_1's auc: 1	valid_1's binary_logloss: 0.00864109
[31]	valid_0's auc: 1	valid_0's binary_logloss: 0.0077358	valid_1's auc: 1	valid_1's binary_logloss: 0.00780977
Early stopping, best iteration is:
[1]	valid_0's auc: 1	valid_0's binary_logloss: 0.234046	valid_1's auc: 1	valid_1's binary_logloss: 0.237247
[1]	valid_0's auc: 1	valid_0's binary_logloss: 0.234045	valid_1's auc: 1	valid_1's binary_logloss: 0.237246
Training until validation scores don't improve for 30 rounds
[2]	valid_0's auc: 1	valid_0's binary_logloss: 0.193766	valid_1's auc: 1	valid_1's binary_logloss: 0.196255
[3]	valid_0's auc: 1	valid_0's binary_logloss: 0.165421	valid_1's auc: 1	valid_1's binary_logloss: 0.167452
[4]	valid_0's auc: 1	valid_0's binary_logloss: 0.14343	valid_1's auc: 1	valid_1's binary_logloss: 0.14513
[5]	valid_0's auc: 1	valid_0

[17]	valid_0's auc: 1	valid_0's binary_logloss: 0.0324954	valid_1's auc: 1	valid_1's binary_logloss: 0.0328181
[18]	valid_0's auc: 1	valid_0's binary_logloss: 0.029265	valid_1's auc: 1	valid_1's binary_logloss: 0.0295542
[19]	valid_0's auc: 1	valid_0's binary_logloss: 0.0263693	valid_1's auc: 1	valid_1's binary_logloss: 0.0266288
[20]	valid_0's auc: 1	valid_0's binary_logloss: 0.023771	valid_1's auc: 1	valid_1's binary_logloss: 0.0240039
[21]	valid_0's auc: 1	valid_0's binary_logloss: 0.0214372	valid_1's auc: 1	valid_1's binary_logloss: 0.0216465
[22]	valid_0's auc: 1	valid_0's binary_logloss: 0.0193393	valid_1's auc: 1	valid_1's binary_logloss: 0.0195275
[23]	valid_0's auc: 1	valid_0's binary_logloss: 0.0174522	valid_1's auc: 1	valid_1's binary_logloss: 0.0176215
[24]	valid_0's auc: 1	valid_0's binary_logloss: 0.0157535	valid_1's auc: 1	valid_1's binary_logloss: 0.015906
[25]	valid_0's auc: 1	valid_0's binary_logloss: 0.0142237	valid_1's auc: 1	valid_1's binary_logloss: 0.0143611
[26]

[1]	valid_0's auc: 1	valid_0's binary_logloss: 0.234046	valid_1's auc: 1	valid_1's binary_logloss: 0.237247
Training until validation scores don't improve for 30 rounds
[2]	valid_0's auc: 1	valid_0's binary_logloss: 0.193767	valid_1's auc: 1	valid_1's binary_logloss: 0.196256
[3]	valid_0's auc: 1	valid_0's binary_logloss: 0.165421	valid_1's auc: 1	valid_1's binary_logloss: 0.167453
[4]	valid_0's auc: 1	valid_0's binary_logloss: 0.143431	valid_1's auc: 1	valid_1's binary_logloss: 0.145131
[5]	valid_0's auc: 1	valid_0's binary_logloss: 0.125556	valid_1's auc: 1	valid_1's binary_logloss: 0.127002
[6]	valid_0's auc: 1	valid_0's binary_logloss: 0.110625	valid_1's auc: 1	valid_1's binary_logloss: 0.111869
[7]	valid_0's auc: 1	valid_0's binary_logloss: 0.0979298	valid_1's auc: 1	valid_1's binary_logloss: 0.0990084
[8]	valid_0's auc: 1	valid_0's binary_logloss: 0.0870008	valid_1's auc: 1	valid_1's binary_logloss: 0.0879423
[9]	valid_0's auc: 1	valid_0's binary_logloss: 0.0775069	valid_1's auc:

[17]	valid_0's auc: 1	valid_0's binary_logloss: 0.0324969	valid_1's auc: 1	valid_1's binary_logloss: 0.0328196
[18]	valid_0's auc: 1	valid_0's binary_logloss: 0.0292664	valid_1's auc: 1	valid_1's binary_logloss: 0.0295556
[19]	valid_0's auc: 1	valid_0's binary_logloss: 0.0263706	valid_1's auc: 1	valid_1's binary_logloss: 0.02663
[20]	valid_0's auc: 1	valid_0's binary_logloss: 0.0237721	valid_1's auc: 1	valid_1's binary_logloss: 0.024005
[21]	valid_0's auc: 1	valid_0's binary_logloss: 0.0214382	valid_1's auc: 1	valid_1's binary_logloss: 0.0216475
[22]	valid_0's auc: 1	valid_0's binary_logloss: 0.0193402	valid_1's auc: 1	valid_1's binary_logloss: 0.0195284
[23]	valid_0's auc: 1	valid_0's binary_logloss: 0.017453	valid_1's auc: 1	valid_1's binary_logloss: 0.0176223
[24]	valid_0's auc: 1	valid_0's binary_logloss: 0.0157543	valid_1's auc: 1	valid_1's binary_logloss: 0.0159068
[25]	valid_0's auc: 1	valid_0's binary_logloss: 0.0142244	valid_1's auc: 1	valid_1's binary_logloss: 0.0143617
[26]	

[31]	valid_0's auc: 1	valid_0's binary_logloss: 0.00773575	valid_1's auc: 1	valid_1's binary_logloss: 0.00780972
Early stopping, best iteration is:
[1]	valid_0's auc: 1	valid_0's binary_logloss: 0.234045	valid_1's auc: 1	valid_1's binary_logloss: 0.237246
[1]	valid_0's auc: 1	valid_0's binary_logloss: 0.234054	valid_1's auc: 1	valid_1's binary_logloss: 0.237255
Training until validation scores don't improve for 30 rounds
[2]	valid_0's auc: 1	valid_0's binary_logloss: 0.193774	valid_1's auc: 1	valid_1's binary_logloss: 0.196263
[3]	valid_0's auc: 1	valid_0's binary_logloss: 0.165428	valid_1's auc: 1	valid_1's binary_logloss: 0.167459
[4]	valid_0's auc: 1	valid_0's binary_logloss: 0.143437	valid_1's auc: 1	valid_1's binary_logloss: 0.145137
[5]	valid_0's auc: 1	valid_0's binary_logloss: 0.125562	valid_1's auc: 1	valid_1's binary_logloss: 0.127007
[6]	valid_0's auc: 1	valid_0's binary_logloss: 0.11063	valid_1's auc: 1	valid_1's binary_logloss: 0.111874
[7]	valid_0's auc: 1	valid_0's binar

[16]	valid_0's auc: 1	valid_0's binary_logloss: 0.0361035	valid_1's auc: 1	valid_1's binary_logloss: 0.036464
[17]	valid_0's auc: 1	valid_0's binary_logloss: 0.0324952	valid_1's auc: 1	valid_1's binary_logloss: 0.0328179
[18]	valid_0's auc: 1	valid_0's binary_logloss: 0.0292648	valid_1's auc: 1	valid_1's binary_logloss: 0.029554
[19]	valid_0's auc: 1	valid_0's binary_logloss: 0.0263692	valid_1's auc: 1	valid_1's binary_logloss: 0.0266286
[20]	valid_0's auc: 1	valid_0's binary_logloss: 0.0237708	valid_1's auc: 1	valid_1's binary_logloss: 0.0240037
[21]	valid_0's auc: 1	valid_0's binary_logloss: 0.021437	valid_1's auc: 1	valid_1's binary_logloss: 0.0216463
[22]	valid_0's auc: 1	valid_0's binary_logloss: 0.0193392	valid_1's auc: 1	valid_1's binary_logloss: 0.0195274
[23]	valid_0's auc: 1	valid_0's binary_logloss: 0.0174521	valid_1's auc: 1	valid_1's binary_logloss: 0.0176214
[24]	valid_0's auc: 1	valid_0's binary_logloss: 0.0157534	valid_1's auc: 1	valid_1's binary_logloss: 0.0159059
[25]

[1]	valid_0's auc: 1	valid_0's binary_logloss: 0.234045	valid_1's auc: 1	valid_1's binary_logloss: 0.237246
Training until validation scores don't improve for 30 rounds
[2]	valid_0's auc: 1	valid_0's binary_logloss: 0.193766	valid_1's auc: 1	valid_1's binary_logloss: 0.196255
[3]	valid_0's auc: 1	valid_0's binary_logloss: 0.165421	valid_1's auc: 1	valid_1's binary_logloss: 0.167452
[4]	valid_0's auc: 1	valid_0's binary_logloss: 0.14343	valid_1's auc: 1	valid_1's binary_logloss: 0.14513
[5]	valid_0's auc: 1	valid_0's binary_logloss: 0.125556	valid_1's auc: 1	valid_1's binary_logloss: 0.127001
[6]	valid_0's auc: 1	valid_0's binary_logloss: 0.110625	valid_1's auc: 1	valid_1's binary_logloss: 0.111868
[7]	valid_0's auc: 1	valid_0's binary_logloss: 0.0979293	valid_1's auc: 1	valid_1's binary_logloss: 0.0990078
[8]	valid_0's auc: 1	valid_0's binary_logloss: 0.0870003	valid_1's auc: 1	valid_1's binary_logloss: 0.0879418
[9]	valid_0's auc: 1	valid_0's binary_logloss: 0.0775064	valid_1's auc: 1

[17]	valid_0's auc: 1	valid_0's binary_logloss: 0.0324954	valid_1's auc: 1	valid_1's binary_logloss: 0.0328181
[18]	valid_0's auc: 1	valid_0's binary_logloss: 0.029265	valid_1's auc: 1	valid_1's binary_logloss: 0.0295542
[19]	valid_0's auc: 1	valid_0's binary_logloss: 0.0263693	valid_1's auc: 1	valid_1's binary_logloss: 0.0266288
[20]	valid_0's auc: 1	valid_0's binary_logloss: 0.023771	valid_1's auc: 1	valid_1's binary_logloss: 0.0240039
[21]	valid_0's auc: 1	valid_0's binary_logloss: 0.0214372	valid_1's auc: 1	valid_1's binary_logloss: 0.0216465
[22]	valid_0's auc: 1	valid_0's binary_logloss: 0.0193393	valid_1's auc: 1	valid_1's binary_logloss: 0.0195275
[23]	valid_0's auc: 1	valid_0's binary_logloss: 0.0174522	valid_1's auc: 1	valid_1's binary_logloss: 0.0176215
[24]	valid_0's auc: 1	valid_0's binary_logloss: 0.0157535	valid_1's auc: 1	valid_1's binary_logloss: 0.015906
[25]	valid_0's auc: 1	valid_0's binary_logloss: 0.0142237	valid_1's auc: 1	valid_1's binary_logloss: 0.0143611
[26]

[1]	valid_0's auc: 1	valid_0's binary_logloss: 0.234046	valid_1's auc: 1	valid_1's binary_logloss: 0.237247
Training until validation scores don't improve for 30 rounds
[2]	valid_0's auc: 1	valid_0's binary_logloss: 0.193767	valid_1's auc: 1	valid_1's binary_logloss: 0.196256
[3]	valid_0's auc: 1	valid_0's binary_logloss: 0.165421	valid_1's auc: 1	valid_1's binary_logloss: 0.167453
[4]	valid_0's auc: 1	valid_0's binary_logloss: 0.143431	valid_1's auc: 1	valid_1's binary_logloss: 0.145131
[5]	valid_0's auc: 1	valid_0's binary_logloss: 0.125556	valid_1's auc: 1	valid_1's binary_logloss: 0.127002
[6]	valid_0's auc: 1	valid_0's binary_logloss: 0.110625	valid_1's auc: 1	valid_1's binary_logloss: 0.111869
[7]	valid_0's auc: 1	valid_0's binary_logloss: 0.0979298	valid_1's auc: 1	valid_1's binary_logloss: 0.0990084
[8]	valid_0's auc: 1	valid_0's binary_logloss: 0.0870008	valid_1's auc: 1	valid_1's binary_logloss: 0.0879423
[9]	valid_0's auc: 1	valid_0's binary_logloss: 0.0775069	valid_1's auc:

[15]	valid_0's auc: 1	valid_0's binary_logloss: 0.0401417	valid_1's auc: 1	valid_1's binary_logloss: 0.0405451
[16]	valid_0's auc: 1	valid_0's binary_logloss: 0.0361054	valid_1's auc: 1	valid_1's binary_logloss: 0.0364659
[17]	valid_0's auc: 1	valid_0's binary_logloss: 0.0324969	valid_1's auc: 1	valid_1's binary_logloss: 0.0328196
[18]	valid_0's auc: 1	valid_0's binary_logloss: 0.0292664	valid_1's auc: 1	valid_1's binary_logloss: 0.0295556
[19]	valid_0's auc: 1	valid_0's binary_logloss: 0.0263706	valid_1's auc: 1	valid_1's binary_logloss: 0.02663
[20]	valid_0's auc: 1	valid_0's binary_logloss: 0.0237721	valid_1's auc: 1	valid_1's binary_logloss: 0.024005
[21]	valid_0's auc: 1	valid_0's binary_logloss: 0.0214382	valid_1's auc: 1	valid_1's binary_logloss: 0.0216475
[22]	valid_0's auc: 1	valid_0's binary_logloss: 0.0193402	valid_1's auc: 1	valid_1's binary_logloss: 0.0195284
[23]	valid_0's auc: 1	valid_0's binary_logloss: 0.017453	valid_1's auc: 1	valid_1's binary_logloss: 0.0176223
[24]	

[31]	valid_0's auc: 1	valid_0's binary_logloss: 0.00773575	valid_1's auc: 1	valid_1's binary_logloss: 0.00780972
Early stopping, best iteration is:
[1]	valid_0's auc: 1	valid_0's binary_logloss: 0.234045	valid_1's auc: 1	valid_1's binary_logloss: 0.237246
[1]	valid_0's auc: 1	valid_0's binary_logloss: 0.234054	valid_1's auc: 1	valid_1's binary_logloss: 0.237255
Training until validation scores don't improve for 30 rounds
[2]	valid_0's auc: 1	valid_0's binary_logloss: 0.193774	valid_1's auc: 1	valid_1's binary_logloss: 0.196263
[3]	valid_0's auc: 1	valid_0's binary_logloss: 0.165428	valid_1's auc: 1	valid_1's binary_logloss: 0.167459
[4]	valid_0's auc: 1	valid_0's binary_logloss: 0.143437	valid_1's auc: 1	valid_1's binary_logloss: 0.145137
[5]	valid_0's auc: 1	valid_0's binary_logloss: 0.125562	valid_1's auc: 1	valid_1's binary_logloss: 0.127007
[6]	valid_0's auc: 1	valid_0's binary_logloss: 0.11063	valid_1's auc: 1	valid_1's binary_logloss: 0.111874
[7]	valid_0's auc: 1	valid_0's binar

[16]	valid_0's auc: 1	valid_0's binary_logloss: 0.0361035	valid_1's auc: 1	valid_1's binary_logloss: 0.036464
[17]	valid_0's auc: 1	valid_0's binary_logloss: 0.0324952	valid_1's auc: 1	valid_1's binary_logloss: 0.0328179
[18]	valid_0's auc: 1	valid_0's binary_logloss: 0.0292648	valid_1's auc: 1	valid_1's binary_logloss: 0.029554
[19]	valid_0's auc: 1	valid_0's binary_logloss: 0.0263692	valid_1's auc: 1	valid_1's binary_logloss: 0.0266286
[20]	valid_0's auc: 1	valid_0's binary_logloss: 0.0237708	valid_1's auc: 1	valid_1's binary_logloss: 0.0240037
[21]	valid_0's auc: 1	valid_0's binary_logloss: 0.021437	valid_1's auc: 1	valid_1's binary_logloss: 0.0216463
[22]	valid_0's auc: 1	valid_0's binary_logloss: 0.0193392	valid_1's auc: 1	valid_1's binary_logloss: 0.0195274
[23]	valid_0's auc: 1	valid_0's binary_logloss: 0.0174521	valid_1's auc: 1	valid_1's binary_logloss: 0.0176214
[24]	valid_0's auc: 1	valid_0's binary_logloss: 0.0157534	valid_1's auc: 1	valid_1's binary_logloss: 0.0159059
[25]

[1]	valid_0's auc: 1	valid_0's binary_logloss: 0.234045	valid_1's auc: 1	valid_1's binary_logloss: 0.237246
Training until validation scores don't improve for 30 rounds
[2]	valid_0's auc: 1	valid_0's binary_logloss: 0.193766	valid_1's auc: 1	valid_1's binary_logloss: 0.196255
[3]	valid_0's auc: 1	valid_0's binary_logloss: 0.165421	valid_1's auc: 1	valid_1's binary_logloss: 0.167452
[4]	valid_0's auc: 1	valid_0's binary_logloss: 0.14343	valid_1's auc: 1	valid_1's binary_logloss: 0.14513
[5]	valid_0's auc: 1	valid_0's binary_logloss: 0.125556	valid_1's auc: 1	valid_1's binary_logloss: 0.127001
[6]	valid_0's auc: 1	valid_0's binary_logloss: 0.110625	valid_1's auc: 1	valid_1's binary_logloss: 0.111868
[7]	valid_0's auc: 1	valid_0's binary_logloss: 0.0979293	valid_1's auc: 1	valid_1's binary_logloss: 0.0990078
[8]	valid_0's auc: 1	valid_0's binary_logloss: 0.0870003	valid_1's auc: 1	valid_1's binary_logloss: 0.0879418
[9]	valid_0's auc: 1	valid_0's binary_logloss: 0.0775064	valid_1's auc: 1

[16]	valid_0's auc: 1	valid_0's binary_logloss: 0.0361037	valid_1's auc: 1	valid_1's binary_logloss: 0.0364643
[17]	valid_0's auc: 1	valid_0's binary_logloss: 0.0324954	valid_1's auc: 1	valid_1's binary_logloss: 0.0328181
[18]	valid_0's auc: 1	valid_0's binary_logloss: 0.029265	valid_1's auc: 1	valid_1's binary_logloss: 0.0295542
[19]	valid_0's auc: 1	valid_0's binary_logloss: 0.0263693	valid_1's auc: 1	valid_1's binary_logloss: 0.0266288
[20]	valid_0's auc: 1	valid_0's binary_logloss: 0.023771	valid_1's auc: 1	valid_1's binary_logloss: 0.0240039
[21]	valid_0's auc: 1	valid_0's binary_logloss: 0.0214372	valid_1's auc: 1	valid_1's binary_logloss: 0.0216465
[22]	valid_0's auc: 1	valid_0's binary_logloss: 0.0193393	valid_1's auc: 1	valid_1's binary_logloss: 0.0195275
[23]	valid_0's auc: 1	valid_0's binary_logloss: 0.0174522	valid_1's auc: 1	valid_1's binary_logloss: 0.0176215
[24]	valid_0's auc: 1	valid_0's binary_logloss: 0.0157535	valid_1's auc: 1	valid_1's binary_logloss: 0.015906
[25]

[29]	valid_0's auc: 1	valid_0's binary_logloss: 0.00947177	valid_1's auc: 1	valid_1's binary_logloss: 0.00956257
[30]	valid_0's auc: 1	valid_0's binary_logloss: 0.00855956	valid_1's auc: 1	valid_1's binary_logloss: 0.0086415
[31]	valid_0's auc: 1	valid_0's binary_logloss: 0.00773617	valid_1's auc: 1	valid_1's binary_logloss: 0.00781013
Early stopping, best iteration is:
[1]	valid_0's auc: 1	valid_0's binary_logloss: 0.234054	valid_1's auc: 1	valid_1's binary_logloss: 0.237255
[1]	valid_0's auc: 1	valid_0's binary_logloss: 0.234046	valid_1's auc: 1	valid_1's binary_logloss: 0.237247
Training until validation scores don't improve for 30 rounds
[2]	valid_0's auc: 1	valid_0's binary_logloss: 0.193767	valid_1's auc: 1	valid_1's binary_logloss: 0.196256
[3]	valid_0's auc: 1	valid_0's binary_logloss: 0.165421	valid_1's auc: 1	valid_1's binary_logloss: 0.167453
[4]	valid_0's auc: 1	valid_0's binary_logloss: 0.143431	valid_1's auc: 1	valid_1's binary_logloss: 0.145131
[5]	valid_0's auc: 1	valid

[16]	valid_0's auc: 1	valid_0's binary_logloss: 0.0361054	valid_1's auc: 1	valid_1's binary_logloss: 0.0364659
[17]	valid_0's auc: 1	valid_0's binary_logloss: 0.0324969	valid_1's auc: 1	valid_1's binary_logloss: 0.0328196
[18]	valid_0's auc: 1	valid_0's binary_logloss: 0.0292664	valid_1's auc: 1	valid_1's binary_logloss: 0.0295556
[19]	valid_0's auc: 1	valid_0's binary_logloss: 0.0263706	valid_1's auc: 1	valid_1's binary_logloss: 0.02663
[20]	valid_0's auc: 1	valid_0's binary_logloss: 0.0237721	valid_1's auc: 1	valid_1's binary_logloss: 0.024005
[21]	valid_0's auc: 1	valid_0's binary_logloss: 0.0214382	valid_1's auc: 1	valid_1's binary_logloss: 0.0216475
[22]	valid_0's auc: 1	valid_0's binary_logloss: 0.0193402	valid_1's auc: 1	valid_1's binary_logloss: 0.0195284
[23]	valid_0's auc: 1	valid_0's binary_logloss: 0.017453	valid_1's auc: 1	valid_1's binary_logloss: 0.0176223
[24]	valid_0's auc: 1	valid_0's binary_logloss: 0.0157543	valid_1's auc: 1	valid_1's binary_logloss: 0.0159068
[25]	

## 6. 모델 내보내기

In [31]:
import pickle
from sklearn.externals import joblib

joblib.dump(gridcv, 'model.pkl')

['model.pkl']