In [None]:
%%time
# Environment setup for Google Colab: fetch and run the MeCab-ko install helper,
# then build LightGBM from source with GPU support.
# NOTE(review): both repositories are cloned from their default branch with no
# pinned tag/commit, so a fresh run may install different versions each time.
! git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
%cd /content/Mecab-ko-for-Google-Colab
!bash install_mecab-ko_on_colab190912.sh
# Go back up one directory before cloning LightGBM.
%cd ../
# --recursive also pulls LightGBM's git submodules.
! git clone --recursive https://github.com/Microsoft/LightGBM
# Rebuild from a clean build directory with -DUSE_GPU=1, compile with 4 parallel jobs,
# then install the Python package with the --precompile --gpu flags.
! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;    

Cloning into 'Mecab-ko-for-Google-Colab'...
remote: Enumerating objects: 75, done.[K
remote: Counting objects: 100% (75/75), done.[K
remote: Compressing objects: 100% (70/70), done.[K
remote: Total 75 (delta 33), reused 20 (delta 5), pack-reused 0[K
Unpacking objects: 100% (75/75), done.
/content/Mecab-ko-for-Google-Colab
Installing konlpy.....
Collecting konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 81.0MB/s 
Collecting tweepy>=3.7.0
  Downloading https://files.pythonhosted.org/packages/67/c3/6bed87f3b1e5ed2f34bd58bf7978e308c86e255193916be76e5a5ce5dfca/tweepy-3.10.0-py2.py3-none-any.whl
Collecting JPype1>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/b7/21/9e2c0dbf9df856e6392a1aec1d18006c60b175aa4e31d351e8278a8a63c0/JPype1-1.2.0-cp36-cp36m-manylinux2010_x86_64.whl (453kB)
[K     |██

In [None]:
import datetime
import random
import re
import time
from collections import Counter

import matplotlib.pyplot as plt
import MeCab
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from IPython.display import Image, display
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [None]:
def preprocess(text):
    """Strip hyperlinks and punctuation from ``text`` and normalise whitespace.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example ``None`` or the float NaN
        that pandas uses for a missing cell) is treated as empty text.

    Returns
    -------
    str
        The text without URLs and without the punctuation/symbol characters
        listed in the pattern below, with every run of whitespace collapsed to
        a single space and no leading or trailing whitespace.
    """
    if not isinstance(text, str):
        return ''
    # Remove only the URL token itself (up to the next whitespace) so that the
    # words following a link on the same line are kept.
    text = re.sub(r'https?://\S+', '', text)
    # Remove punctuation and symbols. '.', '!' and '?' are part of the character
    # class, so sentence-ending marks are removed as well.
    text = re.sub(r'[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]', '', text)
    # split()/join collapses consecutive whitespace (including newlines) and trims.
    return " ".join(text.split())

mecab = MeCab.Tagger()
def mecab_morphs(text):
    """Return the list of morpheme surface forms that MeCab produces for ``text``.

    Every line of the parser output except the last one is matched against
    ``".*\\t[A-Z]+"``; the matched prefix is split on tabs and its first field
    is collected. A line that does not match the pattern raises
    ``AttributeError`` (``match`` returns ``None``).
    """
    line_pattern = re.compile(".*\t[A-Z]+")
    # The last output line is dropped (presumably MeCab's end-of-sentence marker).
    parsed_lines = mecab.parse(text).splitlines()[:-1]

    surfaces = []
    for line in parsed_lines:
        # Shape each line like konlpy's mecab output (fields separated by a tab)
        # and keep only the first field.
        fields = tuple(line_pattern.match(line).group(0).split("\t"))
        surfaces.append(fields[0])
    return surfaces

In [None]:
%%time
train = pd.read_csv("/content/drive/My Drive/dacon/news_train.csv") # train.csv 불러오기
test = pd.read_csv("/content/drive/My Drive/dacon/news_test.csv") # test.csv 불러오기
train["id"] = train["n_id"].astype(str) + '_' + train["ord"].astype(str)
train.drop(['n_id', 'ord'],axis = 1) 

train['clean_text'] = train['content'].apply(preprocess)
test['clean_text'] = test['content'].apply(preprocess)
train['clean_title'] = train['title'].apply(preprocess)
test['clean_title'] = test['title'].apply(preprocess)

random.seed(2020) 
train_nsm_list=list(train[train['info']!=1].index)
train_nsmishing=random.sample(train_nsm_list, 71813) 
random.seed(2020)
train_sm_list=list(train[train['info']==1].index)
train_smishing=random.sample(train_sm_list, 46932)
train_xx=train.loc[train_smishing+train_nsmishing,:].reset_index(drop=True)
train_yy=pd.DataFrame(train['info'],columns=['info']) 
train_yyy=train_yy.loc[train_smishing+train_nsmishing,:].reset_index(drop=True)

train_doc = [(mecab_morphs(x1), mecab_morphs(x2), y) for x1, x2, y in tqdm(zip(train_xx['clean_text'], train_xx['clean_title'], train_yyy['info']))]
test_doc = [(mecab_morphs(x1), mecab_morphs(x2)) for x1, x2 in tqdm(zip(test['clean_text'], test['clean_title']))]

stopwords = ['의', '가', '이', '은', '들', '는', '좀', '잘', '걍', '과', '도', '를', '으로', '자', '에', '와', '한', '하다']
def get_model_input(_words):
    """Yield the tokens of ``_words`` that are not stopwords, in order.

    Parameters
    ----------
    _words : iterable
        Tokens to filter. A token is either a plain string or a sequence such
        as ``(surface, tag)`` whose first element is the surface form.

    Yields
    ------
    Each token whose surface form is not in ``stopwords``. Every surviving
    token is yielded, including the last one.

    Notes
    -----
    The whole surface form is compared with the stopword list. Indexing a
    string token (``token[0]`` / ``token[:-1]``) would compare only its first
    character or its prefix, which discards ordinary words that merely start
    with a stopword character (e.g. '이번', '가격') while letting multi-character
    stopwords such as '으로' through.
    """
    for token in _words:
        surface = token if isinstance(token, str) else token[0]
        if surface not in stopwords:
            yield token
        
def _join_kept_tokens(morphs):
    """Space-join the tokens yielded by get_model_input, skipping length-1 tokens."""
    return " ".join(str(tok) for tok in get_model_input(morphs) if len(tok) != 1)

# Each train_doc entry is (text_morphs, title_morphs, label).
train_text = [_join_kept_tokens(doc[0]) for doc in train_doc]
train_title = [_join_kept_tokens(doc[1]) for doc in train_doc]
Y_train = [doc[2] for doc in train_doc]

# Each test_doc entry is (text_morphs, title_morphs).
test_text = [_join_kept_tokens(doc[0]) for doc in test_doc]
test_title = [_join_kept_tokens(doc[1]) for doc in test_doc]

# TF-IDF settings shared by the title and body-text features.
tfidf_options = dict(
    ngram_range=(1, 3),
    min_df=2,
    max_features=15000,
    sublinear_tf=True,
    lowercase=False,
    use_idf=True,
)
vectorizer = TfidfVectorizer(**tfidf_options)

# Titles: fit on the training titles, then transform the test titles.
vec_train_title = vectorizer.fit_transform(train_title)
vec_test_title = vectorizer.transform(test_title)

# Body text: fit_transform re-fits the same vectorizer object, so from here on
# `vectorizer` is fitted on the body text, not on the titles.
vec_train_text = vectorizer.fit_transform(train_text)
vec_test_text = vectorizer.transform(test_text)


118745it [00:19, 6126.54it/s]
142565it [00:22, 6360.20it/s]


CPU times: user 1min 3s, sys: 1.06 s, total: 1min 4s
Wall time: 1min 9s


In [None]:
# Load the prepared train / validation / test tables.
x_train = pd.read_csv("/content/drive/MyDrive/dacon/final/train.csv")
x_val = pd.read_csv("/content/drive/MyDrive/dacon/final/val.csv")
x_test = pd.read_csv("/content/drive/MyDrive/dacon/final/test.csv")

# Pull the target column out before it is removed from the feature frames.
y_train, y_val = x_train['info'], x_val['info']

# Columns removed from the train / validation feature frames.
# NOTE(review): x_test is loaded here but is not reduced to the same columns
# in this cell - confirm where the test features are prepared.
non_feature_cols = ['info', 'clean_text', 'clean_title']
x_train = x_train.drop(columns=non_feature_cols)
x_val = x_val.drop(columns=non_feature_cols)

In [None]:
# Cast the two *_emb columns of the train and validation frames to pandas
# 'category' dtype.
for frame in (x_train, x_val):
    for emb_col in ('title_emb', 'text_emb'):
        frame[emb_col] = frame[emb_col].astype('category')


In [None]:
# Training with more samples reached 0.95 (+0.02) at the cost of ~230 extra seconds.
# `eval_set` is an argument of fit(), not of the estimator constructor; extra
# constructor keywords are forwarded to LightGBM as booster parameters.
# `is_unbalance` is given as a real boolean rather than the string 'True'.
# NOTE(review): device='gpu' needs a GPU-enabled LightGBM build and a GPU runtime.
lgbm1 = LGBMClassifier(is_unbalance=True,
                       learning_rate=0.0062,
                       max_depth=31,
                       min_data_in_leaf=7,
                       n_estimators=3900,
                       num_leaves=54,
                       device='gpu')
# Evaluate on the validation split while training.
lgbm1.fit(x_train, y_train, eval_set=[(x_val, y_val)])



LightGBMError: ignored

In [None]:
%%time
y_test_pred1=lgbm1.predict_proba(test_input)
y_test_pred1_one= [1 if i[1] >= 0.5 else 0 for i in y_test_pred1]
y_test_pred1_one = y_test_pred1_one[:142565]

In [None]:
%%time
submission = pd.DataFrame(
{"id" : test.id,
"info" : y_test_pred1_one})
submission.head()
submission.to_csv("mecab_lightGBM_titlebody.csv", index = False)

CPU times: user 194 ms, sys: 0 ns, total: 194 ms
Wall time: 197 ms
