### json 불러와서 저장하기

In [114]:
import numpy as np
import pandas as pd
import json
import jpype
import glob
from random import shuffle
from bs4 import BeautifulSoup as bs

# Directory that is searched (non-recursively) for *.json input files.
data_path = "data"

file_list = glob.glob("%s/*.json" % data_path)

# Flat list collecting the "articles" entries of every input file.
json_train = []

# Visit the files in random order so the collected articles are not
# ordered by whatever order glob happened to return.
shuffle(file_list)
for json_file_name in file_list:
    # The context manager closes each handle as soon as the file has been
    # parsed, instead of leaving it open until garbage collection.  JSON is
    # decoded as UTF-8 explicitly so the result does not depend on the locale.
    with open(json_file_name, encoding="utf-8") as json_fp:
        json_file = json.load(json_fp)
    # Every file is expected to be an object with an "articles" list.
    json_train.extend(json_file["articles"])

### json에서 author, author 형태소, forumid 추출

In [115]:
from konlpy.tag import Mecab

# Morphological analyser used to split each author name into (word, tag) pairs.
mecab = Mecab()

# One row per article: label, raw author name, tagged morphemes and forum id.
labeled_train = []

for cnt, article in enumerate(json_train):
    # Progress indicator: print the running index every 10000 articles.
    if not cnt % 10000:
        print(cnt)

    troll_flag = article["is_troll"]
    author_name = article["author"]

    # Join every morpheme with its part-of-speech tag, e.g. "word_TAG".
    tagged_morphemes = []
    for word, pos in mecab.pos(author_name):
        tagged_morphemes.append("%s_%s" % (word, pos))

    row = {
        "istroll": troll_flag,
        "author": author_name,
        "author_pos": tagged_morphemes,
        "forumid": article["forumid"],
    }
    labeled_train.append(row)

# Turn the list of row dicts into a DataFrame (rebinding the same name).
labeled_train = pd.DataFrame.from_dict(labeled_train)
labeled_train

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000


Unnamed: 0,author,author_pos,forumid,istroll
0,밭갈았농,"[밭_NNG, 갈_VV, 았_EP, 농_XR]",정치사회밭,False
1,휘슬러,[휘슬러_NNP],잡담밭,False
2,내가니하니,"[내_NP, 가_JKS, 니_NP, 하_VV, 니_EC]",잡담밭,False
3,대지대지꿀대지,"[대_XPN, 지대지_NNG, 꿀_VV+ETM, 대지_NNG]",잡담밭,False
4,이계덕기자,"[이계덕_NNP, 기자_NNG]",정치사회밭,True
5,알콜사랑,"[알콜_NNG, 사랑_NNG]",덕후밭,False
6,흐엉얽흐헑,"[흐엉_NNP, 얽_VV, 흐_IC, 헑_UNKNOWN]",잡담밭,False
7,890202,[890202_SN],잡담밭,False
8,치킨이제일로좋닭,"[치킨_NNG, 이제_MAG, 일_NNG, 로_JKB, 좋_VA+ETM, 닭_NNG]",잡담밭,False
9,올해도대풍년,"[올해_NNG, 도_JX, 대풍년_NNG]",잡담밭,False


### Feature Hasher

In [116]:
from sklearn.feature_extraction import FeatureHasher

# Width of the hashed feature space used for the author names.
hasher_features = 1000

hasher = FeatureHasher(input_type="string", n_features=hasher_features)

# With input_type="string" every sample must be an iterable of string tokens.
# Handing over the bare author strings only worked because a str iterates
# character by character, and recent scikit-learn releases reject a plain str
# sample with a ValueError.  Splitting each name into its characters
# explicitly keeps the same character-level hashing on old and new versions.
author_chars = (list(author) for author in labeled_train["author"])

hashed = hasher.transform(author_chars)
# Densify the sparse result so it can be concatenated column-wise below.
hashed = pd.DataFrame(hashed.toarray())

# Name the hashed columns author_1 .. author_<hasher_features>.
hashed.columns = ["author_%d" % author_num for author_num in range(1, hasher_features+1)]

labeled_train = pd.concat([labeled_train, hashed], axis=1)
labeled_train

Unnamed: 0,author,author_pos,forumid,istroll,1,2,3,4,5,6,...,991,992,993,994,995,996,997,998,999,1000
0,밭갈았농,"[밭_NNG, 갈_VV, 았_EP, 농_XR]",정치사회밭,False,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,휘슬러,[휘슬러_NNP],잡담밭,False,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,내가니하니,"[내_NP, 가_JKS, 니_NP, 하_VV, 니_EC]",잡담밭,False,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0
3,대지대지꿀대지,"[대_XPN, 지대지_NNG, 꿀_VV+ETM, 대지_NNG]",잡담밭,False,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,이계덕기자,"[이계덕_NNP, 기자_NNG]",정치사회밭,True,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,알콜사랑,"[알콜_NNG, 사랑_NNG]",덕후밭,False,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,흐엉얽흐헑,"[흐엉_NNP, 얽_VV, 흐_IC, 헑_UNKNOWN]",잡담밭,False,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,890202,[890202_SN],잡담밭,False,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,치킨이제일로좋닭,"[치킨_NNG, 이제_MAG, 일_NNG, 로_JKB, 좋_VA+ETM, 닭_NNG]",잡담밭,False,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,올해도대풍년,"[올해_NNG, 도_JX, 대풍년_NNG]",잡담밭,False,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### DictVectorizer - forum_id

In [117]:
from sklearn.feature_extraction import DictVectorizer

# Forum id of every article, in row order.
X = labeled_train["forumid"]

# DictVectorizer consumes one mapping per sample, so wrap each forum id in a
# single-key dict.
temp_list = [{"forumid": item} for item in X]

dicvec = DictVectorizer()

# Fit and encode in one step, then densify into a DataFrame.
encoded_matrix = dicvec.fit_transform(temp_list).toarray()
forumid = pd.DataFrame(encoded_matrix)

# Columns are named by position: forumid_0, forumid_1, ...
renamed_columns = []
for colname in forumid.columns:
    renamed_columns.append("%s_%d" % ("forumid", colname))
forumid.columns = renamed_columns

# Append the encoded forum columns to the right of the existing frame.
labeled_train = pd.concat([labeled_train, forumid], axis=1)

### predictor, model 생성

In [118]:
from sklearn.ensemble import RandomForestClassifier

# Name of the target column.
label = 'istroll'

# Columns that must not be fed to the model: the raw/text columns and the
# target itself.  Everything that remains is used as a predictor.
non_feature_columns = ['author_pos', 'author', 'forumid', label]
pre = labeled_train.columns.drop(non_feature_columns)

# Random forest with 10 trees, trained using 6 parallel jobs.
model = RandomForestClassifier(
    n_jobs=6,
    n_estimators=10,
)

### cross validation

In [119]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; cross_val_score now lives in sklearn.model_selection.  The fallback
# keeps the cell runnable on installations older than 0.18.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# Number of cross-validation folds.
cv_value = 3

# Score the model on the predictor columns against the label column,
# using ROC AUC as the metric for each fold.
scores = cross_val_score(model, labeled_train[pre], labeled_train[label], cv=cv_value, scoring="roc_auc")
# Report the mean score across the folds.
cv_result = scores.mean()

print(cv_result)

0.846958709489


## Result

 * author를 feature hasher로 변환 (n_features : 1000) : 0.831904399113
 * \+ forumid를 DictVectorizer로 one-hot 인코딩하여 추가 : 0.841882814218

In [120]:
# Display the column index of the assembled frame (bare expression, so it
# only produces output when run as the last statement of a notebook cell).
labeled_train.columns

Index(['author', 'author_pos', 'forumid', 'istroll', '1', '2', '3', '4', '5',
       '6', 
       ...
       'forumid_26', 'forumid_27', 'forumid_28', 'forumid_29', 'forumid_30',
       'forumid_31', 'forumid_32', 'forumid_33', 'forumid_34', 'forumid_35'],
      dtype='object', length=1040)

In [121]:
import pickle

# Columns to export: everything except the raw/text columns.  The label
# column ('istroll') is deliberately not dropped here, so it is saved
# together with the features.
pre = labeled_train.columns.drop(['author_pos', 'author', 'forumid'])

# The file name records which encoders produced the features.
output_path = "author(feature_hasher_%d), forumid(dict_vectorizer).p" % hasher_features

# Write inside a context manager so the handle is flushed and closed even if
# pickling fails part-way through.
with open(output_path, "wb") as output_file:
    pickle.dump(labeled_train[pre], output_file, protocol=pickle.HIGHEST_PROTOCOL)