### json 불러와서 저장하기

In [2]:
import glob
import json
import random
from random import shuffle

import jpype
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as bs

#data_path = "data"
data_path = "data(test)"

file_list = glob.glob("%s/*.json" % data_path)

json_train=[]

shuffle(file_list)
for json_file_name in file_list:
    json_file = json.loads(open(json_file_name).read())
    json_train += json_file["articles"]

### json에서 author, author 형태소, forumid 추출

In [3]:
from konlpy.tag import Mecab

mecab = Mecab()

labeled_train = []

for cnt, article in enumerate(json_train):
    if cnt % 10000 == 0:
        print(cnt)
    author_pos = ["%s_%s" % (word, pos) for word, pos in mecab.pos(article["author"])]

    labeled_train.append({
            "istroll": article["is_troll"],
            "author": article["author"],
            "author_pos": author_pos,
            "author_pos_sentences" : " ".join(author_pos),
            "forumid": article["forumid"],
            "pk": article["pk"]
        })

labeled_train = pd.DataFrame.from_dict(labeled_train)
labeled_train = labeled_train.set_index('pk')

labeled_train

0


Unnamed: 0_level_0,author,author_pos,author_pos_sentences,forumid,istroll
pk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,세부이장,"[세부_NNG, 이장_NNG]",세부_NNG 이장_NNG,마을회관밭,False
2,관리자,[관리자_NNG],관리자_NNG,잡담밭,False
3,snoopy,[snoopy_SL],snoopy_SL,잡담밭,False
4,ISCARIOT,[ISCARIOT_SL],ISCARIOT_SL,정치사회밭,False
5,1234,[1234_SN],1234_SN,연예인밭,False
6,호놀룰루,[호놀룰루_NNP],호놀룰루_NNP,정치사회밭,False
7,영양죽,"[영_NNG, 양죽_NNG]",영_NNG 양죽_NNG,잡담밭,False
8,영양죽,"[영_NNG, 양죽_NNG]",영_NNG 양죽_NNG,연예인밭,False
9,fakebook,[fakebook_SL],fakebook_SL,잡담밭,False
10,그나,[그나_MAJ],그나_MAJ,잡담밭,False


### forumid와 author를 LabelEncoder로 숫자로 변환

In [4]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()

labeled_train["forumid"] = le.fit_transform(labeled_train["forumid"])
labeled_train["author"] = le.fit_transform(labeled_train["author"])

labeled_train

Unnamed: 0_level_0,author,author_pos,author_pos_sentences,forumid,istroll
pk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,49,"[세부_NNG, 이장_NNG]",세부_NNG 이장_NNG,3,False
2,22,[관리자_NNG],관리자_NNG,5,False
3,11,[snoopy_SL],snoopy_SL,5,False
4,2,[ISCARIOT_SL],ISCARIOT_SL,6,False
5,0,[1234_SN],1234_SN,4,False
6,68,[호놀룰루_NNP],호놀룰루_NNP,6,False
7,57,"[영_NNG, 양죽_NNG]",영_NNG 양죽_NNG,5,False
8,57,"[영_NNG, 양죽_NNG]",영_NNG 양죽_NNG,4,False
9,7,[fakebook_SL],fakebook_SL,5,False
10,24,[그나_MAJ],그나_MAJ,5,False


### bag of words

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

def make_bag_of_words(labeled_train, max_features, col_name):
    vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, preprocessor = None,
                                stop_words = None, max_features=max_features)
    
    train_data_features = vectorizer.fit_transform(labeled_train[col_name]).toarray()

    col = ["bow_%s_%s" % (col_name, data) for data in vectorizer.get_feature_names()]
    df_bow = pd.DataFrame(train_data_features, columns = col, index=labeled_train.index)
    
    labeled_train = pd.concat([labeled_train, df_bow],axis=1)
    
    return labeled_train

## LDA

In [6]:
import gensim
from gensim import corpora, models
from gensim.corpora import TextCorpus, MmCorpus, Dictionary
from nltk.corpus import stopwords
from optparse import OptionParser

def make_lda(train, keep_n, num_topics, col_name):

    data = train[col_name]

    dictionary = corpora.Dictionary(data)
    dictionary.filter_extremes(keep_n=keep_n)

    corpus = [dictionary.doc2bow(text) for text in data]

    print("Make Lda..")

    lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, chunksize=1000, passes=1)
    
    num = len(train)    
    df = []
    
    for i in range(0,num):
        if i % 10000 == 0:
            print(i)
        
        temp = [i[1] for i in lda.get_document_topics(corpus[i],minimum_probability=0)]
        df.append(temp)
    
    col = ["lda_%s_%d" % (col_name, data) for data in range(0, num_topics)]
    df = pd.DataFrame(df, columns = col)
    df.index = train.index
    
    train = pd.concat([train, df], axis=1)
        
    return train

In [7]:
max_features = 1000
#%time labeled_train = make_lda(labeled_train, 1000, 200, "author_pos")
%time labeled_train = make_bag_of_words(labeled_train, 1000, "author_pos_sentences")

CPU times: user 6.05 ms, sys: 884 µs, total: 6.94 ms
Wall time: 6.31 ms


### predictor, model 생성

In [8]:
from sklearn.ensemble import RandomForestClassifier

label = 'istroll'
pre = labeled_train.columns.drop(['author_pos', 'author_pos_sentences', 'author', label])

model = RandomForestClassifier(n_estimators=10, n_jobs=-1)

### cross validation

In [18]:
from sklearn import cross_validation

cv_value = 3

scores = cross_validation.cross_val_score(model, labeled_train[pre], labeled_train[label], cv=cv_value, scoring="roc_auc")

cv_result = scores.mean()

print(cv_result)

  args, varargs, kw, default = inspect.getargspec(init)
  args, varargs, kw, default = inspect.getargspec(init)
  args, varargs, kw, default = inspect.getargspec(init)
  args, varargs, kw, default = inspect.getargspec(init)
  args, varargs, kw, default = inspect.getargspec(init)
  args, varargs, kw, default = inspect.getargspec(init)
  args, varargs, kw, default = inspect.getargspec(init)
  args, varargs, kw, default = inspect.getargspec(init)
  args, varargs, kw, default = inspect.getargspec(init)
  args, varargs, kw, default = inspect.getargspec(init)
  args, varargs, kw, default = inspect.getargspec(init)
  args, varargs, kw, default = inspect.getargspec(init)
  args, varargs, kw, default = inspect.getargspec(init)
  args, varargs, kw, default = inspect.getargspec(init)
  args, varargs, kw, default = inspect.getargspec(init)
  args, varargs, kw, default = inspect.getargspec(init)
  args, varargs, kw, default = inspect.getargspec(init)
  args, varargs, kw, default = inspect.getargspe

IndexError: index 1 is out of bounds for axis 1 with size 1

## Result

 * author를 Label Encoder로 변환: 0.67331271058
 * author의 morphs를 Bag of Words(feature = 1,000 개)로 변환: 0.733333689652
 * author의 pos를 Bag of Words(feature = 1,000 개)로 변환: 0.769833271386
 * 데이터를 shuffle하고 pos를 Bag of Words(feature = 1,000 개)로 변환: 0.81172740086
 
 
 * author의 pos를 LDA로 변환 (num_topics = 200, keep_n = 1,000) : 0.748718319834
 * 데이터를 shuffle하고 author의 pos를 LDA로 변환 (num_topics = 200, keep_n = 1,000) : 0.772313052394

In [19]:
import pickle

pre = labeled_train.columns.drop(['author_pos', 'author_pos_sentences', 'author'])

pickle.dump(labeled_train[pre], open("author, forum_%d.p" % max_features, "wb"), protocol = pickle.HIGHEST_PROTOCOL)