### json 불러와서 저장하기

In [None]:
import numpy as np
import pandas as pd
import json
import jpype
import glob
from random import shuffle
from bs4 import BeautifulSoup as bs

# Directory that is searched for *.json input files.
data_path = "data"
#data_path = "data(test)"

# Collect every JSON file in the directory and randomize the processing order.
file_list = glob.glob("%s/*.json" % data_path)
shuffle(file_list)

# Flatten the "articles" list of every file into a single list.
json_train = []
for json_file_name in file_list:
    # The context manager closes each handle promptly (the previous
    # open(...).read() left that to the garbage collector), and the explicit
    # UTF-8 decoding avoids depending on the platform's default encoding.
    with open(json_file_name, encoding="utf-8") as json_fp:
        json_file = json.load(json_fp)
    json_train += json_file["articles"]

### json에서 text 형태소 추출

In [None]:
from konlpy.tag import Mecab

# Part-of-speech tagger applied to every article body.
mecab = Mecab()

labeled_train = []

for cnt, article in enumerate(json_train):
    # Progress indicator: report every 10,000th article.
    if cnt % 10000 == 0:
        print(cnt)

    # Parse the article body as HTML and keep only its text content.
    text = bs(article["text"], "html.parser").text

    # One "<first>_<second>" token per pair returned by mecab.pos().
    text_pos = ["%s_%s" % (morph, tag) for morph, tag in mecab.pos(text)]

    # Keep the tokens both as a list and as one space-joined string.
    labeled_train.append(dict(
        istroll=article["is_troll"],
        text_pos=text_pos,
        text_pos_sentences=" ".join(text_pos),
        pk=article["pk"],
    ))

# Turn the list of records into a DataFrame indexed by the article's "pk" value.
labeled_train = pd.DataFrame.from_dict(labeled_train).set_index('pk')

# Bare expression: displays the frame when run as a notebook cell.
labeled_train

In [None]:
# Name of the feature-extraction method most recently applied. Each make_*
# function in this file overwrites it ("bow", "lda" or "tf-idf"), and the
# final cell uses it as the output directory name.
used_model = ""

### bag of words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def make_bag_of_words(labeled_train, max_features, col_name):
    """Append bag-of-words count columns for ``col_name`` to ``labeled_train``.

    A ``CountVectorizer`` (word analyzer, vocabulary capped at
    ``max_features``) is fitted on ``labeled_train[col_name]``; the resulting
    matrix is converted to a dense array and concatenated to the input frame
    as columns named ``bow_<col_name>_<term>``.

    Side effect: sets the module-level ``used_model`` to ``"bow"``.

    Args:
        labeled_train: DataFrame containing the text column.
        max_features: Value passed to ``CountVectorizer(max_features=...)``.
        col_name: Name of the column holding one string per row.

    Returns:
        A new DataFrame with the original columns followed by the
        bag-of-words columns, sharing the input's index.
    """
    global used_model
    used_model = "bow"

    vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None,
                                 stop_words=None, max_features=max_features)

    # Dense conversion: the matrix becomes regular DataFrame columns below.
    train_data_features = vectorizer.fit_transform(labeled_train[col_name]).toarray()

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on installs that predate it.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()

    col = ["bow_%s_%s" % (col_name, data) for data in vocabulary]
    df_bow = pd.DataFrame(train_data_features, columns=col, index=labeled_train.index)

    labeled_train = pd.concat([labeled_train, df_bow], axis=1)

    return labeled_train

## LDA

In [None]:
import gensim
from gensim import corpora, models
from gensim.corpora import TextCorpus, MmCorpus, Dictionary
from nltk.corpus import stopwords
from optparse import OptionParser

def make_lda(train, keep_n, num_topics, col_name):
    """Append per-document LDA topic probabilities for ``col_name`` to ``train``.

    Builds a gensim ``Dictionary`` from ``train[col_name]``, prunes it with
    ``filter_extremes(keep_n=keep_n)``, trains an ``LdaModel`` with
    ``num_topics`` topics on the resulting bag-of-words corpus, and adds one
    column per topic named ``lda_<col_name>_<topic id>``.

    Side effect: sets the module-level ``used_model`` to ``"lda"`` and prints
    progress to stdout.

    Args:
        train: DataFrame containing the token column.
        keep_n: Value passed to ``Dictionary.filter_extremes(keep_n=...)``.
        num_topics: Number of LDA topics, and of feature columns added.
        col_name: Name of the column holding one token sequence per row.

    Returns:
        A new DataFrame with the original columns followed by the topic
        columns, sharing the input's index.
    """
    global used_model
    used_model = "lda"

    data = train[col_name]

    dictionary = corpora.Dictionary(data)
    dictionary.filter_extremes(keep_n=keep_n)

    corpus = [dictionary.doc2bow(text) for text in data]

    print("Make Lda..")

    lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, chunksize=1000, passes=1)

    rows = []
    for doc_idx, bow in enumerate(corpus):
        # Progress indicator: report every 10,000th document.
        if doc_idx % 10000 == 0:
            print(doc_idx)

        # get_document_topics() returns sparse (topic_id, probability) pairs.
        # Place each probability by its topic id instead of by position, so a
        # topic missing from the output cannot shift or shorten the row.
        row = [0.0] * num_topics
        for topic_id, prob in lda.get_document_topics(bow, minimum_probability=0):
            row[topic_id] = prob
        rows.append(row)

    col = ["lda_%s_%d" % (col_name, data) for data in range(0, num_topics)]
    df = pd.DataFrame(rows, columns=col, index=train.index)

    train = pd.concat([train, df], axis=1)

    return train

## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def make_tf_idf(train, max_features, col_name):
    """Append TF-IDF feature columns for ``col_name`` to ``train``.

    A ``TfidfVectorizer`` (``min_df=1``, vocabulary capped at
    ``max_features``) is fitted on ``train[col_name]``; the resulting matrix
    is converted to a dense array and concatenated to the input frame as
    columns named ``tfidf_<col_name>_<term>``.

    Side effect: sets the module-level ``used_model`` to ``"tf-idf"``.

    Args:
        train: DataFrame containing the text column.
        max_features: Value passed to ``TfidfVectorizer(max_features=...)``.
        col_name: Name of the column holding one string per row.

    Returns:
        A new DataFrame with the original columns followed by the TF-IDF
        columns, sharing the input's index.
    """
    global used_model
    used_model = "tf-idf"

    vectorizer = TfidfVectorizer(min_df=1, max_features=max_features)

    # Dense conversion: the matrix becomes regular DataFrame columns below.
    train_data_features = vectorizer.fit_transform(train[col_name]).toarray()

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on installs that predate it.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()

    col = ["tfidf_%s_%s" % (col_name, data) for data in vocabulary]
    df_tfidf = pd.DataFrame(train_data_features, columns=col, index=train.index)

    train = pd.concat([train, df_tfidf], axis=1)

    return train

In [None]:
# Vocabulary size cap handed to the vectorizer; the final cell also embeds it
# in the pickle file name.
max_features = 1000
# Alternative feature extractors (swap in for the call below to compare):
#%time labeled_train = make_lda(labeled_train, 5000, 1000, "text_pos")
#%time labeled_train = make_bag_of_words(labeled_train, 1000, "text_pos_sentences")
labeled_train = make_tf_idf(
    train=labeled_train,
    max_features=max_features,
    col_name="text_pos_sentences",
)

### predictor, model 생성

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Target column: the value copied from each article's "is_troll" field.
label = 'istroll'
# Predictor columns: every column except the two raw token columns and the target.
pre = labeled_train.columns.drop(['text_pos', 'text_pos_sentences', label])

# Random forest with 10 trees; n_jobs=-1 asks scikit-learn to use all processors.
model = RandomForestClassifier(n_estimators=10, n_jobs=-1)

### cross validation

In [None]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; cross_val_score now lives in sklearn.model_selection. The fallback
# keeps the cell working on installs that predate the move.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# Number of cross-validation folds.
cv_value = 3

# One ROC AUC score per fold, computed on the predictor columns against the target.
scores = cross_val_score(model, labeled_train[pre], labeled_train[label], cv=cv_value, scoring="roc_auc")
cv_result = scores.mean()

print(cv_result)

## Result
  * text의 morphs를 bag of words로 변환 (feature : 1,000개) - 0.637526911627
  * text의 pos를 bag of words로 변환 (feature : 1,000개) - 0.661265083065
  * 데이터를 셔플하고 text의 pos를 bag of words로 변환 (feature : 1,000개) - 0.688875771784
  
  * 데이터를 셔플하고 text의 pos를 tf-idf로 변환 (feature : 1,000개) - 0.683697553571
  
  * text의 pos를 lda로 변환 (keep_n : 5,000개, num_topics : 1,000개) - 0.643143549702
  * bs의 파라미터를 "lxml"이 아닌 "html.parser"로 주고 pos를 bag of words로 변환 (feature : 1,000개) - 0.657726939833

In [None]:
import os
import pickle

# The output directory is named after the feature method that was last applied.
if not os.path.exists(used_model):
    os.makedirs(used_model)

# Persist only the predictor columns. The context manager guarantees the file
# is flushed and closed even if pickling fails part-way (the handle was
# previously left for the garbage collector to close).
with open("%s/text_%d.p" % (used_model, max_features), "wb") as pickle_fp:
    pickle.dump(labeled_train[pre], pickle_fp, protocol=pickle.HIGHEST_PROTOCOL)