### json 불러와서 저장하기

In [1]:
import numpy as np
import pandas as pd
import json
import jpype
import glob
from random import shuffle
from bs4 import BeautifulSoup as bs

# Directory that holds the article dumps (one or more *.json files).
data_path = "data"
#data_path = "data(test)"

file_list = glob.glob("%s/*.json" % data_path)

# Randomise the order in which the dump files are read.
shuffle(file_list)

# Flat list of every record found under each file's "articles" key.
json_train = []

for json_file_name in file_list:
    # The context manager closes each handle as soon as it has been parsed;
    # the previous open(...).read() left every file open until garbage
    # collection. JSON is read explicitly as UTF-8 rather than relying on
    # the platform default encoding.
    with open(json_file_name, encoding="utf-8") as json_fp:
        json_file = json.load(json_fp)
    json_train.extend(json_file["articles"])

### json에서 title 형태소 추출

In [2]:
from konlpy.tag import Mecab

mecab = Mecab()

# One dict per article; turned into a DataFrame once every title is tagged.
labeled_train = []

for position, entry in enumerate(json_train):

    # Progress marker every 10,000 articles.
    if position % 10000 == 0:
        print(position)

    # Each morpheme of the title is encoded as "<surface>_<tag>".
    tagged_tokens = []
    for surface, tag in mecab.pos(entry["title"]):
        tagged_tokens.append("%s_%s" % (surface, tag))

    record = {
        "istroll": entry["is_troll"],
        "title_pos": tagged_tokens,
        "title_pos_sentences": " ".join(tagged_tokens),
        "pk": entry["pk"],
    }
    labeled_train.append(record)

# Index the frame by the article primary key.
labeled_train = pd.DataFrame.from_dict(labeled_train).set_index('pk')

labeled_train

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000


KeyboardInterrupt: 

In [None]:
# Name of the most recently applied feature model ("bow", "lda", "word2vec"
# or "tf-idf"). Every make_* helper overwrites it through `global`, and the
# pickle cell at the end uses it as the output directory name.
used_model = ""

### bag of words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def make_bag_of_words(labeled_train, max_features, col_name):
    """Append bag-of-words count columns built from one text column.

    Side effect: sets the module-level ``used_model`` to ``"bow"``.

    Args:
        labeled_train: DataFrame that contains ``col_name``.
        max_features: forwarded to ``CountVectorizer(max_features=...)``.
        col_name: column whose values are passed to the vectorizer as
            documents.

    Returns:
        A new DataFrame: the input columns followed by one
        ``bow_<col_name>_<term>`` column per vocabulary term. The input
        frame itself is not modified.
    """
    global used_model
    used_model = "bow"

    vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None,
                                 stop_words=None, max_features=max_features)

    train_data_features = vectorizer.fit_transform(labeled_train[col_name]).toarray()

    # get_feature_names() no longer exists in recent scikit-learn releases;
    # prefer its replacement and fall back for older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    col = ["bow_%s_%s" % (col_name, term) for term in terms]
    df_bow = pd.DataFrame(train_data_features, columns=col, index=labeled_train.index)

    return pd.concat([labeled_train, df_bow], axis=1)

## LDA

In [None]:
import gensim
from gensim import corpora, models
from gensim.corpora import TextCorpus, MmCorpus, Dictionary
from nltk.corpus import stopwords
from optparse import OptionParser

def make_lda(train, keep_n, num_topics, col_name):
    """Append per-document LDA topic probabilities as new columns.

    Side effect: sets the module-level ``used_model`` to ``"lda"`` and
    prints progress while the topic rows are collected.

    Args:
        train: DataFrame that contains ``col_name``.
        keep_n: forwarded to ``Dictionary.filter_extremes(keep_n=...)``.
        num_topics: number of LDA topics, and of columns added.
        col_name: column whose values are fed to ``corpora.Dictionary`` and
            ``doc2bow``.

    Returns:
        A new DataFrame: the input columns followed by
        ``lda_<col_name>_<k>`` for ``k`` in ``range(num_topics)``.
    """
    global used_model
    used_model = "lda"

    documents = train[col_name]

    vocabulary = corpora.Dictionary(documents)
    vocabulary.filter_extremes(keep_n=keep_n)

    corpus = [vocabulary.doc2bow(tokens) for tokens in documents]

    print("Make Lda..")

    lda = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=vocabulary,
        num_topics=num_topics,
        chunksize=1000,
        passes=1,
    )

    topic_rows = []
    for row_no, bow in enumerate(corpus):
        # Progress marker every 10,000 documents.
        if row_no % 10000 == 0:
            print(row_no)

        # minimum_probability=0 is passed so low-probability topics are
        # not filtered out of the result.
        topic_pairs = lda.get_document_topics(bow, minimum_probability=0)
        topic_rows.append([pair[1] for pair in topic_pairs])

    col = ["lda_%s_%d" % (col_name, topic_no) for topic_no in range(num_topics)]
    topic_frame = pd.DataFrame(topic_rows, columns=col)
    topic_frame.index = train.index

    return pd.concat([train, topic_frame], axis=1)

## Word2Vec

In [None]:
from gensim.models import word2vec

def make_feature_vec(words, model, num_features, index2word_set=None):
    """Return the mean embedding of the in-vocabulary words of one document.

    Args:
        words: iterable of tokens.
        model: object exposing ``index2word`` (the vocabulary) and
            ``model[word]`` (the vector for a word).
        num_features: length of the returned vector.
        index2word_set: optional pre-built ``set(model.index2word)``. Pass it
            when calling in a loop so the set is not rebuilt for every
            document; when omitted it is built here, as before.

    Returns:
        A 1-D array of length ``num_features``; all zeros when none of the
        words is in the vocabulary.
    """
    if index2word_set is None:
        index2word_set = set(model.index2word)

    feature_vec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    for word in words:
        if word in index2word_set:
            nwords += 1
            feature_vec = np.add(feature_vec, model[word])

    # Skip the division when nothing matched to avoid dividing by zero.
    if nwords:
        feature_vec = np.divide(feature_vec, nwords)

    return feature_vec


def get_avg_feature_vecs(texts, model, num_features):
    """Stack the mean embedding of every document into one matrix.

    Prints the running index every 10,000 documents.

    Args:
        texts: sized iterable of documents, each an iterable of tokens.
        model: see ``make_feature_vec``.
        num_features: embedding width.

    Returns:
        A float32 array of shape ``(len(texts), num_features)``.
    """
    text_feature_vecs = np.zeros((len(texts), num_features), dtype="float32")

    # Build the vocabulary set once instead of once per document.
    vocabulary = set(model.index2word)

    for i, text in enumerate(texts):

        if i % 10000 == 0:
            print(i)

        text_feature_vecs[i] = make_feature_vec(text, model, num_features, vocabulary)

    return text_feature_vecs

def make_word2vec(train, col_name, max_features):
    """Append averaged Word2Vec embedding columns for one text column.

    Side effect: sets the module-level ``used_model`` to ``"word2vec"``.

    Args:
        train: DataFrame that contains ``col_name``.
        col_name: column holding either a list of tokens per row or a
            whitespace-joined token string per row.
        max_features: embedding dimensionality, and number of columns added.

    Returns:
        A new DataFrame: the input columns followed by
        ``word2vec_<col_name>_<k>`` for ``k`` in ``range(max_features)``.
    """
    from multiprocessing import cpu_count

    global used_model
    used_model = "word2vec"

    num_features = max_features
    min_word_count = 40
    # The worker count must be positive: unlike scikit-learn's n_jobs, -1
    # does not mean "all cores" here and leaves gensim with no training
    # threads.
    num_workers = cpu_count()
    context = 10
    downsampling = 1e-3

    # Word2Vec expects an iterable of token lists. The previous code joined
    # the whole corpus into a single string, so both training and averaging
    # iterated over individual characters instead of tokens.
    token_lists = [
        doc.split() if hasattr(doc, "split") else list(doc)
        for doc in train[col_name]
    ]

    model = word2vec.Word2Vec(token_lists, workers=num_workers, size=num_features,
                              min_count=min_word_count, window=context, sample=downsampling)

    col = ["word2vec_%s_%d" % (col_name, data) for data in range(0, num_features)]

    train_feature = get_avg_feature_vecs(token_lists, model, num_features)
    train_feature = pd.DataFrame(train_feature, index=train.index, columns=col)

    return pd.concat([train, train_feature], axis=1)

## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def make_tf_idf(train, max_features, col_name):
    """Append TF-IDF weight columns built from one text column.

    Side effect: sets the module-level ``used_model`` to ``"tf-idf"``.

    Args:
        train: DataFrame that contains ``col_name``.
        max_features: forwarded to ``TfidfVectorizer(max_features=...)``.
        col_name: column whose values are passed to the vectorizer as
            documents.

    Returns:
        A new DataFrame: the input columns followed by one
        ``tfidf_<col_name>_<term>`` column per vocabulary term. The input
        frame itself is not modified.
    """
    global used_model
    used_model = "tf-idf"

    vectorizer = TfidfVectorizer(min_df=1, max_features=max_features)

    train_data_features = vectorizer.fit_transform(train[col_name]).toarray()

    # get_feature_names() no longer exists in recent scikit-learn releases;
    # prefer its replacement and fall back for older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    col = ["tfidf_%s_%s" % (col_name, term) for term in terms]
    df_tfidf = pd.DataFrame(train_data_features, columns=col, index=train.index)

    return pd.concat([train, df_tfidf], axis=1)

In [None]:
# Feature width handed to the extractor below; the pickle cell at the end
# also embeds it in the output file name.
max_features = 1000
# Exactly one extractor is meant to be active at a time; the commented lines
# are the alternatives that were tried (LDA, bag of words, TF-IDF).
#%time labeled_train = make_lda(labeled_train, 5000, 1000, "title_pos")
#%time labeled_train = make_bag_of_words(labeled_train, 300, "title_pos_sentences")
#labeled_train = make_tf_idf(labeled_train, max_features, "title_pos_sentences")
# NOTE(review): "title_pos_sentences" holds one space-joined string per row,
# while make_word2vec joins each row's value again; confirm the intended
# column ("title_pos" holds the token lists).
labeled_train = make_word2vec(labeled_train,"title_pos_sentences",max_features)

### predictor, model 생성

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Target column (filled from each article's "is_troll" field).
label = 'istroll'
# Predictor columns: everything except the token list, its joined string
# form and the target itself, i.e. the generated feature columns.
pre = labeled_train.columns.drop(['title_pos', 'title_pos_sentences', label])

# Random forest with 10 trees; shared by the cross-validation cell below.
model = RandomForestClassifier(n_estimators=10, n_jobs=-1)

### cross validation

In [None]:
# sklearn.cross_validation has been removed from current scikit-learn;
# cross_val_score now lives in sklearn.model_selection. Fall back to the
# legacy module only on old installations.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# Number of cross-validation folds.
cv_value = 3

# ROC-AUC per fold for the random forest on the generated feature columns.
scores = cross_val_score(model, labeled_train[pre], labeled_train[label], cv=cv_value, scoring="roc_auc")
cv_result = scores.mean()

print(cv_result)

## Result
  * title의 morphs를 bag of words로 변환 (feature : 1,000개) - 0.612752193303
  * title의 pos를 bag of words로 변환 (feature : 1,000개) - 0.630053006037
  * 데이터를 셔플하고 title의 pos를 bag of words로 변환 (feature : 1,000개) - 0.658894903487
  * 데이터를 셔플하고 title의 pos를 tf-idf로 변환 (feature : 1,000개) - 0.648875263729
  
  * title의 pos를 LDA로 변환 (keep_n : 5,000, num_topics : 1,000) - 0.592446285012

### Train Accuracy와 Cross Validation Accuracy 측정 후 그래프 비교

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# KFold moved to sklearn.model_selection and its signature changed
# (n_splits + .split(X) instead of KFold(n, n_folds=...)); support both.
try:
    from sklearn.model_selection import KFold
except ImportError:
    from sklearn.cross_validation import KFold
    _KFOLD_HAS_SPLIT = False
else:
    _KFOLD_HAS_SPLIT = True


def calcul_score(labeled_train):
    """Return mean train and validation ROC-AUC over a shuffled 3-fold split.

    A 10-tree random forest is fitted on each training fold and scored on
    both that fold and the held-out fold. The two means are also printed.

    Args:
        labeled_train: DataFrame containing the ``istroll`` target, the
            ``author_pos``, ``author_pos_sentences`` and ``author`` columns
            (all excluded from the predictors) and the feature columns.

    Returns:
        ``[train_score, cv_score]``, each the mean ROC-AUC over the folds.

    Raises:
        KeyError: from ``columns.drop`` if any excluded column is missing.
    """
    label = 'istroll'
    predictors = labeled_train.columns.drop([label, 'author_pos', 'author_pos_sentences', 'author'])

    features = labeled_train[predictors]
    target = labeled_train[label]

    alg = RandomForestClassifier(n_estimators=10, n_jobs=-1)

    if _KFOLD_HAS_SPLIT:
        folds = KFold(n_splits=3, shuffle=True).split(labeled_train)
    else:
        folds = KFold(len(labeled_train), n_folds=3, shuffle=True)

    train_score = []
    cv_score = []
    for train_index, test_index in folds:
        # KFold yields positional indices, so select rows with .iloc. The
        # previous label round-trip via .loc[index[...]] picks extra rows
        # whenever the frame's index contains duplicate keys.
        train_x, train_y = features.iloc[train_index], target.iloc[train_index]
        test_x, test_y = features.iloc[test_index], target.iloc[test_index]

        alg.fit(train_x, train_y)

        # Column 1 of predict_proba is the score passed to roc_auc_score.
        train_score.append(roc_auc_score(train_y, alg.predict_proba(train_x)[:, 1]))
        cv_score.append(roc_auc_score(test_y, alg.predict_proba(test_x)[:, 1]))

    train_score = np.mean(train_score)
    cv_score = np.mean(cv_score)

    print("train score = %s\ntest score = %s" % (train_score, cv_score))

    return [train_score, cv_score]

In [None]:
result = []
for i in range(1, 11):
    # %time bow_labeled_train = make_bag_of_words(labeled_train, 100 * i, "author_pos_sentences")
    %time labeled_train = make_lda(labeled_train, 1000, 100 * i, "author_pos")
    # %time labeled_train = make_word2vec(labeled_train, "author_pos", 100 * i)
    result.append(calcul_score(bow_labeled_train))

In [None]:
# Plotting setup for the score curves drawn in the next cell.
%matplotlib inline
import matplotlib
# NOTE(review): this requests the non-interactive "Agg" backend after the
# inline backend was already selected above; depending on the matplotlib
# version it is either ignored or replaces the inline backend, so confirm
# which one is intended.
matplotlib.use("Agg")
import matplotlib.pyplot as plt
# Turn interactive mode off.
plt.ioff()

In [None]:
# Feature sizes on the x axis: 100, 200, ..., 1000 (one per sweep iteration).
x = list(range(100, 1100, 100))

# Transpose once so row 0 holds the train scores and row 1 the CV scores.
score_curves = np.array(result).T

plt.plot(x, score_curves[0], label="train")
plt.plot(x, score_curves[1], label="cross validation")
plt.legend(loc=4)
plt.show()

### pickle로 파일 저장

In [None]:
import os
import pickle

# The output directory is named after the feature model that was last run
# (used_model is set by the make_* helpers).
if not os.path.exists(used_model):
    os.makedirs(used_model)

# Write the predictor columns only. The context manager flushes and closes
# the file; the previous inline open() left the handle to the garbage
# collector.
with open("%s/title_%d.p" % (used_model, max_features), "wb") as pickle_fp:
    pickle.dump(labeled_train[pre], pickle_fp, protocol=pickle.HIGHEST_PROTOCOL)