### json 불러와서 저장하기

In [None]:
import numpy as np
import pandas as pd
import json
import jpype
import glob
from random import shuffle
from bs4 import BeautifulSoup as bs

# Directory holding the input *.json files; switch to "data" for the full set.
#data_path = "data"
data_path = "data(test)"

file_list = glob.glob("%s/*.json" % data_path)

# Flat list of every article dict found under the "articles" key of each file.
json_train = []

# Randomise file order so articles are not grouped by source file.
shuffle(file_list)
for json_file_name in file_list:
    # Use a context manager so each file handle is closed as soon as it has
    # been parsed (the previous open(...).read() left handles dangling).
    with open(json_file_name) as json_handle:
        json_file = json.load(json_handle)
    json_train.extend(json_file["articles"])

### json에서 author, author 형태소, forumid 추출

In [None]:
from konlpy.tag import Mecab

# Tagger whose pos() output is unpacked below as (word, pos) pairs.
mecab = Mecab()

# One dict per article; turned into a DataFrame once the loop is done.
labeled_train = []

for cnt, article in enumerate(json_train):
    # Progress indicator every 10,000 articles.
    if cnt % 10000 == 0:
        print(cnt)

    # Tag the author name and fuse each pair into a "word_pos" token.
    author_pos = ["%s_%s" % pair for pair in mecab.pos(article["author"])]

    labeled_train.append(dict(
        istroll=article["is_troll"],
        author=article["author"],
        author_pos=author_pos,
        # Space-joined form of the tokens, for the text vectorizers.
        author_pos_sentences=" ".join(author_pos),
        forumid=article["forumid"],
        pk=article["pk"],
    ))

# Rows become a frame indexed by the article primary key.
labeled_train = pd.DataFrame.from_dict(labeled_train).set_index('pk')

labeled_train

### forumid와 author를 label encoder로 숫자로 바꿔줌

In [None]:
from sklearn import preprocessing

# A single encoder is refit for each column in turn, so after this cell `le`
# only remembers the mapping of the last column ("author").
le = preprocessing.LabelEncoder()

# Replace the raw forum ids and author names with integer codes.
for column_name in ("forumid", "author"):
    labeled_train[column_name] = le.fit_transform(labeled_train[column_name])

labeled_train

In [None]:
# Name of the most recently applied feature builder ("bow", "lda", "word2vec"
# or "tf-idf"). Each make_* function overwrites it, and the export cell at the
# end of the notebook uses it as the output directory name.
used_model = ""

### bag of words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def make_bag_of_words(labeled_train, max_features, col_name):
    """Append bag-of-words count columns built from a text column.

    Sets the module-level ``used_model`` to ``"bow"``.

    Args:
        labeled_train: DataFrame containing ``col_name``.
        max_features: Upper bound on vocabulary size passed to CountVectorizer.
        col_name: Name of the column holding one text string per row.

    Returns:
        A new DataFrame: the input columns followed by one count column per
        vocabulary term, named ``bow_<col_name>_<term>``.
    """
    global used_model
    used_model = "bow"

    vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None,
                                 stop_words=None, max_features=max_features)

    # Dense array so the counts can be wrapped in a regular DataFrame.
    train_data_features = vectorizer.fit_transform(labeled_train[col_name]).toarray()

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back to the old name on older releases.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names

    col = ["bow_%s_%s" % (col_name, term) for term in get_names()]
    df_bow = pd.DataFrame(train_data_features, columns=col, index=labeled_train.index)

    return pd.concat([labeled_train, df_bow], axis=1)

## LDA

In [None]:
import gensim
from gensim import corpora, models
from gensim.corpora import TextCorpus, MmCorpus, Dictionary
from nltk.corpus import stopwords
from optparse import OptionParser

def make_lda(train, keep_n, num_topics, col_name):
    """Append per-document LDA topic probabilities as new columns.

    Sets the module-level ``used_model`` to ``"lda"``.

    Args:
        train: DataFrame containing ``col_name``.
        keep_n: Passed to ``Dictionary.filter_extremes`` as ``keep_n``.
        num_topics: Number of LDA topics, and of columns added.
        col_name: Column whose values are fed to the gensim dictionary
            (presumably one token list per row -- confirm against the caller).

    Returns:
        A new DataFrame: the input columns followed by ``num_topics`` columns
        named ``lda_<col_name>_<k>``.
    """
    global used_model
    used_model = "lda"

    documents = train[col_name]

    # Vocabulary, then one bag-of-words vector per document.
    dictionary = corpora.Dictionary(documents)
    dictionary.filter_extremes(keep_n=keep_n)
    corpus = [dictionary.doc2bow(tokens) for tokens in documents]

    print("Make Lda..")

    lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, chunksize=1000, passes=1)

    topic_rows = []
    for row_number, bow in zip(range(0, len(train)), corpus):
        # Progress indicator every 10,000 documents.
        if row_number % 10000 == 0:
            print(row_number)

        # minimum_probability=0 asks for every topic; keep only the second
        # element of each returned pair.
        topics = lda.get_document_topics(bow, minimum_probability=0)
        topic_rows.append([entry[1] for entry in topics])

    col = ["lda_%s_%d" % (col_name, topic_id) for topic_id in range(0, num_topics)]
    topic_frame = pd.DataFrame(topic_rows, columns=col)
    # Re-align with the source rows before concatenating side by side.
    topic_frame.index = train.index

    return pd.concat([train, topic_frame], axis=1)

## Word2Vec

In [None]:
from gensim.models import word2vec

def make_feature_vec(words, model, num_features):
    """Average the word vectors of the in-vocabulary items of ``words``.

    Works with both gensim API generations: when ``model`` exposes a ``wv``
    attribute the vectors are read from it, otherwise from ``model`` itself;
    the vocabulary list is taken from ``index_to_key`` when available and
    from the older ``index2word`` otherwise.

    Args:
        words: Iterable of tokens to look up.
        model: Word2Vec-like object supporting ``[token]`` lookup (directly
            or through ``model.wv``).
        num_features: Dimensionality of the word vectors.

    Returns:
        A 1-D array of length ``num_features``: the mean vector of all known
        tokens, or all zeros when no token is in the vocabulary.
    """
    vectors = getattr(model, "wv", model)

    vocabulary = getattr(vectors, "index_to_key", None)
    if vocabulary is None:
        vocabulary = vectors.index2word
    # Set membership keeps the per-token vocabulary test O(1).
    known_words = set(vocabulary)

    feature_vec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    for word in words:
        if word in known_words:
            nwords += 1
            feature_vec = np.add(feature_vec, vectors[word])

    # Skip the division when nothing matched, to avoid dividing by zero.
    if nwords != 0:
        feature_vec = np.divide(feature_vec, nwords)

    return feature_vec

def get_avg_feature_vecs(texts, model, num_features):
    """Build a matrix holding one averaged word vector per text.

    Args:
        texts: Sized iterable; each item is handed to ``make_feature_vec``.
        model: Word-vector model forwarded to ``make_feature_vec``.
        num_features: Dimensionality of the word vectors.

    Returns:
        A float32 array of shape ``(len(texts), num_features)`` whose row
        ``i`` is the averaged vector of ``texts[i]``.
    """
    averaged = np.zeros((len(texts), num_features), dtype="float32")

    for row, tokens in enumerate(texts):
        # Progress indicator every 10,000 texts.
        if row % 10000 == 0:
            print(row)

        averaged[row] = make_feature_vec(tokens, model, num_features)

    return averaged

def make_word2vec(train, col_name, max_features):
    """Append averaged Word2Vec embedding columns for a token-list column.

    Sets the module-level ``used_model`` to ``"word2vec"``.

    Args:
        train: DataFrame containing ``col_name``.
        col_name: Column holding one list of tokens per row (the notebook
            passes "author_pos", which is built as a list of "word_pos"
            strings).
        max_features: Dimensionality of the word vectors, and the number of
            columns added.

    Returns:
        A new DataFrame: the input columns followed by ``max_features``
        columns named ``word2vec_<col_name>_<k>``.
    """
    global used_model
    used_model = "word2vec"

    # Local import keeps this function self-contained.
    import multiprocessing

    num_features = max_features
    min_word_count = 40
    # gensim does not interpret -1 as "all cores" (unlike scikit-learn's
    # n_jobs); a non-positive value starts no worker threads, so pass the
    # real core count instead.
    num_workers = multiprocessing.cpu_count()
    context = 10
    downsampling = 1e-3

    # Word2Vec expects an iterable of token lists. The previous code joined
    # the whole corpus into one string, which made gensim iterate over
    # individual characters instead of tokens.
    sentences = train[col_name].tolist()

    # NOTE(review): `size` is the keyword used by gensim < 4.0; newer
    # releases call it `vector_size` -- adjust if gensim is upgraded.
    model = word2vec.Word2Vec(sentences, workers=num_workers, size=num_features,
                              min_count=min_word_count, window=context, sample=downsampling)

    col = ["word2vec_%s_%d" % (col_name, dim) for dim in range(0, num_features)]

    # Average over the same token lists the model was trained on (not over
    # joined strings, which would again be walked character by character).
    train_feature = get_avg_feature_vecs(sentences, model, num_features)
    train_feature = pd.DataFrame(train_feature, index=train.index, columns=col)

    return pd.concat([train, train_feature], axis=1)

## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def make_tf_idf(train, max_features, col_name):
    """Append TF-IDF weight columns built from a text column.

    Sets the module-level ``used_model`` to ``"tf-idf"``.

    Args:
        train: DataFrame containing ``col_name``.
        max_features: Upper bound on vocabulary size passed to TfidfVectorizer.
        col_name: Name of the column holding one text string per row.

    Returns:
        A new DataFrame: the input columns followed by one weight column per
        vocabulary term, named ``tfidf_<col_name>_<term>``.
    """
    global used_model
    used_model = "tf-idf"

    vectorizer = TfidfVectorizer(min_df=1, max_features=max_features)

    # Dense array so the weights can be wrapped in a regular DataFrame.
    train_data_features = vectorizer.fit_transform(train[col_name]).toarray()

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back to the old name on older releases.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names

    col = ["tfidf_%s_%s" % (col_name, term) for term in get_names()]
    df_tfidf = pd.DataFrame(train_data_features, columns=col, index=train.index)

    return pd.concat([train, df_tfidf], axis=1)

In [None]:
# Feature count handed to the active feature builder; also embedded in the
# name of the pickle file written at the end of the notebook.
max_features = 600
# Exactly one feature builder is enabled at a time; the others are kept
# commented out so they can be swapped in for comparison runs.
#%time labeled_train = make_lda(labeled_train, 1000, 200, "author_pos")
#%time labeled_train = make_bag_of_words(labeled_train, max_features, "author_pos_sentences")
labeled_train = make_word2vec(labeled_train, "author_pos", max_features)
#labeled_train = make_tf_idf(labeled_train, max_features, "author_pos_sentences")

### predictor, model 생성

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Target column for the classifier.
label = 'istroll'
# Predictor columns: everything except the raw/tokenised author fields, the
# encoded author and the target itself.
pre = labeled_train.columns.drop(['author_pos', 'author_pos_sentences', 'author', label])

# Random forest with 10 trees; n_jobs=-1 is forwarded to scikit-learn as-is.
model = RandomForestClassifier(n_estimators=10, n_jobs=-1)

### cross validation

In [None]:
# sklearn.cross_validation was removed in scikit-learn 0.20; its replacement,
# sklearn.model_selection, also provides cross_val_score. Bind it to the old
# name so the rest of the notebook keeps working, and fall back to the
# original module on releases that predate model_selection.
try:
    from sklearn import model_selection as cross_validation
except ImportError:
    from sklearn import cross_validation

# Number of cross-validation folds.
cv_value = 3

# ROC-AUC of the model on each fold, using the predictor columns in `pre`.
scores = cross_validation.cross_val_score(model, labeled_train[pre], labeled_train[label], cv=cv_value, scoring="roc_auc")

# Report the mean score across folds.
cv_result = scores.mean()

print(cv_result)

## Result

 * author를 Label Encoder로 변환: 0.67331271058
 * author의 morphs를 Bag of Words(feature = 1,000 개)로 변환: 0.733333689652
 * author의 pos를 Bag of Words(feature = 1,000 개)로 변환: 0.769833271386
 * 데이터를 shuffle하고 pos를 Bag of Words(feature = 1,000개)로 변환: 0.81172740086
 * 데이터를 shuffle하고 pos를 TF-IDF(feature = 1,000개)로 변환 : 0.804858083797
 
 
 * author의 pos를 LDA로 변환 (num_topics = 200, keep_n = 1,000) : 0.748718319834
 * 데이터를 shuffle하고 author의 pos를 LDA로 변환 (num_topics = 200, keep_n = 1,000) : 0.772313052394

In [None]:
import os
import pickle

# Columns to export: everything except the raw/tokenised author fields and
# the encoded author. Unlike the training predictors, the label is kept.
pre = labeled_train.columns.drop(['author_pos', 'author_pos_sentences', 'author'])

# Output directory is named after the feature builder that was applied.
if not os.path.exists(used_model):
    os.makedirs(used_model)

# Write through a context manager so the file is flushed and closed even if
# pickling fails (the previous bare open(...) was never closed).
with open("%s/author, forum_%d.p" % (used_model, max_features), "wb") as output_file:
    pickle.dump(labeled_train[pre], output_file, protocol=pickle.HIGHEST_PROTOCOL)