In [19]:
import pandas as pd
df = pd.read_csv("credibility_dataset.csv")
X = df[['body', 'title']]
y = df['label'].tolist()

In [36]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

pipe = Pipeline([('tfidf',TfidfVectorizer(min_df=3, max_df=10)), ("clf", LogisticRegression())])
np.mean(cross_val_score(pipe, X["title"], y, cv=10)), np.mean(cross_val_score(pipe, X["body"], y, cv=10))

(0.866622646802691, 0.9214263887346552)

In [32]:
import tensorflow as tf
import tensorflow_hub as hub

def texts_encoder(texts):
    with tf.Graph().as_default():
        embed = hub.Module("https://tfhub.dev/google/nnlm-ja-dim128/1")
        embeddings = embed(texts)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.tables_initializer())
            result = sess.run(embeddings)
    return result


class  MeanEmbeddingVectorizer:
    def __init__(self):
        pass

    def fit(self, X, y):
        return self

    def transform(self, X):
        if isinstance(X, pd.core.series.Series):
            X = X.tolist()
        return texts_encoder(X)

In [10]:
type(X['title'])

pandas.core.series.Series

In [37]:
pipe = Pipeline([('w2v',MeanEmbeddingVectorizer()), ("clf", LogisticRegression())])
np.mean(cross_val_score(pipe, X["title"], y, cv=10)), np.mean(cross_val_score(pipe, X["body"], y, cv=10))

INFO:tensorflow:Initialize variable module/embeddings/part_0:0 from checkpoint b'/tmp/tfhub_modules/22c95b8e6878e472b6647f65afb10d88cafc52e0/variables/variables' with embeddings
INFO:tensorflow:Initialize variable module/embeddings/part_0:0 from checkpoint b'/tmp/tfhub_modules/22c95b8e6878e472b6647f65afb10d88cafc52e0/variables/variables' with embeddings
INFO:tensorflow:Initialize variable module/embeddings/part_0:0 from checkpoint b'/tmp/tfhub_modules/22c95b8e6878e472b6647f65afb10d88cafc52e0/variables/variables' with embeddings
INFO:tensorflow:Initialize variable module/embeddings/part_0:0 from checkpoint b'/tmp/tfhub_modules/22c95b8e6878e472b6647f65afb10d88cafc52e0/variables/variables' with embeddings
INFO:tensorflow:Initialize variable module/embeddings/part_0:0 from checkpoint b'/tmp/tfhub_modules/22c95b8e6878e472b6647f65afb10d88cafc52e0/variables/variables' with embeddings
INFO:tensorflow:Initialize variable module/embeddings/part_0:0 from checkpoint b'/tmp/tfhub_modules/22c95b8e68

(0.9849258120637222, 0.9846317732046735)

In [18]:
class  TfIdfWord2VecVectorizer:
    def __init__(self):
        pass

    def fit(self, X, y):
        self.tfidf_model = TfidfVectorizer(min_df=3, max_df=10).fit(X)
        return self

    def transform(self, X):
        return self.tfidf_and_w2v(X, self.tfidf_model)
    
    def texts_encoder(self, texts):
        with tf.Graph().as_default():
            embed = hub.Module("https://tfhub.dev/google/nnlm-ja-dim128/1")
            embeddings = embed(texts)
            with tf.Session() as sess:
                sess.run(tf.global_variables_initializer())
                sess.run(tf.tables_initializer())
                result = sess.run(embeddings)
        return result
    
    def tfidf_and_w2v(self, X, tfidf_model):
        tmp = tfidf_model.transform(X)
        a = tmp.toarray()
        if isinstance(X, pd.core.series.Series):
            X = X.tolist()
        b = self.texts_encoder(X)
        return np.hstack((a, b))

In [75]:
pipe = Pipeline([('vector',TfIdfWord2VecVectorizer()), ("clf", LogisticRegression())])
np.mean(cross_val_score(pipe, X["title"], y, cv=10)), np.mean(cross_val_score(pipe, X["body"], y, cv=10))

INFO:tensorflow:Initialize variable module/embeddings/part_0:0 from checkpoint b'/tmp/tfhub_modules/22c95b8e6878e472b6647f65afb10d88cafc52e0/variables/variables' with embeddings
INFO:tensorflow:Initialize variable module/embeddings/part_0:0 from checkpoint b'/tmp/tfhub_modules/22c95b8e6878e472b6647f65afb10d88cafc52e0/variables/variables' with embeddings
INFO:tensorflow:Initialize variable module/embeddings/part_0:0 from checkpoint b'/tmp/tfhub_modules/22c95b8e6878e472b6647f65afb10d88cafc52e0/variables/variables' with embeddings
INFO:tensorflow:Initialize variable module/embeddings/part_0:0 from checkpoint b'/tmp/tfhub_modules/22c95b8e6878e472b6647f65afb10d88cafc52e0/variables/variables' with embeddings
INFO:tensorflow:Initialize variable module/embeddings/part_0:0 from checkpoint b'/tmp/tfhub_modules/22c95b8e6878e472b6647f65afb10d88cafc52e0/variables/variables' with embeddings
INFO:tensorflow:Initialize variable module/embeddings/part_0:0 from checkpoint b'/tmp/tfhub_modules/22c95b8e68

(0.9878243676588914, 0.9863709085099875)

In [21]:
import tensorflow as tf
import tensorflow_hub as hub

pipe = Pipeline([('vector',TfIdfWord2VecVectorizer()), ("clf", LogisticRegression())])
pipe.fit(X['body'], y)

  from ._conv import register_converters as _register_converters


INFO:tensorflow:Using /tmp/tfhub_modules to cache modules.
INFO:tensorflow:Initialize variable module/embeddings/part_0:0 from checkpoint b'/tmp/tfhub_modules/22c95b8e6878e472b6647f65afb10d88cafc52e0/variables/variables' with embeddings


Pipeline(memory=None,
     steps=[('vector', <__main__.TfIdfWord2VecVectorizer object at 0x7f6456f95ba8>), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [22]:
import pickle
with open("example_model.pkl", "wb") as f:
    pickle.dump(pipe, f)

In [1]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pickle

In [23]:
with open("example_model.pkl", "rb") as f:
    clf = pickle.load(f)

In [7]:
import pandas as pd
test_data = pd.read_csv("test_data.csv")

In [8]:
test_data.head()

Unnamed: 0,body,label,title,url
0,本因坊 文裕 （ もん ゆう ） （ ２ ９ ） ＝ 井山 裕 太 九 段 ＝ が 、 挑戦...,False,読み の 文裕 、 貫禄 ７ 連覇 達成 \n,http://mainichi.jp/articles/20180702/k00/00m/0...
1,下野 市長 （ 栃木 県 ） 広瀬 寿雄 氏 （ ５ ９ ） ＝ 無 現 ［ 自 ］ ［ 公...,False,広瀬 寿雄 氏 が 無 投票 で ４ 選 \n,http://mainichi.jp/senkyo/articles/20180702/k0...
2,サッカー の スペイン 代表 Ｍ Ｆ イニエスタ （ ３ ４ ） が １ 日 、 ワールドカ...,False,イニエスタ が スペイン 代表 引退 表明 \n,http://mainichi.jp/articles/20180702/k00/00e/0...
3,自動車 の Ｆ１ シリーズ 第 ９ 戦 、 オーストリア ・ グランプリ （ ＧＰ ） は ...,False,フェルスタッペン 今季 初 Ｖ オーストリア ＧＰ \n,http://mainichi.jp/articles/20180702/k00/00m/0...
4,１ 日 の 東アジア 地域 包括 的 経済 連携 （ ＲＣＥＰ ） 閣僚 会合 は 、 １ ...,False,米 警戒 も 、 合意 実現 は 不透明 難航 分野 多く \n,http://mainichi.jp/articles/20180702/k00/00m/0...


In [24]:
y_pred = clf.predict(test_data['body'])

INFO:tensorflow:Initialize variable module/embeddings/part_0:0 from checkpoint b'/tmp/tfhub_modules/22c95b8e6878e472b6647f65afb10d88cafc52e0/variables/variables' with embeddings


In [29]:
from sklearn.metrics import roc_auc_score, accuracy_score
roc_auc_score(test_data['label'], y_pred), accuracy_score(test_data['label'], y_pred)

(0.7179506933744222, 0.6697205680256527)