In [1]:
import MeCab
import numpy as np
import pandas as pd
import fasttext as ft
from glob import glob

In [2]:
# Load the news articles: take the "content" column of every CSV found under
# ../predict_data/<subdir>/. Sorting the paths makes the load order independent
# of the order in which the file system happens to list them.
paths = sorted(glob('../predict_data/*/*.csv'))
contents_list = [pd.read_csv(path, encoding='utf-8', index_col=0)["content"] for path in paths]

# Flatten into one list and remove duplicate texts.
# - Non-string entries are skipped: pandas reads empty cells as float NaN, and
#   the substring tests applied to these texts later raise TypeError on a float.
# - dict.fromkeys() keeps the first occurrence of each text in load order.
#   list(set(...)) returned the texts in an order that changes between kernel
#   restarts (string hash randomization), which made the output file's row
#   order non-reproducible.
contents = list(dict.fromkeys(
    c
    for inner_list in contents_list
    for c in inner_list
    if isinstance(c, str)
))

In [3]:
# Load the polarity dictionary; the first CSV column is used as the index.
pndic_df = pd.read_csv('../dic/pndic_2018-05-25.csv', encoding='utf-8', index_col=0)

# Convert to a dict mapping each entry of the 'Word' column to the matching
# entry of the 'PN' column (a later duplicate word overrides an earlier one).
word_list = [word for word in pndic_df['Word']]
pn_list = [pn for pn in pndic_df['PN']]
pn_dict = {word: pn for word, pn in zip(word_list, pn_list)}

In [4]:
# Keep only the news texts in which at least 15 distinct dictionary words
# occur as substrings (each word is counted once, however often it appears).
text_list = [
    text
    for text in contents
    if sum(1 for w in pn_dict if w in text) >= 15
]

In [5]:
# Scoring
def _pn_mean(text):
    """Return the mean of (PN value * occurrence count) over the dictionary words found in `text`."""
    weighted = [pn * text.count(w) for w, pn in pn_dict.items() if w in text]
    return np.mean(weighted)


# One score per selected text, in the same order as text_list.
pnmeans_list = [_pn_mean(text) for text in text_list]

In [6]:
# Word-segment (wakati-gaki) every selected text with a MeCab tagger
# created with the "-Owakati" option.
m = MeCab.Tagger("-Owakati")

# news_text[i] is the tagger output for text_list[i].
news_text = list(map(m.parse, text_list))

In [7]:
# Polarity-dictionary evaluation: pair each dictionary score with its
# segmented text, then order the rows by ascending score.
score_columns = ['dic_score', 'text']
dic_df = pd.DataFrame(
    dict(zip(score_columns, (pnmeans_list, news_text))),
    columns=score_columns,
).sort_values(by='dic_score')

In [10]:
# Split into positive / negative examples by the sign of the dictionary score.
# Rows whose score is exactly 0 (or NaN) match neither condition and are dropped.
# .copy() turns each filtered subset into an independent frame, so adding the
# 'label' column below does not write into a slice of dic_df (the original
# assignment triggered pandas' SettingWithCopyWarning).
posi_df = dic_df[dic_df.dic_score > 0].copy()
nega_df = dic_df[dic_df.dic_score < 0].copy()

# One label per row: "P" for positive texts, "N" for negative texts.
# The row counts are taken from the frames directly instead of rebinding
# `text_list`; reusing that name here overwrote the list of selected raw texts
# that the scoring and segmentation cells read, so re-running them after this
# cell silently produced different results.
p_label = ["P"] * len(posi_df)
posi_df['label'] = p_label

n_label = ["N"] * len(nega_df)
nega_df['label'] = n_label

# Stack positives above negatives and keep only the text and label columns.
concat_df = pd.concat([posi_df, nega_df])
concat_df = concat_df.drop("dic_score", axis=1)

# Renumber the index from 0 and write the labelled data set to disk.
re_df = concat_df.reset_index(drop=True)
re_df.to_csv("test_news.csv")