In [3]:
from gensim.models import Word2Vec
import pandas as pd

# Load the previously saved Word2Vec model; the embedding setup below reads its vocabulary and vectors.
model = Word2Vec.load("./model_Word2Vec")
# Load the full dataset; the functions below read its "title", "recovery", "articleID", "url" and "createTime" columns.
all_data = pd.read_csv("./no_cut_recovery_all.csv")

## 建立模型設置

In [4]:
from tensorflow.python.keras.layers import Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np

# Sequence length used when padding model inputs.
PADDING_LENGTH = 500

# (word, vector) pairs for every entry in the Word2Vec vocabulary.
vocab_list = [(word, model.wv[word]) for word, _ in model.wv.vocab.items()]

# One row per vocabulary word plus one extra row at index 0, which is never
# filled and therefore stays all-zero.
embedding_matrix = np.zeros((len(vocab_list) + 1, model.vector_size))
word2idx = {}

# Word indices start at 1 so that index 0 never refers to a real word.
for row, (word, vec) in enumerate(vocab_list, start=1):
    word2idx[word] = row
    embedding_matrix[row] = vec

# Embedding layer initialised from the Word2Vec vectors and excluded from training.
embedding_layer = Embedding(
    input_dim=embedding_matrix.shape[0],
    output_dim=embedding_matrix.shape[1],
    weights=[embedding_matrix],
    trainable=False,
)

#### mapping to index
def text_to_index(corpus, vocab_index=None):
    """Convert documents into sequences of vocabulary indices.

    Parameters
    ----------
    corpus : iterable of iterables
        Each document is iterated token by token (a plain string is therefore
        processed one character at a time).
    vocab_index : dict, optional
        Mapping from token to integer index. When omitted, the module-level
        ``word2idx`` mapping is used.

    Returns
    -------
    numpy.ndarray
        A regular array built from the nested index lists when every document
        has the same length (or the corpus is empty); otherwise a 1-D
        ``object`` array whose elements are the per-document index lists.
        Tokens missing from the mapping are encoded as 0.
    """
    if vocab_index is None:
        vocab_index = word2idx

    # dict.get handles unknown tokens explicitly instead of hiding every
    # possible error behind a bare ``except``.
    indexed = [[vocab_index.get(token, 0) for token in doc] for doc in corpus]

    # Equal-length documents (or an empty corpus) form a regular array.
    if len({len(doc) for doc in indexed}) <= 1:
        return np.array(indexed)

    # Documents of different lengths: build the object array explicitly.
    # np.array() on ragged nested lists is deprecated and raises ValueError
    # on recent NumPy releases.
    ragged = np.empty(len(indexed), dtype=object)
    for position, doc in enumerate(indexed):
        ragged[position] = doc
    return ragged

def new_model():
    """Build and compile a fresh, untrained classifier.

    The network stacks the shared ``embedding_layer``, a 128-unit LSTM with
    input and recurrent dropout of 0.2, and a 2-unit softmax output layer.
    It is compiled with the Adam optimizer, categorical cross-entropy loss
    and accuracy as the reported metric.
    """
    classifier = Sequential([
        embedding_layer,
        LSTM(128, dropout=0.2, recurrent_dropout=0.2),
        Dense(2, activation='softmax'),
    ])
    classifier.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return classifier

## 匯入模型

In [5]:
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Dropout, Activation, LSTM
import tensorflow as tf
# Load the saved model: the architecture is read from ./lstm.json and the
# weights from ./lstm.h5. The context manager closes the JSON file as soon as
# its content has been read, instead of leaving the handle open.
with open('./lstm.json', 'r') as json_file:
    lstm_loaded_model_json = json_file.read()
lstm = tf.keras.models.model_from_json(lstm_loaded_model_json)
lstm.load_weights('./lstm.h5')

## 建立篩選/計算結果

In [6]:
# Filter the data by a keyword in the title.
def create_select_data(data, select=None):
    """Return the rows of ``data`` whose ``title`` contains a keyword.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have a ``title`` column.
    select : str, optional
        Keyword to look for. When omitted, the keyword is read interactively
        with ``input()``.

    Returns
    -------
    pandas.DataFrame
        The matching rows in their original order, with a fresh 0..n-1 index.
    """
    if select is None:
        select = input()

    # Non-string titles (for example NaN from an empty CSV cell) cannot
    # contain the keyword; treat them as non-matching instead of failing.
    title_matches = [
        isinstance(title, str) and select in title for title in data["title"]
    ]
    # A positional boolean mask works for any index, not only the default 0..n-1.
    mask = pd.Series(title_matches, index=data.index, dtype=bool)

    return data[mask].reset_index(drop=True)

# Add a column holding the predicted label to data.
def preds_data(data):
    """Predict a label for every row and store it in ``Y_preds_label``.

    The ``recovery`` column is converted to index sequences with
    ``text_to_index``, padded to ``PADDING_LENGTH`` and passed to the loaded
    ``lstm`` model; the index of the highest-scoring output is the label.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have a ``recovery`` column. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, now with a ``Y_preds_label`` column.
    """
    # Nothing to predict (e.g. the keyword filter matched no rows): add an
    # empty label column and skip the model call, which may reject empty input
    # depending on the Keras version.
    if len(data) == 0:
        data["Y_preds_label"] = np.array([], dtype=np.int64)
        return data

    # Column -> plain Python list of documents.
    all_text = data["recovery"].tolist()

    # Encode, pad and predict.
    X_test = text_to_index(all_text)
    X_test = pad_sequences(X_test, maxlen=PADDING_LENGTH)
    Y_preds = lstm.predict(X_test)

    data["Y_preds_label"] = np.argmax(Y_preds, axis=1)

    return data


# Group the rows per article and count positive / negative predictions.
# The data passed in must already have a Y_preds_label column.

def create_count_positivedata_negativedata(data, min_total=3):
    """Aggregate predicted labels per article.

    The sum of ``Y_preds_label`` within an article is used as its positive
    count, so labels are expected to be 0/1 with 1 meaning positive.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have ``articleID``, ``url``, ``title``, ``createTime`` and
        ``Y_preds_label`` columns.
    min_total : int, optional
        Only articles with strictly more than this many labelled rows are
        returned. Defaults to 3.

    Returns
    -------
    pandas.DataFrame
        One row per article with the columns ``articleID``, ``url``, ``title``,
        ``createTime``, ``positive_total``, ``negative_total``, ``total``,
        ``positive_percent`` and ``negative_percent``. ``url``, ``title`` and
        ``createTime`` come from the first row of each article. The percent
        columns are fractions of ``total``.
    """
    labels_by_article = data.groupby("articleID")["Y_preds_label"]
    # Combine the two per-article Series into one frame.
    group = pd.DataFrame({'total': labels_by_article.count(),
                          'positive_total': labels_by_article.sum()})

    group['articleID'] = group.index
    group = group.reset_index(drop=True)

    # Number of negative predictions.
    group["negative_total"] = group["total"] - group["positive_total"]

    # Keep one metadata row per article *before* merging, so the merge yields
    # one row per article instead of one row per input row.
    article_info = data[['articleID', 'url', 'title', 'createTime']].drop_duplicates(
        subset='articleID', keep='first')
    count_all_data = pd.merge(group, article_info, on='articleID', how='left')
    count_all_data = count_all_data.reset_index(drop=True)

    # Fraction of positive / negative predictions per article.
    count_all_data["positive_percent"] = count_all_data["positive_total"] / count_all_data["total"]
    count_all_data["negative_percent"] = count_all_data["negative_total"] / count_all_data["total"]

    # Fixed column order for the result.
    count_all_data = count_all_data[['articleID', 'url', 'title', 'createTime',
                                     'positive_total', 'negative_total', 'total',
                                     'positive_percent', 'negative_percent']]

    return count_all_data[count_all_data["total"] > min_total]


# Produce two lists describing the three most positive articles.

def top3_positivedata(data):
    """Return the URLs and positive fractions of the top three rows.

    Rows are ranked by ``positive_percent`` in descending order; at most
    three are kept.

    Returns
    -------
    tuple of (list, list)
        The ``url`` values and the matching ``positive_percent`` values,
        both in ranking order.
    """
    ranked = data.nlargest(3, 'positive_percent')
    urls, shares = (ranked[column].tolist() for column in ("url", "positive_percent"))
    return urls, shares


# Produce two lists describing the three most negative articles.

def top3_negativedata(data):
    """Return the URLs and negative fractions of the top three rows.

    Rows are ranked by ``negative_percent`` in descending order; at most
    three are kept.

    Returns
    -------
    tuple of (list, list)
        The ``url`` values and the matching ``negative_percent`` values,
        both in ranking order.
    """
    ranked = data.nlargest(3, 'negative_percent')
    urls, shares = (ranked[column].tolist() for column in ("url", "negative_percent"))
    return urls, shares


# Run the whole pipeline in a single call.
def all_step(data):
    """Filter by title keyword, predict labels, aggregate and rank.

    Chains ``create_select_data`` (which asks for the keyword),
    ``preds_data``, ``create_count_positivedata_negativedata`` and the two
    top-3 helpers.

    Returns
    -------
    tuple
        ``(top3_positive_url, top3_positive_percent,
        top3_negative_url, top3_negative_percent)`` - four lists.
    """
    select_data = create_select_data(data)
    preds = preds_data(select_data)
    count = create_count_positivedata_negativedata(preds)

    # Each ranking helper returns a (urls, percents) pair; call it once and
    # unpack rather than recomputing the ranking for every element.
    top3_positive_url, top3_positive_percent = top3_positivedata(count)
    top3_negative_url, top3_negative_percent = top3_negativedata(count)

    return top3_positive_url, top3_positive_percent, top3_negative_url, top3_negative_percent

## 使用他

In [7]:
# Run the full pipeline on the whole dataset. The call waits on input() for a
# title keyword; the result is a 4-tuple of lists.
output = all_step(all_data)

信義


  return np.array(new_corpus)


## 輸出看看

In [8]:
# Unpack the four result lists once, then print each with its label.
positive_urls, positive_percents, negative_urls, negative_percents = output

print("positive url list：", positive_urls)
print("positive percent list：", positive_percents)

print("========================")

print("negative url list：", negative_urls)
print("negative percent list：", negative_percents)

positive url list： ['https://www.myhousing.com.tw//index.php?option=com_kunena&view=topic&catid=8&id=100493&Itemid=0', 'https://www.mobile01.com/topicdetail.php?f=454&t=6019348', 'https://www.mobile01.com/topicdetail.php?f=454&t=6030435']
positive percent list： [1.0, 0.6666666666666666, 0.5229357798165137]
negative url list： ['https://www.mobile01.com/topicdetail.php?f=454&t=6056969', 'https://www.mobile01.com/topicdetail.php?f=454&t=5943880', 'https://www.mobile01.com/topicdetail.php?f=454&t=5911768']
negative percent list： [0.8, 0.7727272727272727, 0.75]
