

####  그래프 구성
- 노드 : 뉴스기사 
- 엣지 : claim 칼럼으로 4개의 토픽 분류 후, 같은 토픽끼리 엣지로 연결

#### 피쳐
- claim (문서 임베딩 100차원)
- youtube0 (문서 임베딩 100차원)
- claim 길이, 주관값, 감성값 


#### 알고리즘
- GraphSAGE (https://arxiv.org/abs/1706.02216 )

- 데이터 불러오기

In [1]:
import pandas as pd

In [2]:
train = pd.read_csv('train.csv')

In [3]:
test = pd.read_csv('test.csv')

In [4]:
train.shape, test.shape

((5277, 16), (1856, 15))

In [5]:
test['label'] = np.nan

In [6]:
# train id 와 구분하기위해 5277 더해줌
test['id'] = test['id'] + 5277

In [7]:
df =pd.concat( [ train, test ], axis = 0).reset_index(drop = True)

- claim 기준 같은 주제끼리 엣지로 연결하기
- topic modeling 활용

In [18]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer

from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')
stemmer = SnowballStemmer('english')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\likeo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [20]:
doc_sample = df.claim[0]
doc_sample

'Did a Vermont Woman Post a Joke About Trump Bringing Back Slavery ?'

In [21]:
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['Did', 'a', 'Vermont', 'Woman', 'Post', 'a', 'Joke', 'About', 'Trump', 'Bringing', 'Back', 'Slavery', '?']


 tokenized and lemmatized document: 
['vermont', 'woman', 'post', 'joke', 'trump', 'bring', 'slaveri']


In [22]:
processed_docs = df['claim'].map(preprocess)


In [23]:
processed_docs

0       [vermont, woman, post, joke, trump, bring, sla...
1       [basebal, player, photograph, kneel, protest, ...
2       [wisconsin, state, john, nygren, hit, dirt, pr...
3       [progress, group, accus, senat, splinter, grou...
4       [seal, unusu, stripe, pattern, mark, spot, was...
                              ...                        
7128    [commentari, covid, diari, clinician, choic, s...
7129    [facebook, instagram, post, share, thousand, t...
7130                  [antiparasit, drug, kill, sar, day]
7131    [video, view, thousand, time, facebook, alongs...
7132    [whatsapp, messag, circul, south, sudan, claim...
Name: claim, Length: 7133, dtype: object

-  Bag of Words on the Data set


In [24]:
dictionary = gensim.corpora.Dictionary(processed_docs)



In [25]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 bring
1 joke
2 post
3 slaveri
4 trump
5 vermont
6 woman
7 basebal
8 kneel
9 lynch
10 photograph


In [27]:
dictionary.filter_extremes(no_below=1, no_above=0.8, keep_n=100000)

In [28]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(19, 1),
 (147, 1),
 (199, 1),
 (256, 1),
 (352, 1),
 (765, 1),
 (4048, 1),
 (5655, 1)]

In [37]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=4, id2word=dictionary, passes=50, workers=50)

In [38]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.018*"say" + 0.008*"florida" + 0.007*"sander" + 0.007*"democrat" + 0.007*"claim" + 0.006*"care" + 0.006*"health" + 0.006*"texa" + 0.006*"california" + 0.006*"right"
Topic: 1 
Words: 0.043*"coronavirus" + 0.026*"covid" + 0.015*"claim" + 0.013*"facebook" + 0.013*"video" + 0.012*"post" + 0.011*"show" + 0.011*"time" + 0.010*"share" + 0.009*"pandem"
Topic: 2 
Words: 0.067*"covid" + 0.014*"vaccin" + 0.009*"death" + 0.005*"state" + 0.005*"test" + 0.005*"studi" + 0.005*"peopl" + 0.005*"like" + 0.005*"pandem" + 0.004*"report"
Topic: 3 
Words: 0.054*"trump" + 0.027*"say" + 0.022*"donald" + 0.016*"clinton" + 0.013*"claim" + 0.011*"obama" + 0.011*"presid" + 0.010*"hillari" + 0.008*"state" + 0.007*"american"


In [40]:
def make_topictable_per_doc(ldamodel, corpus):
    topic_table = pd.DataFrame()

    # 몇 번째 문서인지를 의미하는 문서 번호와 해당 문서의 토픽 비중을 한 줄씩 꺼내온다.
    for i, topic_list in enumerate(ldamodel[corpus]):
        doc = topic_list[0] if ldamodel.per_word_topics else topic_list            
        doc = sorted(doc, key=lambda x: (x[1]), reverse=True)
        # 각 문서에 대해서 비중이 높은 토픽순으로 토픽을 정렬한다.
        # EX) 정렬 전 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (10번 토픽, 5%), (12번 토픽, 21.5%), 
        # Ex) 정렬 후 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (12번 토픽, 21.5%), (10번 토픽, 5%)
        # 48 > 25 > 21 > 5 순으로 정렬이 된 것.

        # 모든 문서에 대해서 각각 아래를 수행
        for j, (topic_num, prop_topic) in enumerate(doc): #  몇 번 토픽인지와 비중을 나눠서 저장한다.
            if j == 0:  # 정렬을 한 상태이므로 가장 앞에 있는 것이 가장 비중이 높은 토픽
                topic_table = topic_table.append(pd.Series([int(topic_num), round(prop_topic,4), topic_list]), ignore_index=True)
                # 가장 비중이 높은 토픽과, 가장 비중이 높은 토픽의 비중과, 전체 토픽의 비중을 저장한다.
            else:
                break
    return(topic_table)

In [42]:
topic_table = make_topictable_per_doc(lda_model, bow_corpus)

In [44]:
topic_table[0].value_counts()

3.0    2348
1.0    1721
2.0    1576
0.0    1488
Name: 0, dtype: int64

In [45]:
df['topic_category'] = topic_table[0]

In [46]:
df['topic_category'].isnull().any()

False

그래프 만들기

In [51]:
edge_list_topic0 = df[df['topic_category'] == 0].id.to_list()
edge_list_topic1 = df[df['topic_category'] == 1].id.to_list()
edge_list_topic2 = df[df['topic_category'] == 2].id.to_list()
edge_list_topic3 = df[df['topic_category'] == 3].id.to_list()

In [52]:
from itertools import combinations

edge_list_topic0 = list(combinations(edge_list_topic0, 2))
edge_list_topic1 = list(combinations(edge_list_topic1, 2))
edge_list_topic2 = list(combinations(edge_list_topic2, 2))
edge_list_topic3 = list(combinations(edge_list_topic3, 2))


In [54]:
import networkx as nx


In [55]:
#build the graph
G = nx.Graph()
G.add_edges_from(edge_list_topic0)
G.add_edges_from(edge_list_topic1)
G.add_edges_from(edge_list_topic2)
G.add_edges_from(edge_list_topic3)

In [57]:
G.add_nodes_from(df.id.tolist())

In [58]:
node_list = list(df.id)

In [59]:
print(nx.info(G))

Name: 
Type: Graph
Number of nodes: 7133
Number of edges: 6582866
Average degree: 1845.7496


In [None]:
nx.set_node_attributes(G, df.set_index('id')['claim'], 'claim')
nx.set_node_attributes(G, df.set_index('id')['published_date'], 'published_date')
nx.set_node_attributes(G, df.set_index('id')['keybert_keywords'], 'keybert_keywords')
nx.set_node_attributes(G, df.set_index('id')['ner_keywords'], 'ner_keywords')
nx.set_node_attributes(G, df.set_index('id')['youtube0'], 'youtube0')
nx.set_node_attributes(G, df.set_index('id')['youtube1'], 'youtube1')
nx.set_node_attributes(G, df.set_index('id')['youtube2'], 'youtube2')
nx.set_node_attributes(G, df.set_index('id')['youtube3'], 'youtube3')
nx.set_node_attributes(G, df.set_index('id')['youtube4'], 'youtube4')
nx.set_node_attributes(G, df.set_index('id')['youtube5'], 'youtube5')
nx.set_node_attributes(G, df.set_index('id')['youtube6'], 'youtube6')
nx.set_node_attributes(G, df.set_index('id')['youtube7'], 'youtube7')
nx.set_node_attributes(G, df.set_index('id')['youtube8'], 'youtube8')
nx.set_node_attributes(G, df.set_index('id')['youtube9'], 'youtube9')

In [61]:
nx.set_node_attributes(G, df.set_index('id')['label'], 'label')

In [62]:
nodelist = []

for n, d in G.nodes(data = True):
    node_data = dict()
    node_data['node'] = n
    node_data.update(d)
    nodelist.append(node_data)

node_from_G = pd.DataFrame(nodelist)

In [63]:
node_from_G.shape

(7133, 16)

### make feature

In [160]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import Word2Vec


In [68]:
node_from_G.columns

Index(['node', 'claim', 'published_date', 'keybert_keywords', 'ner_keywords',
       'youtube0', 'youtube1', 'youtube2', 'youtube3', 'youtube4', 'youtube5',
       'youtube6', 'youtube7', 'youtube8', 'youtube9', 'label'],
      dtype='object')

In [69]:
node_from_G.label.value_counts()

0.0    2641
1.0    2636
Name: label, dtype: int64

1. claim (문장 임베딩)

In [288]:
processed_claim = node_from_G['claim'].map(preprocess)

In [290]:
processed_claim.shape

(7133,)

In [289]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(processed_claim)]
model_doc = Doc2Vec(documents, vector_size=100, window=2, min_count=1, workers=4)

for i in range(len(df)):
    if i == 0 :
        claim_feature = pd.DataFrame(model_doc.infer_vector(processed_claim[i])).T
    else :
        claim_feature = pd.concat([claim_feature, pd.DataFrame(model_doc.infer_vector(processed_claim[i])).T])
        

In [292]:
claim_feature.shape

(7133, 100)

2. youtube0 (문장 임베딩)

In [328]:
for i in range(10):
    globals()[f"processed_youtube{i}".format(i)]  = node_from_G[f'youtube{i}'.format(i)].map(preprocess)


In [329]:
processed_youtube0.shape

(7133,)

In [330]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(processed_youtube0)]
model_doc = Doc2Vec(documents, vector_size=100, window=2, min_count=1, workers=4)

for i in range(len(df)):
    if i == 0 :
        you_feature0 = pd.DataFrame(model_doc.infer_vector(processed_youtube0[i])).T
    else :
        you_feature0 = pd.concat([you_feature0, pd.DataFrame(model_doc.infer_vector(processed_youtube0[i])).T])
        

In [315]:
you_feature0.shape

(7133, 100)

3. claim 길이, 감성, 주관성 값 feature 추가 

- len

In [317]:
claims_lenth = node_from_G['claim'].map(len)

In [318]:
df_claim_feature2 = pd.DataFrame(claims_lenth)
df_claim_feature2 = df_claim_feature2.rename(columns = {'body':'lenth'})

- textblob

In [319]:
from textblob import TextBlob

In [320]:
TextBlob(node_from_G['claim'][800]).sentiment

Sentiment(polarity=-0.25, subjectivity=0.4)

In [321]:
for i in tqdm(range(len(node_from_G['claim']))):
    df_claim_feature2.loc[i, 'polarity'] = TextBlob(node_from_G['claim'][i]).sentiment[0]
    df_claim_feature2.loc[i, 'subjectivity'] = TextBlob(node_from_G['claim'][i]).sentiment[1]

100%|█████████████████████████████████████████████████████████████████████████████| 7133/7133 [00:11<00:00, 603.01it/s]


In [322]:
df_claim_feature2.head()

Unnamed: 0,claim,polarity,subjectivity
0,65,0.0,0.0
1,62,0.0,0.0
2,70,-0.1,0.4
3,80,0.0,0.0
4,61,-0.8,0.9


In [353]:
df_claim_feature2['node'] = node_from_G.node

In [377]:
claim_feature['node'] = list(node_from_G.node)

In [374]:
you_feature0['node'] = list(node_from_G.node)

In [378]:
a = pd.merge(df_claim_feature2,claim_feature, how = 'inner', on = 'node')

In [379]:
b = pd.merge(a, you_feature0, how = 'inner', on = 'node')

In [381]:
node = b.set_index('node')
node.head()

Unnamed: 0_level_0,claim,polarity,subjectivity,0_x,1_x,2_x,3_x,4_x,5_x,6_x,...,90_y,91_y,92_y,93_y,94_y,95_y,96_y,97_y,98_y,99_y
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,65,0.0,0.0,0.004347,0.006527,-0.005255,0.00498,-0.000329,-0.007928,0.001403,...,0.025669,-0.046734,-0.042978,-0.056621,0.067007,-0.109707,0.009479,-0.016747,0.058125,0.034603
3,62,0.0,0.0,0.003562,0.011921,-0.005304,0.00887,-0.002444,-0.002014,0.008863,...,0.048587,-0.085624,-0.077225,-0.098361,0.121312,-0.187347,0.026443,-0.044678,0.099377,0.063381
12,70,-0.1,0.4,0.008384,0.010987,-0.00432,0.007628,0.001655,-0.009427,0.012194,...,0.02646,-0.042766,-0.034709,-0.041442,0.049989,-0.092175,0.012842,-0.017332,0.052157,0.03233
15,80,0.0,0.0,0.003019,0.007246,-0.004502,0.01318,-0.003171,-0.01577,0.014981,...,0.00445,-0.024444,-0.022037,-0.013704,0.028149,-0.039714,0.001728,-0.013094,0.024016,0.012437
19,61,-0.8,0.9,-0.000194,0.00807,-0.000448,0.004562,0.003866,-0.004962,0.001487,...,0.015711,-0.022911,-0.020754,-0.019913,0.027065,-0.042241,0.008415,-0.008206,0.023729,0.011469


In [382]:
edge_list = nx.to_pandas_edgelist(G, nodelist=node_list)

In [383]:
edge_list.shape

(6582866, 2)

In [384]:
edge_list.head()

Unnamed: 0,source,target
0,0,5
1,0,7
2,0,8
3,0,11
4,0,13


---

In [385]:
from stellargraph import StellarGraph


In [386]:
graph_full = StellarGraph({"node":node}, {"edge":edge_list}, source_column = "source",
                              target_column = "target")

In [387]:
print(graph_full.info())


StellarGraph: Undirected multigraph
 Nodes: 7133, Edges: 6582866

 Node types:
  node: [7133]
    Features: float32 vector, length 203
    Edge types: node-edge->node

 Edge types:
    node-edge->node: [6582866]
        Weights: all 1 (default)
        Features: none


In [388]:
train_id = train.id.to_list()
test_id = test.id.to_list()

In [389]:
test_id[0]

5277

In [390]:
hold_out_nodes = test_id

In [391]:
labels_sampled = train[['id', 'label']]
labels_sampled = labels_sampled.rename(columns = {'id':'node'})

In [392]:
labels_sampled = labels_sampled.set_index('node')

서브그래프(과제의 train data 해당하는 노드)를 추출한다.

In [393]:
graph_sampled = graph_full.subgraph(train_id)

In [394]:
print(graph_sampled.info())

StellarGraph: Undirected multigraph
 Nodes: 5277, Edges: 4018567

 Node types:
  node: [5277]
    Features: float32 vector, length 203
    Edge types: node-edge->node

 Edge types:
    node-edge->node: [4018567]
        Weights: all 1 (default)
        Features: none


In [395]:
from sklearn import preprocessing, feature_extraction, model_selection


In [598]:
train_labels, test_labels = model_selection.train_test_split(
    labels_sampled,
    train_size= 0.4,
    test_size=None,
    stratify=labels_sampled,
    random_state=42,
)
val_labels, test_labels = model_selection.train_test_split(
    test_labels, train_size=0.2, test_size=None, stratify=test_labels, random_state=100,
)

In [599]:
def encode_label(labels):
    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(labels)
    labels = to_categorical(labels)
    return labels#, #label_encoder.classes_

In [600]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical


In [601]:
train_targets = encode_label(train_labels)

In [602]:
val_targets = encode_label(val_labels)
test_targets = encode_label(test_labels)

In [None]:
## Creating the GraphSAGE model in Keras

In [611]:
batch_size = 100
num_samples = [10, 10]

In [612]:
from stellargraph.mapper import GraphSAGENodeGenerator
from stellargraph.layer import GraphSAGE
from tensorflow.keras import layers, optimizers, losses, metrics, Model


In [613]:
generator = GraphSAGENodeGenerator(graph_sampled, batch_size, num_samples)

train_gen = generator.flow(train_labels.index, train_targets, shuffle=True)

graphsage_model = GraphSAGE(
    layer_sizes=[128, 128], generator=generator, bias=True, dropout=0.3,
)

x_inp, x_out = graphsage_model.in_out_tensors()
prediction = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

prediction.shape

TensorShape([None, 2])

In [614]:
## 모델 학습

In [615]:
model = Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=optimizers.Adam(lr=0.001),
    loss=losses.categorical_crossentropy,
    metrics=["acc"],
)

In [616]:
val_gen = generator.flow(val_labels.index, val_targets)

In [617]:
history = model.fit(
    train_gen, epochs=5, validation_data=val_gen, verbose=1, shuffle=False
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [618]:
test_gen = generator.flow(test_labels.index, test_targets)

In [619]:
len(hold_out_nodes)


1856

In [620]:
## 제출할 데이터 만들기

generator = GraphSAGENodeGenerator(graph_full, batch_size, num_samples)

len(hold_out_nodes)

hold_out_targets = np.zeros((1856,2))

hold_out_gen = generator.flow(hold_out_nodes, hold_out_targets)

hold_out_predictions = model.predict(hold_out_gen)

submit = pd.DataFrame(hold_out_predictions)

submit.loc[submit[1]>=0.5, 'result'] = 1

submit.loc[submit[1]<0.5, 'result'] = 0

In [621]:
test_node_id = test[['id']].reset_index(drop = True)

result = pd.concat([test_node_id,submit], axis = 1)

submit_1 = result[['id','result']].rename(columns = {'node_id':0, 'result':1})

# submit_1.to_csv('.csv',index=False, sep = '\t')

In [622]:
submit_1['id'] = submit_1['id'] - 5277

In [625]:
submit_1.to_csv('submit_1113.csv')

In [624]:
submit_1[1].value_counts()

0.0    1465
1.0     391
Name: 1, dtype: int64