In [1]:
from glob import glob
import os
import json
import numpy as np
import pandas as pd
import argparse
import yaml
import torch
import random
from typing import List

from tqdm.auto import tqdm
from konlpy.tag import Mecab
from methods import get_similar_filepath_dict, extract_nouns, extract_text

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def torch_seed(random_seed):
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed) # if use multi-GPU 
    # CUDA randomness
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
    np.random.seed(random_seed)
    random.seed(random_seed)
    os.environ['PYTHONHASHSEED'] = str(random_seed)

In [8]:
cfg = {
    'datadir' : '/workspace/code/Fake-News-Detection-Dataset/data/Part1',
    'savedir' : '../data-saeran',
    'METHOD': {
        'name'    : 'tfidf_overlap',
        'tfidf_target' : 'full', #title, context, full 
        'query'  : 'context', 
        'document' : 'full',
        'select_name' : 'tfidf_title_category_select',
        'topk' : 20,
        },
    'SEED':42    
}

In [9]:
torch_seed(cfg['SEED'])

# Load files

In [5]:
# update save directory
cfg['savedir'] = os.path.join(cfg['savedir'], cfg['METHOD']['select_name'])

In [6]:
# load file list
file_list = glob(os.path.join(cfg['datadir'], 'train/NonClickbait_Auto/EC/*'))
save_list = [p.replace(cfg['datadir'], cfg['savedir']) for p in file_list]

# Tokenizer

In [10]:
def extract_nouns(file_list: list, target: str = None, join: bool = True) -> List[list]:
    """
    extract nouns from target text
    
    """
    # extract morphs
    mecab = Mecab()

    # define list
    nouns_list = []

    for file_path in tqdm(file_list, desc=f'Extract Morphs({target})', total=len(file_list), leave=False):
        # load source file
        source_file = json.load(open(file_path, "r"))
        
        if target == 'title':
            text = source_file['sourceDataInfo']['newsTitle']
        elif target == 'context': 
            text = source_file['sourceDataInfo']['newsContent']
        elif target == 'full':
            text = source_file['sourceDataInfo']['newsTitle'] + source_file['sourceDataInfo']['newsContent']

        if join:
            nouns_list.append(' '.join(mecab.nouns(text)))
        else:
            nouns_list.append(mecab.nouns(text))

    return nouns_list

# get TFIDF word matrix

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
corpus = cfg['METHOD']['tfidf_target']
corpus_list = extract_nouns(file_list, target=corpus)
len(corpus_list)

                                                                            

20664

In [13]:
tf_idf_model = TfidfVectorizer().fit(corpus_list)

In [14]:
tfidf_tokens = tf_idf_model.get_feature_names()

In [15]:
len(tfidf_tokens)

49849

# convert text to TFIDF vector

In [50]:
q_target = cfg['METHOD']['query']
p_target = cfg['METHOD']['document']

query = extract_nouns(file_list=file_list, target=q_target)
# documents = extract_nouns(file_list=file_list, target=p_target)


                                                                               

In [16]:
documents = corpus_list

AttributeError: 'str' object has no attribute 'copy'

In [25]:
query_tfidf = tf_idf_model.transform(query).toarray()
document_tfidf = tf_idf_model.transform(documents).toarray()

# similarity

In [26]:
cos_sim = cosine_similarity(query_tfidf, document_tfidf)

# Overlapped Token

In [27]:
%%time
top_k = cfg['METHOD']['topk']
topkindex = np.argpartition(cos_sim,-top_k, axis=1)[:,-top_k:] #not sorted (argsort보다 속도 훨씬 빠름)

CPU times: user 3.44 s, sys: 290 ms, total: 3.73 s
Wall time: 3.75 s


In [28]:
# # Check Score
# # df_result = pd.DataFrame(columns=['index','score','index'])
# topkindex_sorted = np.argsort(cos_sim, axis=1)[:,-top_k:]
# for idx in topkindex_sorted[0]:
#     print(f'index : {idx} score : {cos_sim[0][idx]}')

In [29]:
# top-20 documents by similarity on tfidf vectors
# 1. make query's unique token dictionary
# 2. sort top-20 documents by # of overlapped token

query_ids = range(len(query))
token_overlap = dict() #q_id : {d_id : # of overlapped tokens}
for q_id in query_ids:
    token_overlap[q_id] = {}
    topk_docs_ids = topkindex[q_id]
    for d_id in topk_docs_ids:
        count = len(set(query[q_id]) & set(documents[d_id]))
        token_overlap[q_id][d_id] = count

In [30]:
token_overlap_sorted = dict()
for q_id, overlap_cnt_dict in token_overlap.items():
    token_overlap_sorted[q_id] = dict(sorted(overlap_cnt_dict.items(), key=lambda item: item[1], reverse=True))

In [31]:
# {document id : count} -> [document ids]
token_overlap_sorted_idx = [list(d_dict.keys()) for q_id, d_dict in token_overlap_sorted.items()]

# Analysis top-k score & token overlap

In [32]:
# token overlapped의 평균 구하기 (top1 ~ top10) : 보통 몇개정도 겹치는지

In [33]:
# top1과 topk중 ovelapped가 가장 많이 된 것이 다른 query의 개수 구하기

# Train accuracy

In [34]:
def score_accuracy(query_ids, token_overlap_sorted_idx):
    top_1_accuracy = 0.0
    for query_id, indices in zip(query_ids, token_overlap_sorted_idx):
        if query_id == indices[0]: #query와 document로 사용하는 text가 달라야 유의미함.
            top_1_accuracy += 1.0
    top_1_accuracy = round(top_1_accuracy/len(query_ids), 5)

    top_K_accuracy = 0.0
    for query_id, indices in zip(query_ids, token_overlap_sorted_idx):
        if query_id in indices:
            top_K_accuracy += 1.0
            
    top_K_accuracy = round(top_K_accuracy/len(query_ids), 5)
    return top_1_accuracy, top_K_accuracy

In [35]:
top_1_accuracy, top_K_accuracy = score_accuracy(query_ids, token_overlap_sorted_idx)
print(f'TOP_1 accuracy : {top_1_accuracy}')
print(f'TOP_10 accuracy : {top_K_accuracy}') 

TOP_1 accuracy : 0.99748
TOP_10 accuracy : 1.0


# Method Function Code

In [36]:
def tfidf_overlap_sim_matrix(corpus: list, query : list, document : list, **kwargs) -> np.ndarray:
    """
    make similarity matrix using tfidf similarity
    """
    tf_idf_model = TfidfVectorizer().fit(corpus)
    tf_idf_query = tf_idf_model.transform(query).toarray()
    tf_idf_document = tf_idf_model.transform(document).toarray()
    cos_sim = cosine_similarity(tf_idf_query, tf_idf_document)
    
    return cos_sim

In [37]:
def overlap_token(cos_sim, query, documents, top_k = None, Train = False):
    top_k = top_k
    topkindex = np.argpartition(cos_sim,-top_k, axis=1)[:,-top_k:] #not sorted

    # top-20 documents by similarity on tfidf vectors
    # 1. make query's unique token dictionary
    # 2. sort top-20 documents by # of overlapped token

    query_ids = range(len(query))
    token_overlapped = dict() #q_id : {d_id : # of overlapped tokens}
    for q_id in query_ids:
        token_overlapped[q_id] = {}
        topk_docs_ids = topkindex[q_id]
        for d_id in topk_docs_ids:
            count = len(set(query[q_id]) & set(documents[d_id]))
            token_overlapped[q_id][d_id] = count

    token_overlapped_sorted = dict()
    for query, cnt_overlapped_dict in token_overlapped.items():
        token_overlapped_sorted[query] = dict(sorted(cnt_overlapped_dict.items(), key=lambda item: item[1], reverse=True))

    if Train == True:
        # {document id : count} -> [document ids]
        token_overlapped_sorted_idx = [list(d_dict.keys()) for q_id, d_dict in token_overlapped_sorted.items()]
        return token_overlapped_sorted_idx
    
    # masking query_id == documents_id
    for query_id, documents in token_overlapped_sorted.items():
        if query_id in documents.keys():
            documents[query_id] = 0
    results = [list(documents.keys())[0] for query_id, documents in token_overlapped_sorted.items()] 
    results = np.expand_dims(results, axis=1) #[[d_id],[d_id],...,[d_id]]

    return results

# Val accuracy

In [38]:
def validation(cfg, tf_idf_model):
    #load validation file
    file_list = glob(os.path.join(cfg['datadir'], 'validation/NonClickbait_Auto/EC/*'))
    query_ids = range(len(file_list))

    #extract nount from query and documents seperately
    q_target = cfg['METHOD']['query']
    d_target = cfg['METHOD']['document']

    queries = extract_nouns(file_list=file_list, target=q_target)
    documents = extract_nouns(file_list=file_list, target=d_target)

    #get query tfidf and document tfidf
    query_tfidf = tf_idf_model.transform(queries).toarray()
    document_tfidf = tf_idf_model.transform(documents).toarray()

    #calculate similarity
    cos_sim = cosine_similarity(query_tfidf, document_tfidf)

    #get top-k documents
    token_overlapped_sorted_idx = overlap_token(cos_sim, queries, documents, top_k = cfg['METHOD']['topk'], Train = True)

    return query_ids, token_overlapped_sorted_idx


In [39]:
# top-20 passages by similarity on tfidf vectors
# 1. make query's unique token dictionary
# 2. sort top-20 passages by # of overlapped token

val_query_ids, val_token_overlapped_sorted_idx = validation(cfg, tf_idf_model)
top_1_accuracy, top_K_accuracy = score_accuracy(val_query_ids, val_token_overlapped_sorted_idx)
print(f'TOP_1 accuracy : {top_1_accuracy}')
print(f'TOP_10 accuracy : {top_K_accuracy}')

                                                                             

TOP_1 accuracy : 0.99961
TOP_10 accuracy : 1.0


# Check samples

In [40]:
#load validation file
val_file_list = glob(os.path.join(cfg['datadir'], 'validation/NonClickbait_Auto/EC/*'))

for q_id, doc_idx_list in enumerate(val_token_overlapped_sorted_idx[:10]):
    source_file = json.load(open(val_file_list[q_id], "r"))
    title = source_file['sourceDataInfo']['newsTitle']
    
    if doc_idx_list[0] == q_id:
        doc_idx = doc_idx_list[1]
    else:
        doc_idx = doc_idx_list[0]
    doc_file = json.load(open(val_file_list[doc_idx],"r"))
    #doc_title = doc_file['sourceDataInfo']['newsTitle']
    document = doc_file['sourceDataInfo']['newsContent']
    print('-------------title-------------')
    print(title)
    print('-------------document-------------')
    print(document)

-------------title-------------
중기부, '2022 메이커 스타' 참가자 모집
-------------document-------------
CJ온스타일은 15일 상생 프로그램 '챌린지! 스타트업'을 통해 약 3개월의 교육과 평가를 거친 6개 기업을 선발했다고 밝혔다.
선발된 기업에게는 CJ온스타일 방송 진출 기회와 함께 상금 총 2억원이 제공된다. '챌린지! 스타트업'은 스타트업에게 사업 경험 전반을 전수하는 CJ온스타일의 대표 상생 프로그램이다.
선발된 6개 기업은 단계별 심사와 교육 과정에서 우수성을 입증한 혁신 기술 기반 기업이다.
CJ온스타일은 지난 3월 모집한 지원서를 검토해 26개 참가 기업을 1차 선발했다.
서울창업허브와 CJ온스타일은 이들 중 필요한 곳에 상품 기획, 브랜딩 경험, 판로 개척 등에 대한 개별 컨설팅과 시제품 제작 서비스 등을 제공했다.
이후 4월 말 진행한 데모데이와 다면 평가를 통해 선발 기업이 확정됐다.
선발된 기업은 독창적 사업 아이디어와 제품 기술력을 보유했다. '샤플'의 헤어 드라이기는 '가성비 다이슨 헤어 스타일러'라는 별명을 얻을 정도로 스타일링 효과에 대한 평이 좋다. '나인랩'의 텀블러 자외선 살균기는 사용자들이 세척에 불편함을 겪고 있는 점에 착안해 개발됐다. '네츄럴솔루션이엠비씨'는 특정 신체 부위에 음파 자극을 줘 운동효과를 극대화한 제품이다. '더원리빙'의 보온 플레이트와 '플트리스'의 오피스 가드닝 제품, 친환경 운송수단인 '쎄미시스코'의 소형 전기차 EV Z 등도 선발 기업의 대표 제품이다.
CJ온스타일은 선발된 6개 기업의 제품을 올해 자사 방송 프로그램에서 선보일 계획이다.
판매수수료 없는 중소기업 무료방송'1사1명품’에서 주로 내놓으나 상품 특성과 물량 공급 사정에 맞춰 CJ온스타일 모바일 애플리케이션과 모바일 라이브커머스에서도 판매한다.
참여 기업 만족도가 높아 올해 최대 4개 기업을 추가 선발할 계획이다.
-------------title-------------
이케아