In [1]:
import csv
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly
import os
import tqdm
import json
from collections import defaultdict
import json_lines
import datetime
import matplotlib.pyplot as plt
from statistics import median
from ipywidgets import interact

In [6]:
# Directory of candidate files; every entry is later opened and parsed with json.load.
path = '../pilgrim data/candidates/'
# os.listdir returns names in arbitrary, platform-dependent order. Sorting makes
# the processing order (and the key order of anything built from it) reproducible.
days = sorted(os.listdir(path))

In [8]:
# Map each candidate file name to the article ids listed inside it.
# Each file is a JSON sequence of (article id, info) pairs; only the id is kept.
aids = defaultdict(list)
for day in tqdm.tqdm(days):
    with open(path + day, 'r') as f:
        data = json.load(f)
    day_aids = [aid for aid, _ in data]
    # Only touch the mapping when there is something to add, so files with
    # no candidates do not create an empty entry.
    if day_aids:
        aids[day].extend(day_aids)

100%|██████████| 1307/1307 [00:01<00:00, 919.17it/s]


In [32]:
# Input folders: per-file comment dumps and the matching article metadata.
# Both are indexed below by the same file names as the candidate folder.
original_path = '../news data/comments/daily_comments/'
article_path = '../news data/articles/all_daily_articles_naver/'
# os.listdir order is arbitrary; sort so the listing is deterministic across runs.
original_days = sorted(os.listdir(original_path))


def created_time(createdAt):
    """Return the date part of a comment timestamp with the dashes removed.

    Only the first ten characters of ``createdAt`` are used, so a value such
    as ``'2019-03-05T12:34:56'`` becomes ``'20190305'``.
    """
    return createdAt[:10].replace('-', '')


def timeline_criteria(timeline, threshold_freq=4, threshold_sum=30, threshold_date=183):
    """First-pass filter deciding whether an article can be a "pilgrimage" candidate.

    Args:
        timeline: mapping of day string (``'%Y%m%d'``) -> number of comments
            written on that day.
        threshold_freq: minimum number of distinct days that have comments.
        threshold_sum: minimum total number of comments.
        threshold_date: minimum number of days between the earliest and the
            latest day present in the timeline.

    Returns:
        True only when all three conditions hold, otherwise False.
    """
    # An empty timeline can never qualify, and it has no first/last day to compare.
    if not timeline:
        return False

    # 1. Comments must be spread over at least ``threshold_freq`` distinct days.
    if len(timeline) < threshold_freq:
        return False

    # 2. The article must have at least ``threshold_sum`` comments in total.
    if sum(timeline.values()) < threshold_sum:
        return False

    # 3. The span between the earliest and the latest commented day must be at
    #    least ``threshold_date`` days. The earliest day stands in for the
    #    publication date because only the timeline is available here.
    #    Zero-padded '%Y%m%d' strings sort chronologically, so min/max give the
    #    right days even when comments were not inserted in date order.
    first_day = datetime.datetime.strptime(min(timeline), '%Y%m%d')
    last_day = datetime.datetime.strptime(max(timeline), '%Y%m%d')
    return (last_day - first_day).days >= threshold_date


def contributor_criteria(contributors, threshold_sum=10, threshold_monopoly=0.8, top_n=5):
    """Reject articles whose comments come from too few or too dominant authors.

    Args:
        contributors: mapping of author name -> number of comments written.
        threshold_sum: minimum number of distinct authors required.
        threshold_monopoly: maximum share of all comments that the ``top_n``
            most active authors may hold together.
        top_n: how many of the most active authors count towards the
            monopoly share (defaults to 5, the previously hard-coded value).

    Returns:
        True when there are enough authors and the top authors' share does
        not exceed ``threshold_monopoly``; False otherwise.
    """
    if len(contributors) < threshold_sum:
        return False

    total_replies = sum(contributors.values())
    # Without any comments there is no share to compute; also avoids dividing by zero.
    if total_replies <= 0:
        return False

    top_counts = sorted(contributors.values(), reverse=True)[:top_n]
    return sum(top_counts) / total_replies <= threshold_monopoly


# Module-level results.
# checker: article 'gno' -> metadata plus statistics on late comments.
checker = defaultdict(dict)
# An article needs MORE than this many comments written over a year after publication.
threshold = 20

for day in tqdm.tqdm(days):
    # JSON text is UTF-8; be explicit so decoding does not depend on the
    # platform's default encoding.
    with open(os.path.join(article_path, day), 'r', encoding='utf-8') as f:
        articles = json.load(f)

    with open(os.path.join(original_path, day), 'r', encoding='utf-8') as f:
        data = json.load(f)

    target_aids = aids[day]
    for aid in target_aids:
        article = articles[aid]
        published_date = datetime.datetime.strptime(
            article['timestamp'][:10], '%Y-%m-%d')
        comment_timeline = defaultdict(int)  # 'YYYYMMDD' -> comments on that day
        contributors = defaultdict(int)      # author name -> comments written

        for comment in data[aid]:
            # Build the per-day timeline.
            comment_timeline[created_time(comment['createdAt'])] += 1
            # Record who wrote the comment.
            contributors[comment['userName']] += 1

        # Count comments written more than 365 days after publication.
        # Every comment on the same day has the same age, so parse each
        # distinct day once and weight it by its comment count instead of
        # calling strptime for every single comment.
        over_1_year = sum(
            count
            for date_key, count in comment_timeline.items()
            if (datetime.datetime.strptime(date_key, '%Y%m%d') - published_date).days > 365
        )

        if timeline_criteria(comment_timeline) and contributor_criteria(contributors) and over_1_year > threshold:
            checker[article['gno']] = {
                'title': article['title'],
                'url': article['url'],
                'timestamp': article['timestamp'],
                'sid1s': article['sid1s'],
                'count': over_1_year,
                # Share of all comments that arrived more than a year late.
                'percentage': round(over_1_year / len(data[aid]), 3),
                'timeline': comment_timeline,
            }

100%|██████████| 1307/1307 [1:24:50<00:00,  3.89s/it]


In [40]:
# Persist the selected candidates as JSON.
# NOTE(review): absolute path on an external volume; consider a configurable output dir.
result_path = '/Volumes/CHANGYEON/NEWS RESEARCH/pilgrim data/result/current_pilgrim_candidate.json'
with open(result_path, 'w') as out_file:
    json.dump(checker, out_file)