In [1]:
import csv
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import os
import tqdm
import json
from collections import defaultdict
import json_lines

In [7]:
with open('../pilgrim data/pilgrim_naver.jsonl', 'r') as json_file:
    json_list = list(json_file)

In [8]:
len(json_list)

277861

In [9]:
import datetime

counter = defaultdict(int)
for a in tqdm.tqdm(json_list):
    article = json.loads(a)
    published_date = datetime.datetime.strptime(article['timestamp'][:10], '%Y-%m-%d')
    over_1_year = 0
    for comment in article['comments']:
        last_comment_date = datetime.datetime.strptime(comment['createdAt'][:10], '%Y-%m-%d')
        term = (last_comment_date - published_date).days
        if term >= 365:
            over_1_year += 1

    counter[over_1_year] += 1
    
#20200212: 365일 포함하도록 수정
with open('../pilgrim data/result/comments_after_1yr.json', 'w') as json_file:
    json.dump(json_file, counter)

100%|██████████| 277861/277861 [03:45<00:00, 1231.54it/s]


TypeError: Object of type TextIOWrapper is not JSON serializable

In [30]:
# 특정 기사들을 찾아보도록 하겠음.
import datetime

def created_time (createdAt):
    date = createdAt[:10]
    return ''.join(date.split('-'))

def timeline_criteria(timeline, threshold_freq = 4, threshold_sum = 30, threshold_date = 183):
    # 해당 조건을 만족해야만 성지로서의 자격을 1차로 얻을 수 있다고 볼 수 있음.
    
    # 1. timeline의 길이가 treshold_freq 보다 길어야 함.
    if len(timeline) < threshold_freq:
        return False
    
    # 2. 해당 기사의 총 댓글 수가 threshold_sum 보다 많아야 함.
    total = sum(list(timeline.values()))
    if total < threshold_sum:
        return False
    
    # 3. 가장 최근 댓글과 발행일 사이의 간격이 threshold_date 보다 길어야 함.
    published_date = datetime.datetime.strptime(list(timeline.keys())[0], '%Y%m%d')
    last_comment_date = datetime.datetime.strptime(list(timeline.keys())[-1], '%Y%m%d')
    term = (last_comment_date - published_date).days 
    if term < threshold_date:
        return False
    
    return True

def contributor_criteria (contributors, threshold_sum = 10, threshold_monopoly = 0.8):
    #댓글 참여자의 수가 너무 적거나 일부 참여자에게 독점이 심한 경우에는 성지 목록에서 제외함.
    total_conts = len(list(contributors.keys()))
    total_replies = sum(list(contributors.values()))
    if total_conts < threshold_sum:
        return False
    ordered_conts = sorted(contributors.items(), key = lambda x: x[1], reverse = True)
    if sum([count for (user, count) in ordered_conts[:5]]) / total_replies > threshold_monopoly:
        return False
    return True

# 전역변수 생성.
checker = defaultdict(dict)
threshold = 20

for a in tqdm.tqdm(json_list):
    article = json.loads(a)
    published_date = datetime.datetime.strptime(article['timestamp'][:10], '%Y-%m-%d')
    over_1_year = 0
    comment_timeline = defaultdict(int)
    contributors = defaultdict(int)
    for comment in article['comments']:
        # 타임라인 제작.
        timestamp = created_time(comment['createdAt'])
        comment_timeline[timestamp] += 1
        
        # 각 comment 별로 1년 넘은 comment인 지 확인.
        last_comment_date = datetime.datetime.strptime(comment['createdAt'][:10], '%Y-%m-%d')
        term = (last_comment_date - published_date).days
        if term > 365:
            over_1_year += 1
            
        # 댓글 작성자 기록을 담는 contributor dictionary 채우기
        writer = comment['userName']
        contributors[writer] += 1
    
    if timeline_criteria(comment_timeline) and contributor_criteria (contributors) and over_1_year > threshold:
        checker[article['gno']]['title'] = article['title']
        checker[article['gno']]['url'] = article['url']
        checker[article['gno']]['timestamp'] = article['timestamp']
        checker[article['gno']]['sid1s'] = article['sid1s']
        
        checker[article['gno']]['count'] = over_1_year
        checker[article['gno']]['percentage'] = round(over_1_year / len(article['comments']), 3)
        
        checker[article['gno']]['timeline'] = comment_timeline
        

100%|██████████| 277861/277861 [04:10<00:00, 1107.59it/s]


In [12]:
# 기존 checker 수
len(checker)

1142

In [16]:
# criteria 적용 후 checker 수
len(checker)

803

In [31]:
# criteria + contributor filtering 적용 후 checker 수
len(checker)

787

In [32]:
g = sorted(checker.items(), key = lambda x: x[1]['count'], reverse = True)

In [33]:
g

[('001,0005964326',
  {'title': "중증장애인 부부의 '아름다운 기부'",
   'url': 'https://news.naver.com/main/read.nhn?mode=LSD&mid=sec&sid1=102&oid=001&aid=0005964326',
   'timestamp': '2012-11-30 19:04',
   'sid1s': ['102'],
   'count': 2747,
   'percentage': 0.996,
   'timeline': defaultdict(int,
               {'20121130': 4,
                '20121201': 3,
                '20121204': 3,
                '20161218': 8,
                '20161219': 4,
                '20161220': 10,
                '20161221': 6,
                '20161222': 3,
                '20161223': 5,
                '20161224': 7,
                '20161225': 12,
                '20161226': 5,
                '20161227': 6,
                '20161228': 5,
                '20161229': 3,
                '20161230': 7,
                '20161231': 6,
                '20170101': 3,
                '20170102': 5,
                '20170103': 10,
                '20170104': 5,
                '20170105': 5,
                '20170106': 4,

In [68]:
import plotly

x = [c[0] for c in g]
y = [c[1] for c in g]

fig = go.Figure()
fig.add_trace(go.Scatter(x = x, y = y, mode = 'lines'))
fig.update_layout(xaxis_type="log", yaxis_type="log")

plotly.offline.plot(fig, filename = '../pilgrim data/result/comments_after_1yr.html', auto_open=False)

'../pilgrim data/result/comments_after_1yr.html'

# ====================

In [4]:
# 추려낸 787개의 기사가 실제로 성지인가?
# 댓글들이 성지인 지를 확인하기 위해 각 댓글들이 진짜 성지인 지를 확인.
with open('../pilgrim data/result/pilgrim_candidates.json') as f:
    cands = json.load(f)

contents = defaultdict(list)
gnos = [cand[0] for cand in cands]
cands = {a[0]: a[1] for a in cands}

target_words = ['성지 순례', '성지순례']
real_pilgrims = []

for a in tqdm.tqdm(json_list):
    article = json.loads(a)
    if article['gno'] in gnos:
        cands[article['gno']]['is_pilgrim'] = 'NO'
        for comment in article['comments']:
            if any(x in comment['content'] for x in target_words):
                cands[article['gno']]['is_pilgrim'] = 'YES'
                real_pilgrims.append(())
                break

100%|██████████| 277861/277861 [01:22<00:00, 3349.62it/s]


In [5]:
print(len(real_pilgrims))
for article in real_pilgrims:
    print('Title: {} \n URL: {}'.format(article[1], article[2]))

174


IndexError: tuple index out of range

In [17]:
#20200212 --> CCDF로의 전환
with open('../pilgrim data/result/comments_after_1yr.json', 'r') as json_file:
    data = json.load(json_file)

In [18]:
data = sorted([(int(k),v) for k,v in data.items()], key = lambda x: x[0])

[(1, 226479),
 (2, 29227),
 (3, 9027),
 (4, 4026),
 (5, 2301),
 (6, 1456),
 (7, 896),
 (8, 694),
 (9, 522),
 (10, 438),
 (11, 327),
 (12, 279),
 (13, 205),
 (14, 173),
 (15, 146),
 (16, 135),
 (17, 101),
 (18, 97),
 (19, 89),
 (20, 94),
 (21, 76),
 (22, 65),
 (23, 67),
 (24, 46),
 (25, 46),
 (26, 35),
 (27, 40),
 (28, 35),
 (29, 33),
 (30, 25),
 (31, 35),
 (32, 23),
 (33, 13),
 (34, 23),
 (35, 24),
 (36, 32),
 (37, 19),
 (38, 21),
 (39, 21),
 (40, 17),
 (41, 12),
 (42, 14),
 (43, 16),
 (44, 5),
 (45, 12),
 (46, 8),
 (47, 8),
 (48, 10),
 (49, 7),
 (50, 15),
 (51, 4),
 (52, 8),
 (53, 9),
 (54, 11),
 (55, 5),
 (56, 6),
 (57, 6),
 (58, 9),
 (59, 6),
 (60, 2),
 (61, 9),
 (62, 6),
 (63, 6),
 (64, 5),
 (65, 7),
 (66, 7),
 (67, 3),
 (68, 5),
 (69, 5),
 (70, 2),
 (71, 5),
 (72, 4),
 (73, 5),
 (74, 6),
 (75, 1),
 (76, 8),
 (78, 2),
 (79, 6),
 (80, 1),
 (81, 1),
 (82, 4),
 (83, 4),
 (84, 3),
 (85, 2),
 (86, 1),
 (87, 2),
 (88, 5),
 (89, 4),
 (90, 2),
 (91, 2),
 (92, 1),
 (93, 3),
 (94, 3),
 (95, 

In [23]:
#20200212 --> CCDF로의 전환
with open('../pilgrim data/result/comments_after_1yr.json', 'r') as json_file:
    data = json.load(json_file)
data = sorted([(int(k),v) for k,v in data.items()], key = lambda x: x[0])

q = [v for k,v in data]
ccdf = np.cumsum(q[::-1])[::-1]

import plotly

x = [k for k,v in data]
y = ccdf

fig = go.Figure()
fig.add_trace(go.Scatter(x = x, y = y, mode = 'lines'))
fig.update_layout(xaxis_type="log", yaxis_type="log")

plotly.offline.plot(fig, filename = '../pilgrim data/result/comments_after_1yr.html', auto_open=False)

'../pilgrim data/result/comments_after_1yr.html'