In [1]:
import os
import glob
import re

from collections import namedtuple, OrderedDict

import time
from datetime import datetime, timedelta

import matplotlib.pyplot as plt

import numpy as np

import itertools

In [2]:
from tqdm.notebook import tqdm

In [3]:
from collections import Counter

In [4]:
log_file_list = glob.glob('../chats/*.log')

In [5]:
print(log_file_list)

['../chats/559066430.log', '../chats/551734269.log', '../chats/562675179.log', '../chats/551240626.log', '../chats/564796691.log', '../chats/552238357.log', '../chats/582902383.log', '../chats/583815634.log']


In [38]:
# log_file = log_file_list[-2]
# log_file = log_file_list[-7]
log_file = log_file_list[0]

In [39]:
Chat = namedtuple('Chat', 'time id text')

In [40]:
t0 = datetime.strptime('00:00:00','%H:%M:%S')

In [41]:
chat_history = list()

with open(log_file, 'r') as file:
    
    count = 0
    for chat in file:
#     chat = next(file).strip()
#         print(chat.strip())
        chat_time = re.search('\[.*?\]', chat)[0]
        user_id_match = re.search('<.*?>', chat)
        user_id = user_id_match[0]
        start_of_chat = user_id_match.span()[-1]
        text = chat[start_of_chat:].strip()
        
        dt = datetime.strptime(chat_time,'[%H:%M:%S]') - t0
#         print(dt.total_seconds())
        
        chat_tuple = Chat(dt.total_seconds(), user_id[1:-1], text)

#         print('{}'.format(chat_tuple))
        chat_history.append(chat_tuple)
        

In [44]:
def text_cleaner(text, extra_words = []):
    
    words = [' ', ',', '.', '-', '~'] + extra_words
    
    for word in words:
        text = text.replace(word, '')

    common_typos = [('ㅋㅌ', 'ㅋㅋ'),
                    ('ㅌㅋ', 'ㅋㅋ'),
                    ('ㅋㄱ', 'ㅋㅋ'),
                    ('ㄱㅋ', 'ㅋㅋ'),
                    ('zㅋ', 'ㅋㅋ'),
                    ('ㅋz', 'ㅋㅋ'),
                   ('ehgk', '도하'),
                   ('ehqk', '도바'),
                    ('ㅏ', '아'),
#                    ('귀네', '커비')
                   ]
    
    for typo, ans in common_typos:
        text = text.replace(typo, ans)

        
    return text

def text_compressor(text, com_words_extra = [], min_repeat = 2):
    
    com_words = ['ㅋ', '아', 'z', 'ㅗ', 'ㅔ',
                 '?', 'ㄷ', 'ㅠ', 'ㅜ', 'ㅏ', 
                 '!', 'ㄴ', 'ㅊ', 'ㅅ', '오', 
                 '또', 'ㅓ', '<', '>', ';', 
                 'ㅣ', 'ㅎ', '어', '5', 'w',
                'ㄸ', 'ㅖ'] + com_words_extra
    
#     print(com_words)
    
    for word in com_words:
        
#         print(text.count(word))
        
        if word in text:
            for repeat in reversed(range(min_repeat + 1, text.count(word) + 1)):
#                 len_before = len(text)
                text = text.replace(repeat * word, word * min_repeat)
#                 len_after = len(text)
                
                if text.count(word) <= min_repeat:
                    break

    return text

def counter_compressor(text_counter):
    
    sorted_pairs = text_counter.most_common()
    
    key_to_remove = list()
    
    for i, key, count in enumerate(sorted_pairs):
        for j, key_c, count_c in enumerate(sorted_pairs[i+1:]):
            if key in key_c:
                sorted_pairs[i][1] = 0
                break
            
    
    return text_counter

def counter_cleaner(text_counter, min_occur = 2):
    
    sorted_pairs = text_counter.most_common()
    
#     key_to_remove = list()
    
    for i, (key, count) in enumerate(reversed(sorted_pairs)):
        if count < min_occur:
            del text_counter[key]
        else:
            break
            
    return text_counter

In [45]:
system_words = [
    'subscribed at Tier',
    "They've subscribed for",
    "!업타임",
    'months, currently on a' 'month streak!',
    '채팅/후원 어그로 뿐만 아니라 어그로에 끌려도 제재합니다! 악성 채팅은 꼭 따로 이메일 부탁드립니다. exdeath573@naver.com',
    'subscribed with Twitch Prime.',
]

trending_words = [
    'ㅗㅜㅑ',
    """("'ω'`)""",
    "('ω'`)",
    '도하',
    '도바',
    'ㄷㄷㄷㅈ',
    'ㄹㅇㅋㅋ',
]

emoticon_words = [
    'kimdoeG_SQ',
    'kimdoeDH_BW',
    'kimdoeCVM_BW',
    'kimdoeCVM_SQ',
    'kimdoeDH_SQ',
    'kimdoeGT_SQ',
    'jadongOH_BW',
    'kimdoeDH_SQ',
    'ymwFlurry1_SQ',
    'kimdoeWOW',
    'kimdoeDH',
    'kimdoeOMG',
    'kimdoeRKK', 
    'kimdoeNB',
    'kimdoeCVM',
    'kimdoeCAT',
    'the8bitWink',
    'kimdoeOMG',
    'OWL2019clap',
    'BloodTrail',
    'kimdoePE',
    'kimdoeHM',
    'kimdoeGRT',
    'kimdoeCHOCO',
    'kimdoeOOPS',
    'kimdoeBC',
    'kimdoeBC2',
    'kimdoeKB',
    'kimdoeBC1',
    'kimdoeMS', 
    'kimdoeTAK',
    'kimdoeOOPS',
    'kimdoeD',
    'kimdoeNB',
    'kimdoeV',
    'poongPig',
    'kimdoeIBR',
    'kimdoeGRT',
    'kimdoeG',
    'kimdoeV',
    'kimdoeWA',
    'Stream uptime:',
    'kimdoeFN',
    'kimdoeBED',
    'kimdoeAA',
    'kimdoeWK',
    'kimdoeMUSIN',
    'ResidentSleeper',
    'kimdoeKO',
    'kimdoeHK',
    'kimdoePMC',
    'kimdoeAPG',
    'BibleThump',
    
]

words = system_words + trending_words + emoticon_words

words = list(reversed(sorted(words, key = lambda word: len(word))))

keywords = [
    '미안하다 이거 보여주려고 어그로끌었다',
    '진짜 세계관최강자들의 싸움이다',
    '그럼 당신이 신이라는 거야?',
    '아아! 그렇다!',
    '대각선으로 쏘라고!',
    '제가 빡대가리였습니다!'
]

keywords_clean = list(map(text_cleaner, keywords))

maximum_keyword_len = max(map(len, keywords_clean))
maximum_keyword_len = 2

total_counter = Counter()

for num, chat in enumerate(tqdm(chat_history)):
    
    text = getattr(chat, 'text')
#     print(text, end = '\n')
#     print(text, end = '\n')
#     print(text)
#     print(text_no_space)
#     print(len(text))

#     if 'SQ' in text or 'BW' in text:
#         for w in ['kimdoeG_SQ', 'kimdoeDH_BW', 'kimdoeCVM_BW', 'kimdoeCVM_SQ']:
#             if w in text:
#                 break
#         else:
#             print(text)

    word_dict = dict()

    for word in words:
        if word in text:
            num_occur = text.count(word)
            text = text.replace(word, '')
            word_dict[word] = num_occur
            
    text = text_compressor(text)
    text = text_cleaner(text)
    
    for word in words:
        if word in text:
            num_occur = text.count(word)
            text = text.replace(word, '')
            word_dict[word] = num_occur
            
    word_counter = Counter(word_dict)
    
#     print(text)
#     print()

    text = text_cleaner(text, ['!', '?'])

    sentence_dict = dict()
    
    if len(text) > 0:

        for i in range(max(1, maximum_keyword_len), len(text) + 1):
    #         for j in range(0, len(text_no_space) - i + 1):
            for j in range(0, min(len(text) - i + 1, maximum_keyword_len)):
                sentence_dict[text[j:j+i]] = 1
    
#     print(text_no_space)
    
    sentence_counter = Counter(sentence_dict)
    
#     print(sentence_counter)

#     print(word_counter)
    
    total_counter += word_counter
    total_counter += sentence_counter
    
#     print(total_counter)

    if (num + 1) % 1000 == 0:
        total_counter = counter_cleaner(total_counter)
        
#     if num > 50000:
#         print(total_counter.most_common(400))
#         break

print([(word, num) for word, num in total_counter.most_common(600) 
       if word not in system_words and word not in emoticon_words])

HBox(children=(FloatProgress(value=0.0, max=46733.0), HTML(value='')))


[('ㅋㅋ', 6108), ('ㄷㄷ', 913), ('ㅗㅜㅑ', 645), ('ㄹㅇ', 609), ('김도', 486), ('야끼', 372), ('이걸', 292), ('도바', 282), ('역시', 264), ('ㄹㅇㅋㅋ', 228), ('ㄴㅂ', 198), ('아아', 193), ('끼런', 187), ('ㅠㅠ', 182), ('역시게', 178), ('시게', 177), ('역시게장', 176), ('역시게장이', 176), ('시게장', 175), ('시게장이', 175), ('역시게장이야', 174), ('시게장이야', 173), ('존버', 164), ('무신', 163), ('가방', 163), ('ㅇㄱ', 159), ('커비', 156), ('그냥', 154), ('머리', 153), ('ㅅㅅ', 149), ('야끼런', 149), ('ㄴㅇ', 141), ('이제', 140), ('자동', 140), ('진짜', 140), ('아니', 139), ('레이', 139), ('ㄴㅇㄱ', 132), ('제발', 131), ('도하', 127), ('캐브', 123), ('무새', 123), ('근데', 118), ('이게', 116), ('스캐', 109), ('ㄱㄹ', 107), ('저격', 105), ('ㄱㄹㅇ', 103), ('이익', 103), ('이더', 102), ('레이더', 102), ('기차', 102), ('아그', 101), ('<<', 100), ('스캐브', 97), ('ㅁㅈ', 96), ('그거', 96), ('그건', 96), ('익실', 94), ('익실현', 94), ('ㅖㅖ', 94), ('이익실', 93), ('이익실현', 93), ('그래', 91), ('오늘', 90), ('똘삼', 90), ('10', 85), ('나가', 84), ('저거', 84), ('ㅇㅇ', 83), ('여기', 83), ('노잼', 83), ('지금', 82), ('그는', 82), ('손실', 80), ('도끼', 80), ('아

In [12]:
# print(total_counter.most_common(800))