### Imports

In [1]:
import codecs
import re
import time

### Parameters

In [2]:
length_minimum = 5
set_minimum = 5
whitelist_regex = re.compile(r"([^!@#$%^&*(),.?\":{}|`~<>\[\]\-_=+\\/;'a-zA-Z0-9\u3131-\u318E\uAC00-\uD7A3\u1100-\u11f9\s]|http)")
normalize_regexs = [(re.compile(r"([!@#$%^&*(),.?\":{}|`~<>\[\]\-_=+\\/;'\u3131-\u318E\uAC00-\uD7A3])(\1{2})\1+"), "\\1\\2"),
                    (re.compile(r"[\u314C\u314B]{2,}"), "\u314B"), # zㅌㅋ -> "ㅋ"
                    (re.compile(r"[\u3154\u3156]{2,}"), "\u314B"), # ㅔㅖ -> "ㅔ"
                    (re.compile(r"[\u3160\u315C]{2,}"), "\u3160"), # ㅠㅜ -> "ㅠ"
                    (re.compile(r"(!|\?)\1+"), "\\1"), # ?, ! -> "?", "!"
                    (re.compile(r"(\.|\,|\;)\1+"), "\\1\\1")] # '.', ',', ';' -> "..", ",,", ";;"

### Utility Functions

In [3]:
def get_chats(filename):
    with open(filename, 'r', encoding='utf-8-sig') as data_file:
        return data_file.readlines()

In [4]:
def is_system_message(sentence):
    if "subscribe" in sentence:
        return True
    else:
        return False

In [5]:
def is_simple_sentence(sentence):
    characters = set()
    for char in sentence:
        if char not in characters:
            characters.add(char)
            if len(characters) >= set_minimum:
                return False
    return True

In [6]:
def is_question(sentence):
    if "?" not in sentence:
        return False
    if sentence[0] == "?" or sentence[1] == "?":
        return False
    return True

In [7]:
def is_contain_other_character(sentence):
    if whitelist_regex.search(sentence) is None:
        return False
    else:
        return True

In [8]:
def is_duplicated(sentence, sentence_set):
    if sentence in sentence_set:
        return True
    else:
        return False

In [9]:
def normalize_sentence(sentence):
    processing_time = [0] * len(normalize_regexs)
    normalized_count = [0] * len(normalize_regexs)
    for idx, regex in enumerate(normalize_regexs):
        routine_start_time = time.time()
        sentence, normalized_count[idx] = regex[0].subn(regex[1], sentence)
        processing_time[idx] = time.time() - routine_start_time
    return sentence, processing_time, [1 if sum(normalized_count) > 0 else 0, normalized_count]

In [10]:
def clean_chats(chats):
    start_time = time.time()
    processing_time = {length: 0, system: 0, simple: 0, filter: 0, normalize: [0] * len(normalize_regexs), duplicate: 0, question: 0]

    chat_set = set()
    chat_list = list()
    question_list = list()

    skipped_chat_count = {filter: 0, length: 0, system: 0, simple: 0, duplicate: 0}  # filter, length, system, simple, duplicate
    processed_chat_count = [chat: 0, question: 0  # chat, question
    normalized_count = [0, [0] * len(normalize_regexs)]

    for chat in chats:
        if chat[-1] == "\n":
            chat = chat[:-1]

        # 1. check length
        routine_start_time = time.time()
        if len(chat) < length_minimum:
            skipped_chat_count[1] += 1
            continue
        processing_time[0] += time.time() - routine_start_time

        # 2. check system message
        routine_start_time = time.time()
        if is_system_message(chat):
            skipped_chat_count[2] += 1
            continue
        processing_time[1] += time.time() - routine_start_time

        # 3. check simple sentence
        routine_start_time = time.time()
        if is_simple_sentence(chat):
            skipped_chat_count[3] += 1
            continue
        processing_time[2] += time.time() - routine_start_time

        # 0. filter characters
        routine_start_time = time.time()
        if is_contain_other_character(chat):
            skipped_chat_count[0] += 1
            continue
        processing_time[3] += time.time() - routine_start_time
        
        # Normalize sentence
        chat, normalize_time, count = normalize_sentence(chat)
        normalized_count[0] += count[0]
        for idx, val in enumerate(normalize_time):
            processing_time[4][idx] += val
        for idx, val in enumerate(count[1]):
            normalized_count[1][idx] += val

        # 4. check duplication
        routine_start_time = time.time()
        if is_duplicated(chat, chat_set):
            skipped_chat_count[4] += 1
            continue
        processing_time[5] += time.time() - routine_start_time

        chat_set.add(chat)
        chat_list.append(chat + "\n")
        processed_chat_count[0] += 1

        # 5. check question
        routine_start_time = time.time()
        if not is_question(chat):
            continue
        processing_time[6] += time.time() - routine_start_time

        question_list.append(chat + "\n")
        processed_chat_count[1] += 1

    return chat_list, question_list, [time.time() - start_time, processing_time], [skipped_chat_count, processed_chat_count, normalized_count]

In [11]:
def save_chats_and_questions(filename, chat_list, question_list):
    filename_split = filename.split(".")
    chat_filename = ".".join(filename_split[:-1]) + "_chat." + filename_split[-1]
    question_filename = ".".join(filename_split[:-1]) + "_question." + filename_split[-1]

    with codecs.open(chat_filename, 'w', encoding='utf-8') as output_file:
        output_file.writelines(chat_list)
    with codecs.open(question_filename, 'w', encoding='utf-8') as output_file:
        output_file.writelines(question_list)

In [12]:
def analyze_result(processing_time, count):
    total_time = processing_time[0]
    routine_time = processing_time[1]
    
    skipped = count[0]
    processed = count[1]
    normalized = count[2]

    total_skipped = sum(skipped)
    total = total_skipped + processed[0]
    total_normalized = sum(normalized[1])

    
    print("")
    print("Total processing time: %.3f secs" % (total_time))
    print("0. Check length     : %.3f secs" % (routine_time[0]))
    print("1. Check system msg : %.3f secs" % (routine_time[1]))
    print("2. Check simplicity : %.3f secs" % (routine_time[2]))
    print("3. Check char filter: %.3f secs" % (routine_time[3]))
    print("4. Normalize chat   : %.3f secs" % (sum(routine_time[4])))
    print("    0. Normalize general: %.3f secs" % (routine_time[4][0]))
    print("    1. Normalize ㅋㅋ    : %.3f secs" % (routine_time[4][1]))
    print("    2. Normalize ㅔㅖ    : %.3f secs" % (routine_time[4][2]))
    print("    3. Normalize ㅠㅜ    : %.3f secs" % (routine_time[4][3]))
    print("    4. Normalize ?!     : %.3f secs" % (routine_time[4][4]))
    print("    5. Normalize .,;    : %.3f secs" % (routine_time[4][5]))
    print("5. Check duplication: %.3f secs" % (routine_time[5]))
    print("6. Check question   : %.3f secs" % (routine_time[0]))
    print("")
    print("Total chats: %d" % total)
    print("Skipped chats: %d (%.2f%%)" % (total_skipped, total_skipped / total * 100))
    print("  0. Skipped chats by char filter : %d (%.2f%%)" % (skipped[0], skipped[0] / total_skipped * 100))
    print("  1. Skipped chats by short length: %d (%.2f%%)" % (skipped[1], skipped[1] / total_skipped * 100))
    print("  2. Skipped chats by system msg  : %d (%.2f%%)" % (skipped[2], skipped[2] / total_skipped * 100))
    print("  3. Skipped chats by simplicity  : %d (%.2f%%)" % (skipped[3], skipped[3] / total_skipped * 100))
    print("  4. Skipped chats by duplication : %d (%.2f%%)" % (skipped[4], skipped[4] / total_skipped * 100))
    print("Processed chats: %d (%.2f%%)" % (processed[0], processed[0] / total * 100))
    print("  Processed normal chats: %d (%.2f%%)" % (processed[0] - processed[1], 100 - (processed[1] / processed[0] * 100)))
    print("  Processed questions   : %d (%.2f%%)" % (processed[1], processed[1] / processed[0] * 100))
    print("")
    print("Normalized chats: %d (%.2f%%), total %d words" % (normalized[0], normalized[0] / processed[0] * 100, total_normalized))
    print("  0. Normalized general: %d (%.2f%%)" % (normalized[1][0], normalized[1][0] / total_normalized * 100))
    print("  1. Normalized ㅋㅋ    : %d (%.2f%%)" % (normalized[1][1], normalized[1][1] / total_normalized * 100))
    print("  2. Normalized ㅔㅖ    : %d (%.2f%%)" % (normalized[1][2], normalized[1][2] / total_normalized * 100))
    print("  3. Normalized ㅠㅜ    : %d (%.2f%%)" % (normalized[1][3], normalized[1][3] / total_normalized * 100))
    print("  4. Normalized ?!     : %d (%.2f%%)" % (normalized[1][4], normalized[1][4] / total_normalized * 100))
    print("  5. Normalized .,;    : %d (%.2f%%)" % (normalized[1][5], normalized[1][5] / total_normalized * 100))

### Main

In [1]:
filename = input("filename: ")
chats = get_chats(filename)
chat_list, question_list, processing_time, count = clean_chats(chats)
save_chats_and_questions(filename, chat_list, question_list)
analyze_result(processing_time, count)

filename:  ../../Data/1to100.txt

Total processing time: 557.858 secs
0. Check length     : 7.885 secs
1. Check system msg : 11.159 secs
2. Check simplicity : 45.809 secs
3. Check char filter: 28.469 secs
4. Normalize chat   : 249.004 secs
  0. Normalize general: 83.430 secs
  1. Normalize ㅋㅋ    : 25.612 secs
  2. Normalize ㅔㅖ    : 23.766 secs
  3. Normalize ㅠㅜ    : 23.208 secs
  4. Normalize ?!     : 46.105 secs
  5. Normalize .,;    : 46.883 secs
5. Check duplication: 19.531 secs
6. Check question   : 7.885 secs

Total chats: 49537148
Skipped chats: 30616206 (61.80%)
  0. Skipped chats by char filter : 414105 (1.35%)
  1. Skipped chats by short length: 15496703 (50.62%)
  2. Skipped chats by system msg  : 41955 (0.14%)
  3. Skipped chats by simplicity  : 9794111 (31.99%)
  4. Skipped chats by duplication : 4869332 (15.90%)
Processed chats: 18920942 (38.20%)
  Processed normal chats: 16956693 (89.62%)
  Processed questions   : 1964249 (10.38%)

Normalized chats: 5254033 (27.77%), tota