In [None]:
from main import *

In [9]:
from glob import glob
from bs4 import BeautifulSoup
from tqdm import tqdm
from html2text import html2text
from collections import defaultdict
from fuzzywuzzy import fuzz
from time import sleep
import pickle
from difflib import SequenceMatcher
import re
import pandas as pd
import os


def split_text(text):
    """Split ``text`` into diffable tokens without losing any characters.

    Tokens are, in order of preference:

    * a word: a run starting with a word character that may contain
      apostrophes and hyphens internally (``don't``, ``well-known``);
    * a run of whitespace;
    * any single remaining character (punctuation, quotes, brackets, ...).

    Every character of the input ends up in exactly one token, so
    ``''.join(split_text(text)) == text`` always holds.

    Args:
        text: The string to tokenise.

    Returns:
        A list of token strings in their original order.
    """
    # The final alternative is a catch-all: without it re.findall would
    # silently skip characters such as ':', '"' or '(' and they would vanish
    # from any string rebuilt from the tokens.
    return re.findall(r'\b\w[\w\'-]*\b|\s+|[^\w\s]', text)


def highlight_differences(old_str, new_str):
    """Return HTML-highlighted copies of two strings showing how they differ.

    Both strings are tokenised with ``split_text`` and aligned with
    ``difflib.SequenceMatcher``. Tokens that only occur on the old side are
    wrapped in a red ``<span>`` in the first result; tokens that only occur
    on the new side are wrapped in a green ``<span>`` in the second result.
    Tokens common to both sides are copied through unchanged.

    NOTE: tokens are inserted into the markup as-is (no HTML escaping).

    Args:
        old_str: The earlier version of the text.
        new_str: The later version of the text.

    Returns:
        A tuple ``(highlighted_old_str, highlighted_new_str)``.
    """
    old_words = split_text(old_str)
    new_words = split_text(new_str)

    matcher = SequenceMatcher(None, old_words, new_words)

    highlighted_old = []
    highlighted_new = []

    for opcode, a1, a2, b1, b2 in matcher.get_opcodes():
        if opcode == 'equal':
            highlighted_old.extend(old_words[a1:a2])
            highlighted_new.extend(new_words[b1:b2])
            continue

        # 'replace', 'delete' or 'insert'. For 'delete' the new-side slice is
        # empty and for 'insert' the old-side slice is empty, so wrapping
        # both slices covers all three cases and emits each token exactly
        # once (inserted tokens must not also be copied unhighlighted).
        highlighted_old.extend(
            '<span style="color: red;">{}</span>'.format(word)
            for word in old_words[a1:a2]
        )
        highlighted_new.extend(
            '<span style="color: green;">{}</span>'.format(word)
            for word in new_words[b1:b2]
        )

    highlighted_old_str = ''.join(highlighted_old)
    highlighted_new_str = ''.join(highlighted_new)

    return highlighted_old_str, highlighted_new_str


def _collect_dialog_lines(html_files, progress_desc):
    """Parse dialogue HTML files into a ``character -> set of lines`` mapping.

    For every ``<div class="npc">`` the div's text is used as the character
    name, and the next ``<span class="dialog">`` in the document is converted
    with ``html2text`` and stored (stripped) under that name.
    """
    lines_by_character = defaultdict(set)
    for fn in tqdm(html_files, desc=progress_desc):
        # Close each handle promptly: a patch holds thousands of files.
        with open(fn, "r", encoding='latin-1') as file:
            source = BeautifulSoup(file.read(), 'lxml')

        for div_element in source.find_all('div', class_="npc"):
            dialog_span = div_element.find_next('span', class_='dialog')
            if dialog_span is None:
                # No dialog follows this NPC marker; str(None) would
                # otherwise be recorded as the bogus line "None".
                continue
            character = div_element.text
            lines_by_character[character].add(html2text(str(dialog_span)).strip())
    return lines_by_character


def read_patch_dialogs(patch_former, patch_latter):
    """Load the dialogue lines of two patches from ``data/<patch>/**/*.html``.

    Args:
        patch_former: Directory name (under ``data/``) of the earlier patch.
        patch_latter: Directory name (under ``data/``) of the later patch.

    Returns:
        A tuple ``(former_lines_dict, latter_lines_dict)``; each is a
        ``defaultdict(set)`` mapping a character name to the set of that
        character's dialogue lines found in the respective patch.
    """
    dialogue_files_former = glob(f"data/{patch_former}/**/*.html", recursive=True)
    dialogue_files_latter = glob(f"data/{patch_latter}/**/*.html", recursive=True)

    former_lines_dict = _collect_dialog_lines(
        dialogue_files_former, "Processing former patch, files processed"
    )
    latter_lines_dict = _collect_dialog_lines(
        dialogue_files_latter, "Processing latter patch, files processed"
    )

    return former_lines_dict, latter_lines_dict


def calculate_differences(former_patch_name, latter_patch_name, former_lines_dict, latter_lines_dict,
                          similarity_threshold=80):
    """Diff two patches' dialogue per character and pickle the result.

    For each character, every line present in the latter patch but not in the
    former is classified as:

    * "changed" - some former line of the same character has a
      ``fuzz.ratio`` score strictly above ``similarity_threshold``; stored as
      an ``(old_line, new_line)`` tuple using the first such former line;
    * "new" - no former line is similar enough.

    Characters with no changed and no new lines are omitted. The resulting
    ``defaultdict(dict)`` (``character -> {"changed": list, "new": set}``) is
    written to ``cache/cd_<former_patch_name>-<latter_patch_name>.pkl``.

    Args:
        former_patch_name: Name of the earlier patch, used in the file name.
        latter_patch_name: Name of the later patch, used in the file name.
        former_lines_dict: Mapping ``character -> lines`` for the earlier patch.
        latter_lines_dict: Mapping ``character -> lines`` for the later patch.
        similarity_threshold: Score a former line must exceed for a latter
            line to count as a changed version of it. Defaults to 80.

    Returns:
        None. The result is only written to disk.
    """
    all_characters = set(former_lines_dict) | set(latter_lines_dict)
    char_lines_dict = defaultdict(dict)
    no_lines = frozenset()

    for character in tqdm(all_characters, desc="Finding differences per character"):
        # .get() instead of indexing: indexing a defaultdict would insert an
        # empty entry into the caller's mapping, and a plain dict would raise.
        former_lines = former_lines_dict.get(character, no_lines)
        latter_lines = latter_lines_dict.get(character, no_lines)

        # Sorted so that "first former line above the threshold" is the same
        # on every run; set iteration order of strings is not stable.
        former_sorted = sorted(former_lines)
        candidates = sorted(l for l in latter_lines if l not in former_lines)

        new_lines = set()
        changed_lines = []
        for line in candidates:
            for old_line in former_sorted:
                if fuzz.ratio(line, old_line) > similarity_threshold:
                    changed_lines.append((old_line, line))
                    break
            else:
                new_lines.add(line)

        if changed_lines or new_lines:
            char_lines_dict[character]["changed"] = changed_lines
            char_lines_dict[character]["new"] = new_lines

    os.makedirs("cache", exist_ok=True)
    cache_path = f"cache/cd_{former_patch_name}-{latter_patch_name}.pkl"
    # Write to a temporary name and rename afterwards so an interrupted run
    # never leaves a truncated pickle under the final name.
    tmp_path = cache_path + ".tmp"
    with open(tmp_path, 'wb') as file:
        pickle.dump(char_lines_dict, file)
    os.replace(tmp_path, cache_path)
            

# Each entry directly under data/ is treated as a patch dump, except entries
# whose path contains "Parser". Only the first path component after data/ is
# kept.
patches = sorted(
    entry.replace("\\", '/').split('/')[1]
    for entry in glob("data/*")
    if "Parser" not in entry
)
former_patch_index = 0

# Walk every (earlier, later) pair in sorted order and build the diff cache
# for pairs that do not have a pickle on disk yet.
for i in range(len(patches)):
    for j in range(i + 1, len(patches)):
        former_patch, latter_patch = patches[i], patches[j]
        # The cache file is keyed by the part of the name before ' - '.
        former_patch_name = former_patch.split(' - ')[0]
        latter_patch_name = latter_patch.split(' - ')[0]

        if not os.path.isfile(f"cache/cd_{former_patch_name}-{latter_patch_name}.pkl"):
            former, latter = read_patch_dialogs(former_patch, latter_patch)
            calculate_differences(former_patch_name, latter_patch_name, former, latter)

Processing former patch, files processed: 100%|██████████| 9208/9208 [00:46<00:00, 198.82it/s]
Processing latter patch, files processed: 100%|██████████| 9218/9218 [00:46<00:00, 196.93it/s]
Finding differences per character: 100%|██████████| 1991/1991 [00:12<00:00, 159.35it/s]
Processing former patch, files processed: 100%|██████████| 9208/9208 [00:46<00:00, 199.77it/s]
Processing latter patch, files processed: 100%|██████████| 9231/9231 [00:50<00:00, 183.55it/s]
Finding differences per character: 100%|██████████| 1996/1996 [01:35<00:00, 20.88it/s] 
Processing former patch, files processed: 100%|██████████| 9208/9208 [00:44<00:00, 205.99it/s]
Processing latter patch, files processed: 100%|██████████| 9234/9234 [00:50<00:00, 183.52it/s]
Finding differences per character: 100%|██████████| 1996/1996 [04:57<00:00,  6.71it/s]
Processing former patch, files processed: 100%|██████████| 9218/9218 [00:45<00:00, 204.01it/s]
Processing latter patch, files processed: 100%|██████████| 9231/9231 [00