In [None]:
from main import *

In [5]:
from glob import glob
from bs4 import BeautifulSoup
from tqdm import tqdm
from html2text import html2text
from collections import defaultdict
from fuzzywuzzy import fuzz
from time import sleep
import pickle
from difflib import SequenceMatcher
import re
import pandas as pd
import os


def split_text(text):
    return re.findall(r'\b\w[\w\'-]*\b|\s+|[.,!?;]', text)


def highlight_differences(old_str, new_str):
    old_words = split_text(old_str)
    new_words = split_text(new_str)

    matcher = SequenceMatcher(None, old_words, new_words)
    diff = list(matcher.get_opcodes())

    highlighted_old = []
    highlighted_new = []

    for opcode, a1, a2, b1, b2 in diff:
        if opcode in ('replace', 'insert'):
            highlighted_new.extend(['<span style="color: green;">{}</span>'.format(word) for word in new_words[b1:b2]])
        if opcode in ('replace', 'delete'):
            highlighted_old.extend(['<span style="color: red;">{}</span>'.format(word) for word in old_words[a1:a2]])
        else:
            highlighted_old.extend(old_words[a1:a2])
            highlighted_new.extend(new_words[b1:b2])

    highlighted_old_str = ''.join(highlighted_old)
    highlighted_new_str = ''.join(highlighted_new)

    return highlighted_old_str, highlighted_new_str


def read_patch_dialogs(patch_former, patch_latter):
    dialogue_files_former = list(glob(f"data/{patch_former}/**/*.html", recursive=True))
    dialogue_files_latter = list(glob(f"data/{patch_latter}/**/*.html", recursive=True))

    former_lines_dict = defaultdict(set)
    latter_lines_dict = defaultdict(set)

    f_lines_count = 0
    for fn in tqdm(dialogue_files_former, desc=f"Processing former patch, files processed"):
        file = open(fn, "r", encoding='latin-1')
        index = file.read()
        source = BeautifulSoup(index, 'lxml')
        div_elements = source.find_all('div', class_="npc")
        for div_element in div_elements:
            character = div_element.text
            dialog_span = div_element.find_next('span', class_='dialog')
            former_lines_dict[character].add(f"{fn}||||{html2text(str(dialog_span)).strip()}")
            f_lines_count += 1

    l_lines_count = 0
    for fn in tqdm(dialogue_files_latter, desc=f"Processing latter patch, files processed"):
        file = open(fn, "r", encoding='latin-1')
        index = file.read()
        source = BeautifulSoup(index, 'lxml')
        div_elements = source.find_all('div', class_="npc")
        for div_element in div_elements:
            character = div_element.text
            dialog_span = div_element.find_next('span', class_='dialog')
            latter_lines_dict[character].add(f"{fn}||||{html2text(str(dialog_span)).strip()}")
            l_lines_count += 1

    return former_lines_dict, latter_lines_dict


def calculate_differences(former_patch_name, latter_patch_name, former_lines_dict, latter_lines_dict):
    all_characters = set(list(former_lines_dict.keys()) + list(latter_lines_dict.keys()))
    char_lines_dict = defaultdict(dict)
    for character in tqdm(all_characters, desc="Finding differences per character"):
        lines = [l for l in latter_lines_dict[character] if l not in former_lines_dict[character]]
        new_lines = []
        changed_lines = []
        for line_info in lines:
            fn, line = line_info.split("||||")
            for old_line_info in former_lines_dict[character]:
                old_fn, old_line = old_line_info.split("||||")
                bn, old_bn = os.path.basename(fn), os.path.basename(old_fn)
                if fuzz.ratio(line, old_line) > 80 and bn == old_bn:
                    changed_lines.append({"fn": fn, "lines": (old_line, line)})
                    break
            else:
                new_lines.append({"fn": fn, "line": line})
                
        if len(changed_lines) > 0 or len(new_lines) > 0:
            char_lines_dict[character]["changed"] = changed_lines
            char_lines_dict[character]["new"] = new_lines
    
    with open(f"cache/cd_{former_patch_name}-{latter_patch_name}.pkl", 'wb') as file:
        pickle.dump(char_lines_dict, file)
            

patches = [v.replace("\\", '/').split('/')[1] for v in glob("data/*") if "Parser" not in v]
patches.sort()
former_patch_index = 0    

for i in range(len(patches)):
    for j in range(i + 1, len(patches)):
        former_patch = patches[i]
        latter_patch = patches[j]
        former_patch_name = former_patch.split(' - ')[0]
        latter_patch_name = latter_patch.split(' - ')[0]
        print(former_patch_name, latter_patch_name)
        if os.path.isfile(f"cache/cd_{former_patch_name}-{latter_patch_name}.pkl"):
            continue
        
        former, latter = read_patch_dialogs(former_patch, latter_patch)
        calculate_differences(former_patch_name, latter_patch_name, former, latter)
        break

1.0 1.2


Processing former patch, files processed: 100%|██████████| 9208/9208 [01:42<00:00, 90.23it/s] 
Processing latter patch, files processed: 100%|██████████| 9218/9218 [01:43<00:00, 88.64it/s] 
Finding differences per character:  65%|██████▍   | 1290/1991 [24:24<13:15,  1.14s/it] 


KeyboardInterrupt: 

In [4]:
from glob import glob
from bs4 import BeautifulSoup
from tqdm import tqdm
from html2text import html2text
from collections import defaultdict
from fuzzywuzzy import fuzz
from time import sleep
import pickle
from difflib import SequenceMatcher
import re
import pandas as pd
import os
from joblib import Parallel, delayed
import pickle



def split_text(text):
    return re.findall(r'\b\w[\w\'-]*\b|\s+|[.,!?;]', text)


def highlight_differences(old_str, new_str):
    old_words = split_text(old_str)
    new_words = split_text(new_str)

    matcher = SequenceMatcher(None, old_words, new_words)
    diff = list(matcher.get_opcodes())

    highlighted_old = []
    highlighted_new = []

    for opcode, a1, a2, b1, b2 in diff:
        if opcode in ('replace', 'insert'):
            highlighted_new.extend(['<span style="color: green;">{}</span>'.format(word) for word in new_words[b1:b2]])
        if opcode in ('replace', 'delete'):
            highlighted_old.extend(['<span style="color: red;">{}</span>'.format(word) for word in old_words[a1:a2]])
        else:
            highlighted_old.extend(old_words[a1:a2])
            highlighted_new.extend(new_words[b1:b2])

    highlighted_old_str = ''.join(highlighted_old)
    highlighted_new_str = ''.join(highlighted_new)

    return highlighted_old_str, highlighted_new_str


def read_patch_dialogs(patch_former, patch_latter):
    dialogue_files_former = list(glob(f"data/{patch_former}/**/*.html", recursive=True))
    dialogue_files_latter = list(glob(f"data/{patch_latter}/**/*.html", recursive=True))


    
    if not os.path.isfile(f"data/read_caches/{patch_former}_former.pkl"):
        former_lines_dict = defaultdict(set)
        former_lines_info_dict = defaultdict(set)
        
        f_lines_count = 0
        for fn in tqdm(dialogue_files_former, desc=f"Processing former patch, files processed"):
            file = open(fn, "r", encoding='latin-1')
            index = file.read()
            source = BeautifulSoup(index, 'lxml')
            div_elements = source.find_all('div', class_="npc")
            for div_element in div_elements:
                character = div_element.text
                dialog_span = div_element.find_next('span', class_='dialog')
                former_lines_dict[character].add(html2text(str(dialog_span)).strip())
                former_lines_info_dict[character].add(f"{fn}||||{html2text(str(dialog_span)).strip()}")
                f_lines_count += 1

        former_pickle_dict = {"f_lines_count": f_lines_count, 
                              "former_lines_dict": former_lines_dict, 
                              "former_lines_info_dict": former_lines_info_dict}
        with open(f"data/read_caches/{patch_former}_former.pkl", 'wb') as pickle_file:
            pickle.dump(former_pickle_dict, pickle_file)
    else:
        with open(f"data/read_caches/{patch_former}_former.pkl", 'rb') as pickle_file:
            former_pickle_dict = pickle.load(pickle_file)
        f_lines_count, former_lines_dict, former_lines_info_dict = (former_pickle_dict["f_lines_count"],
                                                                   former_pickle_dict["former_lines_dict"],
                                                                   former_pickle_dict["former_lines_info_dict"])

    if not os.path.isfile(f"data/read_caches/{patch_latter}_latter.pkl"):
        latter_lines_dict = defaultdict(set)
        
        l_lines_count = 0
        for fn in tqdm(dialogue_files_latter, desc=f"Processing latter patch, files processed"):
            file = open(fn, "r", encoding='latin-1')
            index = file.read()
            source = BeautifulSoup(index, 'lxml')
            div_elements = source.find_all('div', class_="npc")
            for div_element in div_elements:
                character = div_element.text
                dialog_span = div_element.find_next('span', class_='dialog')
                latter_lines_dict[character].add(f"{fn}||||{html2text(str(dialog_span)).strip()}")
                l_lines_count += 1
                
        latter_pickle_dict = {"l_lines_count": l_lines_count, 
                              "latter_lines_dict": latter_lines_dict}
        with open(f"data/read_caches/{patch_latter}_latter.pkl", 'wb') as pickle_file:
            pickle.dump(latter_pickle_dict, pickle_file)
    else:
        with open(f"data/read_caches/{patch_latter}_latter.pkl", 'rb') as pickle_file:
            latter_pickle_dict = pickle.load(pickle_file)
        l_lines_count, latter_lines_dict = (latter_pickle_dict["l_lines_count"],
                                           latter_pickle_dict["latter_lines_dict"])

    return former_lines_info_dict, former_lines_dict, latter_lines_dict


def calculate_differences(former_patch_name, latter_patch_name, former_lines_info_dict, former_lines_dict, latter_lines_dict):
    all_characters = set(list(former_lines_dict.keys()) + list(latter_lines_dict.keys()))
    char_lines_dict = defaultdict(dict)
    
    def process_line(index, line_info, former_lines_info_dict, character):
        fn, line = line_info.split("||||")
        for old_line_info in former_lines_info_dict[character]:
            old_fn, old_line = old_line_info.split("||||")
            bn, old_bn = os.path.basename(fn), os.path.basename(old_fn)
            if fuzz.ratio(line, old_line) > 80 and bn == old_bn:
                return index, {"type": "changed", "fn": fn, "lines": (old_line, line)}
        return index, {"type": "new", "fn": fn, "line": line}
    
    for character in tqdm(all_characters, desc="Finding differences per character"):
        lines = [l for l in latter_lines_dict[character] if l.split("||||")[1] not in former_lines_dict[character]]
        results = Parallel(n_jobs=-1)(delayed(process_line)(i, line_info, former_lines_info_dict, character) for i, line_info in enumerate(lines))
        new_lines = [result[1] for result in sorted(results) if result[1]["type"] == "new"]
        changed_lines = [result[1] for result in sorted(results) if result[1]["type"] == "changed"]

        if len(changed_lines) > 0 or len(new_lines) > 0:
            char_lines_dict[character]["changed"] = changed_lines
            char_lines_dict[character]["new"] = new_lines
    
    with open(f"cache/cd_{former_patch_name}-{latter_patch_name}.pkl", 'wb') as file:
        pickle.dump(char_lines_dict, file)
            

patches = [v.replace("\\", '/').split('/')[1] for v in glob("data/*") if all(["Parser" not in v,
                                                                              "read_caches" not in v])]
patches.sort()
former_patch_index = 0    

for i in range(len(patches)):
    for j in range(i + 1, len(patches)):
        former_patch = patches[i]
        latter_patch = patches[j]
        former_patch_name = former_patch.split(' - ')[0]
        latter_patch_name = latter_patch.split(' - ')[0]
        print(former_patch_name, latter_patch_name)
        if os.path.isfile(f"cache/cd_{former_patch_name}-{latter_patch_name}.pkl"):
            continue
        
        former_info, former, latter = read_patch_dialogs(former_patch, latter_patch)
        calculate_differences(former_patch_name, latter_patch_name, former_info, former, latter)

1.0 1.2
1.0 1.3
1.0 1.4
1.0 1.5
1.0 1.6


Finding differences per character: 100%|██████████| 2008/2008 [08:04<00:00,  4.15it/s]


1.2 1.3
1.2 1.4
1.2 1.5
1.2 1.6


Finding differences per character: 100%|██████████| 2008/2008 [08:22<00:00,  4.00it/s]


1.3 1.4
1.3 1.5
1.3 1.6


Finding differences per character: 100%|██████████| 2008/2008 [07:37<00:00,  4.39it/s]


1.4 1.5
1.4 1.6


Finding differences per character: 100%|██████████| 2005/2005 [07:00<00:00,  4.77it/s]


1.5 1.6


Finding differences per character: 100%|██████████| 2005/2005 [01:55<00:00, 17.40it/s]


In [None]:
patch_former = "1.0 - Launch Day"
patch_latter = "1.6 - Patch 6"

with open(f"data/read_caches/{patch_former}_former.pkl", 'rb') as pickle_file:
    former_pickle_dict = pickle.load(pickle_file)
f_lines_count, former_lines_dict, former_lines_info_dict = (former_pickle_dict["f_lines_count"],
former_pickle_dict["former_lines_dict"],
former_pickle_dict["former_lines_info_dict"])

with open(f"data/read_caches/{patch_latter}_latter.pkl", 'rb') as pickle_file:
    latter_pickle_dict = pickle.load(pickle_file)
l_lines_count, latter_lines_dict = (latter_pickle_dict["l_lines_count"],
                                           latter_pickle_dict["latter_lines_dict"])
    
print(f_lines_count, l_lines_count)
all_characters = set(list(former_lines_dict.keys()) + list(latter_lines_dict.keys()))
print(len(all_characters))

In [None]:
print(list(all_characters))

In [None]:
print(len("9ff83d30-3dc9-4fde-b173-7cc83914baf6"))

In [None]:
for c in all_characters:
    if len(c) > 30:
        print(c)

In [None]:
ac2 = set()
for c in all_characters:
    if len(c) > 35 and '-' in c:
        pass
    elif "_" in c:
        pass
    elif ", " in c:
        cs = c.split(", ")
        for ci in cs:
            ac2.add(ci)
    else:
        ac2.add(c)

In [None]:
print(len(ac2))

In [None]:
print(ac2)

In [None]:
print(len("77595417-3b1f-48b7-9737-993b5da48c81"))