In [1]:
from utils.wikiparser_utils import WikiXMLDump, WikiPage
import os
import nltk
import wikitextparser as wtp
import json
import numpy as np
from duckduckgo_search import ddg
from tqdm import tqdm
from difflib import Differ 
from utils.difflibparser import DifflibParser, DiffCode
# from transformers import T5TokenizerFast, T5ForConditionalGeneration
import shutil
import re

from collections import Counter
import os
import time
from os import listdir
from os.path import isfile, join

import mwxml
import wikitextparser as wtp

In [2]:
DOCS_DIR = 'downloaded_data/documents_new_val'
PAGES_DIR = 'downloaded_data/revision_new_val'

if not os.path.exists(DOCS_DIR):
    os.makedirs(DOCS_DIR)
    
if not os.path.exists(PAGES_DIR):
    os.makedirs(PAGES_DIR)


In [3]:
down_docs = os.listdir(DOCS_DIR)
down_page = os.listdir(PAGES_DIR)
len(down_docs), len(down_page)

(0, 0)

In [4]:
def filter_page(page_name):
    return bool(re.search('[a-zA-Z]', page_name))

def filter_comment(comment_text, user_name):
    com_text = comment_text.strip()
    if 'bot' in com_text or 'bot' in user_name:
        return False
    
    if com_text[-2:] == '*/':
        return False
    return True

In [5]:
MIN_CHANGE_SYMB_LEN = 5
MAX_CHANGE_SYMB_LEN = 300
MAX_PAGE_SYMB_LEN = 20000
MAX_ABSTRACT_LEN = 800
MIN_ABSTRACT_LEN = 10
MIN_COMMENT_LEN = 5
abstarct_tokenizer = lambda x: x

In [6]:
def find_nearest(l, r, arr):
    l_ans, r_ans = -1, -1
    for sent_idx, (l_arr, r_arr) in enumerate(zip(arr, arr[1:])):
        if l_arr <= l < r_arr:
            l_ans = sent_idx
        if l_arr <= r < r_arr:
            r_ans = sent_idx
    if l_ans == -1:
        l_ans = len(arr) - 1
    if r_ans == -1:
        r_ans = len(arr) - 1
    return l_ans, r_ans

In [7]:
def clean_text(text):
    text = text.replace('=====', '==').replace('====', '==').replace('===', '==')
    text = re.sub('\[\[File.*?]]', '', text, count=0, flags=0)
    text = re.sub('\[\[Category:.*?]]', '', text, count=0, flags=0)
    text = re.sub('\[\[category:.*?]]', '', text, count=0, flags=0)
    text = wtp.remove_markup(text)
    text = text.replace('\t', '').replace('\n\n\n', '\n\n').replace('\n\n\n', '\n\n')
    text = text.replace('\n\n*', ', ').replace('\n\n*', ', ')
    return text
    
def clean_section_text(text):
    text = re.sub('==.*?==+', '', text, count=0, flags=0)
    return text.strip()

def text2sentences(text, sent_tokenizer=nltk.sent_tokenize):
    idxs_arr = []
    sents = sent_tokenizer(text)
    cur_str = text[:]
    cur_skip = 0
    idxs2sent = {}
    for sent in sents:
        match_idx = cur_str.find(sent)
        start_idx = match_idx + cur_skip
        idxs_arr.append(start_idx)
        finish_idx = match_idx + cur_skip + len(sent) - 1
        idxs2sent[(start_idx, finish_idx)] = sent
        if finish_idx + 1 < len(cur_str):
            cur_skip = finish_idx + 1
            cur_str = cur_str[match_idx + len(sent):]
    return idxs2sent, np.array(sents), idxs_arr

def extract_important_sections(text):
    parsed_text = wtp.parse(text)
    section_titles, section_texts = [], []
    for sec in parsed_text.sections:
        if not sec.title:
            #for par in sec.string.split('\n\n'):
            section_titles.append(sec.title)
            section_texts.append(clean_section_text(sec.string))
            continue
        if 'external links' in sec.title.lower():
            continue
        if 'references' in sec.title.lower():
            continue
        if 'notes' in sec.title.lower():
            continue
        if 'see also' in sec.title.lower():
            continue
        
        #for par in sec.string.split('\n\n'):
        section_titles.append(sec.title)
        section_texts.append(clean_section_text(sec.string))
    return section_titles, section_texts

def get_diff_num(prev_sections_texts, new_sections_texts):
    prev_set = set(enumerate(prev_sections_texts))
    new_set = set(enumerate(new_sections_texts))
    new_diff = new_set - prev_set
    prev_diff = prev_set - new_set
    if len(new_diff) > 0 and len(prev_diff) > 0:
        print('\n\n#############################################')
        print('WAS:\n')
        print(new_diff)
        print('\n-------------------------------------------\nNOW:\n')
        print(prev_diff)
        print('\n')
        return True
    return False

def get_diff_num2(prev_sections_texts, new_sections_texts):
    differ_obj = Differ()
    dif_result = list(DifflibParser(prev_sections_texts, new_sections_texts))
    result = []
    result_idxs = []
    old_text, new_text, last_diff_id = [], [], -1000
    for dif_id, dif_line in enumerate(dif_result):
        if dif_line['code'] != DiffCode.SIMILAR:
            if np.abs(dif_id - last_diff_id) > 0:
                result.append(dif_line)
                result_idxs.append(dif_id)
                last_diff_id = dif_id
    return result_idxs, result    

def get_changes(diffs):
    all_changes = []
    all_changes_sents = []
    for diff_id, diff_obj in enumerate(diffs):
        if diff_obj['code'] == DiffCode.RIGHTONLY:
            if len(abstarct_tokenizer(diff_obj['line'])) > MAX_ABSTRACT_LEN:
                continue
            if len(abstarct_tokenizer(diff_obj['line'])) < MIN_ABSTRACT_LEN:
                continue
            all_changes.append(([diff_obj['line']], 'r'))
            _, sents, _ = text2sentences(diff_obj['line'])
            all_changes_sents.append(sents)
            
        elif diff_obj['code'] == DiffCode.LEFTONLY:
            if len(abstarct_tokenizer(diff_obj['line'])) > MAX_ABSTRACT_LEN:
                continue
            if len(abstarct_tokenizer(diff_obj['line'])) < MIN_ABSTRACT_LEN:
                continue
            all_changes.append(([diff_obj['line']], 'l'))
            _, sents, _ = text2sentences(diff_obj['line'])
            all_changes_sents.append(sents)
            
        elif diff_obj['code'] == DiffCode.CHANGED:
            if len(abstarct_tokenizer(diff_obj['line'])) > MAX_ABSTRACT_LEN:
                continue
            if len(abstarct_tokenizer(diff_obj['newline'])) > MAX_ABSTRACT_LEN:
                continue
            idxs2sent, sents, idxs_arr = text2sentences(diff_obj['newline'])
            all_changes_sents = []
            r_change = diff_obj['rightchanges']
            cur_ch = -10
            prev_ch = -10
            all_r_changes = []
            changed_sents = []
            for ch in r_change:
                if prev_ch < 0:
                    prev_ch = ch
                    cur_ch = ch
                if np.abs(ch - cur_ch) > 1:
                    new_change = diff_obj['newline'][prev_ch:cur_ch+1]
                    if new_change.strip() != '' and len(new_change.strip()) > MIN_CHANGE_SYMB_LEN:
                        all_r_changes.append(new_change)
                        sents_idxs_l, sents_idxs_r = find_nearest(prev_ch, cur_ch+1, idxs_arr)
                        changed_sents += list(range(sents_idxs_l, sents_idxs_r+1))
                    prev_ch = ch
                cur_ch = ch
            new_change = diff_obj['newline'][prev_ch:cur_ch+1]
            if new_change.strip() != '' and len(new_change.strip()) > MIN_CHANGE_SYMB_LEN:
                all_r_changes.append(new_change)
                sents_idxs_l, sents_idxs_r = find_nearest(prev_ch, cur_ch+1, idxs_arr)
                changed_sents += list(range(sents_idxs_l, sents_idxs_r+1))
            all_changes.append((all_r_changes, 'c'))
            changed_sents = sorted(list(set(changed_sents)))
            all_changes_sents.append(sents[changed_sents])
    return all_changes, all_changes_sents

In [8]:
import datetime
print(datetime.datetime.now())

2022-12-12 19:18:43.697625


## TF-IDF section

DOC_COUNTER = 0
W2DC = Counter()
dump = mwxml.Dump.from_file(open('data/history6_last.xml', encoding="utf-8"))
pbar = tqdm(position=0, leave=True)
for page in dump:
    revisions = []
    for rev in page:
        revisions.append(rev)
    last_rev = revisions[-1]
    last_rev_text = ''
    if last_rev.text:
        last_rev_text = clean_text(last_rev.text).lower()
    
    tokens = list(set(nltk.word_tokenize(last_rev_text)))
    W2DC.update(tokens)
    DOC_COUNTER += 1
    pbar.update(1)

W2DC.most_common(10)

DOC_COUNTER

## Docs downloading

In [9]:
import datetime
import time
start_time = datetime.datetime.now()
print(start_time)

2022-12-12 19:18:43.701608


In [10]:
def count_doc_score(doc_text):
    doc_toks = nltk.word_tokenize(doc_text.lower())
    toks_counter = Counter(doc_toks)
    score = 0.0
    for tok, tok_count in toks_counter.items():
        if W2DC[tok] > 0:
            score += tok_count * np.log(DOC_COUNTER / W2DC[tok])
    return score

## Main part

In [11]:
DUMPS = os.listdir('dump')
DUMPS

['val_dump.xml']

In [12]:
counter = 0
total_counter = 0

for dump_name in DUMPS:
    dump = mwxml.Dump.from_file(open(f'dump/{dump_name}', encoding="utf-8"))
    pbar = tqdm(position=0, leave=True)

    for page in dump:
        if not filter_page(page.title):
            continue
        revisions = []
        for rev in page:
            revisions.append(rev)

        if len(revisions) < 2: 
            continue
        good_revisions = []

        last_added = len(revisions)
        for cur_rev_id in range(len(revisions) - 1, 1, -1):
            if cur_rev_id >= last_added:
                continue
            cur_rev = revisions[cur_rev_id]
            if cur_rev.text:
                cur_rev_text = cur_rev.text # clean_text(cur_rev.text)
            else:
                cur_rev_text = ''

            for new_rev_id in range(cur_rev_id, 0, -1):
                new_rev = revisions[new_rev_id]

                if new_rev.text:
                    new_rev_text = new_rev.text # clean_text(new_rev.text)
                else:
                    new_rev_text = ''

                if cur_rev_text == new_rev_text:
                    last_added = new_rev_id

            add_rev = revisions[last_added]
            user = ''
            if add_rev.user:
                if add_rev.user.text:
                    user = add_rev.user.text.lower()
            revision_dict = {
                'text': cur_rev_text,
                'comment': add_rev.comment,
                'id': add_rev.id,
                'page_name': page.title,
                'user_name': user
            }
            good_revisions.append(revision_dict)
        good_revisions = good_revisions[::-1]

        for prev_rev, new_rev in zip(good_revisions[:], good_revisions[1:]):
            total_counter += 1
            comment = new_rev['comment']
            if comment and len(comment.strip()) > MIN_COMMENT_LEN:
                if filter_comment(comment, new_rev['user_name']):
                    if np.abs(len(new_rev['text']) - len(prev_rev['text'])) > MIN_CHANGE_SYMB_LEN:
                        if np.abs(len(new_rev['text']) - len(prev_rev['text'])) < MAX_CHANGE_SYMB_LEN:
                            if np.abs(len(new_rev['text'])) < MAX_PAGE_SYMB_LEN:
                                prev_section_titles, prev_section_texts = extract_important_sections(clean_text(prev_rev['text']))
                                new_section_titles, new_section_texts = extract_important_sections(clean_text(new_rev['text']))

                                r_idx, r = get_diff_num2(prev_section_texts, new_section_texts)
                                if len(r) == 1 and 'newline' in r[0]:
                                    # print(1)
                                    section_name = ''
                                    try:
                                        section_name_t = new_section_titles[r_idx[0]]
                                        if section_name_t:
                                            section_name = section_name_t
                                    except:
                                        pass
                                    ts = new_rev['page_name'] + ' ' + section_name

                                    all_changes_r, all_changes_sents_r = get_changes(r)
                                    if len(all_changes_sents_r) > 0 and len(all_changes_sents_r[0]) > 0:
                                        ts = new_rev['page_name'] + ' ' + section_name                                    

                                        final_page_path = f"{PAGES_DIR}/{counter}.json"
                                        final_docs_path = f"{DOCS_DIR}/{counter}.txt"
                                        if os.path.exists(final_page_path) and os.path.exists(final_docs_path):
                                            counter += 1
                                            continue

                                        downloaded_docs = []
                                        search_queries_list = []
                                        q2docs_num = []
                                        for ch_text_idx, ch_text in enumerate(all_changes_r[0][0]):
                                                fq = ts.strip() + ' ' + ch_text
                                                # print(f'Final search query {ch_text_idx}:\t', fq)
                                                search_queries_list.append(fq)
                                                search_result = ddg(fq)
                                                counter_found_docs = 0
                                                if search_result is not None:
                                                    for search_result_obj in search_result:
                                                        downloaded_docs.append(search_result_obj['body'])
                                                        counter_found_docs += 1
                                                q2docs_num.append(counter_found_docs)

                                        json_obj = {
                                            "old_text": r[0]['line'],
                                            "new_text": r[0]['newline'],
                                            "title": new_rev['page_name'],
                                            "comment": comment,
                                            "section_name": section_name,
                                            "search_queries": search_queries_list,
                                            "counter_found_docs": q2docs_num,
                                            "change_texts": all_changes_r
                                        }

                                        final_page_path = f"{PAGES_DIR}/{counter}.json"
                                        with open(final_page_path, 'w', encoding='utf-8') as f:
                                            json.dump(json_obj, f)
                                        # changed_text = [ctxt for ctxt in all_changes_r[0][0]]
                                        # changed_text_full = ' '.join(changed_text)

                                        final_docs_path = f"{DOCS_DIR}/{counter}.txt"
                                        with open(final_docs_path, 'w', encoding='utf-8') as f:
                                            for doc_text_idx, doc_text in enumerate(downloaded_docs):
                                                f.write(doc_text)
                                                if doc_text_idx != len(downloaded_docs) - 1:
                                                    f.write("\n\nDOC_DELIMITER_TOKEN\n\n")

                                        counter += 1
            pbar.update(1)
            pbar.set_postfix(downloaded_docs=counter)
            with open('total_counter', 'w') as f:
                f.write(str(total_counter))
            with open('counter', 'w') as f:
                f.write(str(counter))
            
            if counter > 120:
                break
                



7321it [03:56,  5.72it/s, downloaded_docs=149]

KeyboardInterrupt: 

In [13]:
counter, total_counter

(149, 7321)

7321it [04:10,  5.72it/s, downloaded_docs=149]

In [None]:
finish_time = datetime.datetime.now()
print(start_time)
print(finish_time)

In [None]:
print(start_time)
print(finish_time)

In [None]:
counter

In [None]:
45 * 775