In [None]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm
import Levenshtein
import matplotlib.pyplot as plt
import fastDamerauLevenshtein

In [None]:
# splits string in words
def split_strings(str1, str2):
    """Split both strings on whitespace.

    Returns a two-element list: the words of ``str1`` followed by the
    words of ``str2``.
    """
    return [text.split() for text in (str1, str2)]

# checks if wordcount in both strings is equal
def same_wordcounts(lst1, lst2):
    """Return True when both sequences contain the same number of items."""
    count1, count2 = len(lst1), len(lst2)
    return count1 == count2

# deletes non alphabetical characters from string
def skip_no_alpha(string):
    """Return ``string`` with every non-alphabetic character removed.

    A character is kept when ``str.isalpha()`` is true for it; the order of
    the remaining characters is unchanged.
    """
    return "".join(char for char in string if char.isalpha())

# checks if two numbers differ by exactly one (increment or decrement)
def is_increment(nr1, nr2):
    """Return True when ``nr2`` is exactly one above or one below ``nr1``."""
    neighbours = (nr1 + 1, nr1 - 1)
    return nr2 in neighbours

# checks if the case (upper/lower) of the first letter is switched
def is_first_letter_caseswitch(str1, str2):
    """Return True when the first characters of the two strings differ in case.

    That is, one string starts with an uppercase character and the other
    with a lowercase character. Only the case category of the first
    character is inspected; the characters themselves are not compared.

    Empty strings have no first character, so the result is False for them
    (indexing ``[0]`` would otherwise raise IndexError).
    """
    if not str1 or not str2:
        return False
    first1, first2 = str1[0], str2[0]
    return (first1.isupper() and first2.islower()) or (
        first1.islower() and first2.isupper())


In [None]:
def get_levenshtein_dists(lst1, lst2, only_alpha=False):
    if len(lst1) != len(lst2):
        print("Difference words counts of lists!")
        return
    dists = []
    if only_alpha:
        for i in range(len(lst1)):
            dists.append(int(fastDamerauLevenshtein.damerauLevenshtein(
                skip_no_alpha(lst1[i]), skip_no_alpha(lst2[i]), similarity=False)))
    else:
        for i in range(len(lst1)):
            dists.append(int(fastDamerauLevenshtein.damerauLevenshtein(
                lst1[i], lst2[i], similarity=False)))
    return dists

# splits strings in words


def get_words_and_dists(str1, str2, skip_no_alpha=False):
    """Split both strings into words and measure per-word edit distances.

    Returns a tuple ``(words, dists)`` where ``words`` is the two-element
    list produced by ``split_strings`` and ``dists`` is the list returned by
    ``get_levenshtein_dists``. When the word counts differ, ``dists`` is an
    empty list.

    Note: the ``skip_no_alpha`` parameter is a flag that shadows the
    module-level function of the same name inside this body; it is passed
    on as the ``only_alpha`` argument.
    """
    words = split_strings(str1, str2)
    first_words, second_words = words[0], words[1]
    if len(first_words) != len(second_words):
        return words, []
    return words, get_levenshtein_dists(first_words, second_words, skip_no_alpha)


In [None]:
def word_in_dict(str1, words_dict):
    """Return the result of the membership test ``str1 in words_dict``."""
    is_known = str1 in words_dict
    return is_known

def is_typo_fixed(str1, str2, words_dict, lowercase=True):
    """Classify a single-word change from ``str1`` to ``str2``.

    Args:
        str1: the word before the edit.
        str2: the word after the edit.
        words_dict: collection of known words. Currently unused because the
            dictionary check below is commented out.
        lowercase: currently unused because the lower-casing step below is
            commented out.

    Returns:
        0: no case matched.
        1: word was not in the dictionary before (misspelled) -- disabled.
        2: first letter switched case -- disabled.
        3: both words are decimal numbers and the change is not a +/-1 step.
    """
    # Detects number errors (transposed digits, typos); skips increments.
    # TODO: problematic for edit wars on date of birth
    # str.isdecimal() is used instead of str.isdigit(): isdigit() also accepts
    # characters such as superscripts ("²") that int() cannot parse, which
    # would raise ValueError here.
    if str1.isdecimal() and str2.isdecimal() and not is_increment(int(str1), int(str2)):
        return 3

    # if is_first_letter_caseswitch(str1,str2):
    #     return 2

    # if lowercase:
    #     str1=str1.lower()
    #     str2=str2.lower()

    # if (not word_in_dict(str1, words_dict) and word_in_dict(str2, words_dict)):
    #     return 1

    return 0

In [None]:
def get_typo_type(str1, str2, words_dict, only_alpha=False):
    """Return the typo codes for all word pairs with edit distance 1 or 2.

    The strings are split and compared through ``get_words_and_dists``. For
    every position whose distance is greater than 0 and at most 2, the code
    returned by ``is_typo_fixed`` is collected. Positions outside that range
    contribute nothing, so the result may be shorter than the word count. It
    is empty when no distances were produced (word counts differ).
    """
    words, levenshtein_dists = get_words_and_dists(str1, str2, only_alpha)
    before_words, after_words = words[0], words[1]
    return [
        is_typo_fixed(before_words[pos], after_words[pos], words_dict)
        for pos, dist in enumerate(levenshtein_dists)
        if 0 < dist <= 2
    ]


In [None]:
# Read the whitespace-separated word list into a set. The `with` block
# closes the file handle, which the bare open() call never did.
with open("../../../words_alpha.txt", "r") as my_file:
    words_dict = set(my_file.read().split())

# Small manual check of the typo pipeline. testcase1 (German) is defined
# but not passed to the call below; only the English pair is evaluated.
testcase1 = ["Hier sind kkeine Fheler", "Hier sind keine Fehler"]
testcase1_en = ["There are nno erorrs", "There are no errors"]
typo_lst = get_typo_type(testcase1_en[0], testcase1_en[1], words_dict)
print(typo_lst)

In [None]:
def typo_check(str1, str2, words_dict, skip_no_alpha=False):
    """Report whether the change from ``str1`` to ``str2`` contains a typo fix.

    Returns True if at least one code produced by ``get_typo_type`` is
    greater than 0, and False otherwise (including when no codes were
    produced at all).
    """
    typo_codes = get_typo_type(str1, str2, words_dict, skip_no_alpha)
    return any(code > 0 for code in typo_codes)

In [None]:
# Recursively collect every *.json path below the input directory and keep
# only the entries that are regular files.
input_data = Path("../../matched-infoboxes-extracted/")
inp = [*input_data.rglob('*.json')]
files = list(filter(Path.is_file, inp))
len(files)

In [None]:
# Read the first `number_of_files` files line by line (one JSON object per
# line) and flatten every edit into one tuple per changed property.
number_of_files = 20
num_edits = 0
change_tuples = []
for file in tqdm(files[:number_of_files]):
    with open(file, 'r', encoding='utf-8') as f:
        for jsonObj in f:
            single_edit = json.loads(jsonObj)
            num_edits += 1

            # Required keys are indexed directly; optional keys fall back
            # to None through dict.get().
            title = single_edit['pageTitle']
            pageID = single_edit['pageID']
            key = single_edit['key']
            template = single_edit.get('template')
            changes = single_edit['changes']
            timestamp = single_edit['validFrom']
            revisionId = single_edit['revisionId']
            attributes = single_edit.get('attributes')

            user = single_edit['user']
            user_name = user.get('username')
            user_id = user.get('id')
            user_ip = user.get('ip')

            for change in changes:
                name = change['property']['name']
                current_value = change.get('currentValue')
                previous_value = change.get('previousValue')
                # Positions 5 and 6 (previous / current value) are read by
                # index in later cells, so the tuple order must stay fixed.
                change_tuples.append((
                    title, pageID, key, template, name,
                    previous_value, current_value,
                    timestamp, revisionId,
                    user_name, user_id, user_ip, attributes,
                ))
print("Number of edits:", num_edits)
print("Number of change tuples:", len(change_tuples))

In [None]:
# Read the whitespace-separated word list into a set. The `with` block
# closes the file handle, which the bare open() call never did.
with open("../../../words_alpha.txt", "r") as my_file:
    words_dict = set(my_file.read().split())

In [None]:
# One entry per change tuple: True/False from typo_check, or None when the
# change is a creation or deletion (previous or current value missing).
typo_lst = []
for i in tqdm(range(len(change_tuples))):
    prev_val, curr_val = change_tuples[i][5], change_tuples[i][6]
    if prev_val is None or curr_val is None:
        typo_lst.append(None)
        continue
    typo_lst.append(typo_check(prev_val, curr_val, words_dict, False))

In [None]:
# Tally the three possible outcomes stored in typo_lst.
counts = dict.fromkeys(("typo fixed", "no typo", "not tested"), 0)
for typo in typo_lst:
    if typo is True:
        counts["typo fixed"] += 1
    elif typo is False:
        counts["no typo"] += 1
    elif typo is None:
        counts["not tested"] += 1
print(counts)
# Sum of all three buckets.
print(sum(counts.values()))

In [None]:
print("typo fixed:", counts["typo fixed"])
print("no typo:", counts["no typo"])
print("not tested:", counts["not tested"])
# Share of tested changes classified as typo fixes. The value is a ratio,
# not multiplied by 100, despite the "%" in the label.
tested_total = counts["typo fixed"] + counts["no typo"]
# Guard against ZeroDivisionError when no change was tested at all.
typo_fix_ratio = counts["typo fixed"] / tested_total if tested_total else float("nan")
print("typo fix %:", typo_fix_ratio)

In [None]:
# Positions in typo_lst whose entry compares equal to True (detected typos).
idx = [pos for pos, flagged in enumerate(typo_lst) if flagged == True]

## Dataframe

In [None]:
# Column names follow the field order of the tuples in change_tuples.
column_names = [
    'pageTitle', 'pageID', 'key', 'template', 'name',
    'previous_value', 'current_value', 'timestamp', 'revisionId',
    'user_name', 'user_id', 'user_ip', 'attributes',
]
data = pd.DataFrame(change_tuples, columns=column_names).assign(
    # Convert the timestamp column with pandas' datetime parser.
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'])
)

In [None]:
# Show the last 20 rows flagged as typos.
data.iloc[idx[-20:]]

## Swear words

In [None]:
def check_swear(str1, str2, words_dict, lowercase=True):
    """Check whether a listed word was added or removed between two strings.

    Args:
        str1: previous string.
        str2: current string.
        words_dict: collection of words to look for.
        lowercase: when true, both strings are lower-cased before splitting.

    Returns:
        0: no listed word in either string.
        1: listed word only in the current string (added).
        2: listed word only in the previous string (removed).
        3: listed word in both strings.
    """
    previous, current = (str1.lower(), str2.lower()) if lowercase else (str1, str2)

    # A string counts as a hit if any whitespace-separated token is listed.
    prev_swear = any(word_in_dict(token, words_dict) for token in previous.split())
    curr_swear = any(word_in_dict(token, words_dict) for token in current.split())

    outcome_codes = {
        (False, False): 0,  # swear word in none
        (False, True): 1,   # swear word added
        (True, False): 2,   # swear word removed
        (True, True): 3,    # swear word in both
    }
    return outcome_codes[(prev_swear, curr_swear)]

In [None]:
# Read the whitespace-separated swear-word list into a set. The `with`
# block closes the file handle, which the bare open() call never did.
with open("../../../words_swear.txt", "r") as swear_file:
    swear_dict = set(swear_file.read().split())

# One check_swear code per change tuple; None marks creations/deletions
# (previous or current value missing).
swear_lst = []
for i in tqdm(range(len(change_tuples))):
    if(change_tuples[i][5] is not None and change_tuples[i][6] is not None):
        swear_lst.append(check_swear(
            change_tuples[i][5], change_tuples[i][6], swear_dict))
    else:
        swear_lst.append(None)


counts_swear = {"Swearwords added": 0,
                "Swearwords removed": 0,
                "Swearwords not touched": 0,
                "Swearwords not found": 0,
                "create or delete (skipped)": 0}
for test in swear_lst:
    # None is tested first with `is`; the integer codes are compared with
    # `==`. Identity checks against int literals (`test is 1`) raise a
    # SyntaxWarning and only work through CPython's small-int caching.
    if test is None:
        # prev or curr is None
        counts_swear["create or delete (skipped)"] += 1
    elif test == 1:
        counts_swear["Swearwords added"] += 1
    elif test == 2:
        counts_swear["Swearwords removed"] += 1
    elif test == 3:
        counts_swear["Swearwords not touched"] += 1
    elif test == 0:
        counts_swear["Swearwords not found"] += 1
print(counts_swear)

# idx_swear[0]: positions where a swear word was added;
# idx_swear[1]: positions where a swear word was removed.
idx_swear = [[], []]
for i in range(len(swear_lst)):
    if swear_lst[i] == 1:
        idx_swear[0].append(i)
    if swear_lst[i] == 2:
        idx_swear[1].append(i)


In [None]:
print("Swearwords added:", counts_swear["Swearwords added"])
print("Swearwords removed:", counts_swear["Swearwords removed"])
print("Swearwords not touched:", counts_swear["Swearwords not touched"])
print("Swearwords not found:", counts_swear["Swearwords not found"])
print("create or delete (skipped):", counts_swear["create or delete (skipped)"])
# Total of all buckets except the skipped creations/deletions. The bucket
# is excluded by key rather than by its position in the dict.
edit_count = sum(
    amount for label, amount in counts_swear.items()
    if label != "create or delete (skipped)"
)
print("Toal words (without create/delete):", edit_count)
# Guard against ZeroDivisionError when every change was skipped.
if edit_count:
    added_share = counts_swear["Swearwords added"] / edit_count
    removed_share = counts_swear["Swearwords removed"] / edit_count
else:
    added_share = removed_share = float("nan")
# The values are ratios (not multiplied by 100).
print("Percentage of swear words in edits added and removed:",
      added_share, removed_share)


## Swear words added

In [None]:
# TODO: decide how empty strings should be handled
# Show the first 50 rows where a swear word was added.
data.iloc[idx_swear[0][:50]]

## Swear words removed

In [None]:
# Show the first 50 rows where a swear word was removed.
data.iloc[idx_swear[1][:50]]