In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm
import Levenshtein
import matplotlib.pyplot as plt
import fastDamerauLevenshtein

In [2]:
# import os
# import random
# rnd_files = random.sample(os.listdir("../../../matched-infoboxes/"), 20)


In [3]:
# from shutil import copyfile
# for fil in rnd_files:
#     copyfile("../../../matched-infoboxes/"+fil, "../../../batch/"+fil)

### How does it work
Typos:
- previous and current values are splittet into words with non letters/whitespaces removed, if any are None or empty they are skipped
- only same wordcounts are tested (it is expected that words stay in the same order)
- words are compared to the words with the same index with Damerau-Levenshtein edit distance (swaps are cost 1 not 2)
- if any word has edit distance 1 or 2 it is further looked at
    1. test if first letter is a case swap (it is expected the user knows what is correct)
    2. test if previous word is not in a dictionary but current word is (comparison is done in lowercase)
- if any of the tests is true the change is marked as typo-fix

Swear words:
- previous and current values are splittet into words, if any are None or empty they are skipped
- words are compared to a swear word dictionary (https://github.com/RobertJGabriel/Google-profanity-words/blob/master/list.txt)
- if any word is matched the value is flagged as swear word
- if previous value has no swear word but current value has -> swear word added
- if previous value has a swear word but current value has not -> swear word deleted

In [4]:
# splits string in words
def split_strings(str1, str2):
    lst = [str1.split()]
    lst.append(str2.split())
    return lst

# checks if wordcount in both strings is equal


def same_wordcounts(lst1, lst2):
    return (len(lst1) == len(lst2))

# deletes non alphabetical characters from string


def skip_no_alpha(string):
    only_alpha = ""
    for char in string:
        if char.isalpha() or char == " ":
            only_alpha += char
    return only_alpha

# checks in numbers are increments


def is_increment(nr1, nr2):
    return (nr1+1 == nr2 or nr1-1 == nr2)

# checks if case (upper/loewr) of the first latter is switched


def is_first_letter_caseswitch(str1, str2):
    return (str1[0].isupper() and str2[0].islower() or str1[0].islower() and str2[0].isupper())


def is_not_empty_or_none(input):
    return input is not None and input is not ""


In [5]:
def get_levenshtein_dists(lst1, lst2):
    if len(lst1) != len(lst2):
        print("Difference words counts of lists!")
        return
    dists = []
    for i in range(len(lst1)):
            dists.append(int(fastDamerauLevenshtein.damerauLevenshtein(
                lst1[i], lst2[i], similarity=False)))
    return dists

# splits strings in words
def get_words_and_dists(str1, str2, only_alpha=False):
    if only_alpha:
        str1=skip_no_alpha(str1)
        str2=skip_no_alpha(str2)
    words = split_strings(str1, str2)
    if len(words[0]) == len(words[1]):
        dists = get_levenshtein_dists(words[0], words[1])
    else:
        dists = []
    return words, dists


In [6]:
def word_in_dict(str1, words_dict):
    return str1 in words_dict

def is_typo_fixed(str1, str2, words_dict, lowercase=True):
    """ Check if typo is fixed.
        return 0: no other case is found
        return 1: word was not in dict before (missspelled)
        return 2: word with swapped first letter (and other changes depending on edit distance)
    """
    # detects number errors (dreher,tippfehler), skipps increments. Only works if skip_no_alpha is false 
    if str1.isdigit() and str2.isdigit() and not is_increment(int(str1),int(str2)):
        return 3

    if is_first_letter_caseswitch(str1,str2):
        return 2

    if lowercase:
        str1=str1.lower()
        str2=str2.lower()

    # checks if str1 is not in dict but str2 is
    if (not word_in_dict(str1, words_dict) and word_in_dict(str2, words_dict)):
        return 1
        
    return 0

In [7]:
def get_typo_type(str1, str2, words_dict, upper_lev_distance=2, skip_no_alpha=False):
    words, levenshtein_dists = get_words_and_dists(
        str1, str2, skip_no_alpha)
    typo_lst = []
    for i in range(len(levenshtein_dists)):  # only loops if dists are found (word counts are equal)
        # only uses distances >0 <=2
        if(levenshtein_dists[i] > 0 and levenshtein_dists[i] <= upper_lev_distance):
            typo_lst.append(is_typo_fixed(
                words[0][i], words[1][i], words_dict))
        # else:  # appends None if dist is <0 or >2
        #     typo_lst.append(None)
    return typo_lst


In [8]:
my_file = open("../../../words_alpha.txt", "r")
words_dict=set(my_file.read().split("\n"))

testcase1 = ["Hier sind kkeine Fheler", "Hier sind keine Fehler"]
testcase1_en = ["There are nno erorrs", "There are no errors"]
typo_lst = get_typo_type(testcase1_en[0], testcase1_en[1], words_dict)
print(typo_lst)

[1, 1]


In [9]:
def typo_check(str1, str2, words_dict, upper_lev_distance=2, skip_no_alpha=False):
    """Return True if typo
       Return False if no typo
    """
    typo_lst = get_typo_type(str1, str2, words_dict, upper_lev_distance, skip_no_alpha)
    if len(typo_lst) == 0:
        return None
    for typo_type in typo_lst:
        if typo_type > 0:  # 1 is previous not in dict, current is in dict, 2 case switch on first letter
            return True
    return False

def typo_check_pandas(row, words_dict, upper_lev_distance=2, skip_no_alpha=False):
    """Return True if typo
       Return False if no typo
    """
    typo_lst = get_typo_type(row["currentValue"], row["previousValue"], words_dict, upper_lev_distance, skip_no_alpha)
    if len(typo_lst) == 0:
        return None
    for typo_type in typo_lst:
        if typo_type > 0:  # 1 is previous not in dict, current is in dict, 2 case switch on first letter
            return True
    return False

In [54]:
data

Unnamed: 0,pageTitle,pageID,key,template,name,previousValue,currentValue,validFrom,validTo,revisionId,user_name,user_id,user_ip,attributes


In [15]:
data["isTypo"]=data.apply(lambda row: typo_check_pandas(
    row, words_dict, upper_lev_distance=2, skip_no_alpha=False),axis=1)

In [10]:
def count_typos(typo_lst):
    counts = {"typo fixed": 0,
              "no typo": 0,
              "no levenshstein match": 0,
              "not tested": 0
              }
    for typo in typo_lst:
        if typo is True:
            counts["typo fixed"] += 1
        if typo is False:
            counts["no typo"] += 1
        if typo is None:
            counts["no levenshstein match"] += 1
        if typo is "not_tested":
            counts["not tested"] += 1
    counts["tested"] = counts["typo fixed"] + \
        counts["no typo"]+counts["no levenshstein match"]
    print("typo fixed:", counts["typo fixed"])
    print("no typo:", counts["no typo"])
    print("no levenshstein match:", counts["no levenshstein match"])
    print("not tested:", counts["not tested"])
    print("total changes:", counts["typo fixed"]+counts["no typo"] +
          counts["not tested"]+counts["no levenshstein match"])
    print("% of matching levenshtein distance:",
          (counts["typo fixed"]+counts["no typo"])/counts["tested"]*100)
    print("typo fix % of tested changes (only updates without creations/deletions):",
          counts["typo fixed"]/counts["tested"]*100)
    print("typo fix % of all changes:",
          (counts["typo fixed"]/(counts["typo fixed"]+counts["no typo"]+counts["not tested"]))*100)


In [11]:
input_data = Path("../../matched-infoboxes-extracted/")
inp = list(input_data.rglob('*.json'))
files = [x for x in inp if x.is_file()]
print("number of files:", len(files))

number of files: 20


In [40]:
fileshare_path=Path("Z:\mp2021\mpws2021\MPWS2021FN1")
my_file = open(fileshare_path / ", "r")
words_dict=set(my_file.read().split("\n"))

In [26]:
num_iteratons = 1
number_of_files = 1
number_of_files_start = 10
num_edits = 0
num_change_tuples = 0
typo_lst = []
timedeltas_all = []
timedeltas_typo = []
timedeltas_levensh = []
for _ in range(num_iteratons):
    change_tuples = []
    for file in tqdm(files[number_of_files_start:number_of_files_start+number_of_files]):
        with open(file, 'r', encoding='utf-8') as f:
            for jsonObj in f:
                single_edit = json.loads(jsonObj)
                num_edits += 1
                title = single_edit['pageTitle']
                pageID = single_edit['pageID']
                key = single_edit['key']
                template = single_edit['template'] if 'template' in single_edit.keys(
                ) else None
                changes = single_edit['changes']
                validFrom = single_edit['validFrom']
                revisionId = single_edit['revisionId']
                attributes = single_edit['attributes'] if 'attributes' in single_edit.keys(
                ) else None
                user_name = single_edit['user']['username'] if 'username' in single_edit['user'].keys(
                ) else None
                user_id = single_edit['user']['id'] if 'id' in single_edit['user'].keys(
                ) else None
                user_ip = single_edit['user']['ip'] if 'ip' in single_edit['user'].keys(
                ) else None
                for change in changes:
                    name = change['property']['name']
                    current_value = change['currentValue'] if 'currentValue' in change.keys(
                    ) else None
                    previous_value = change['previousValue'] if 'previousValue' in change.keys(
                    ) else None
                    validTo = change['valueValidTo'] if 'valueValidTo' in change.keys(
                    ) else None
                    change_tuples.append((title, pageID, key, template, name, previous_value,
                                          current_value, validFrom, validTo, revisionId, user_name, user_id, user_ip, attributes))

        data = pd.DataFrame(change_tuples, columns=['pageTitle', 'pageID', 'key', 'template', 'name', 'previousValue',
                                                    'currentValue', 'validFrom', 'validTo', 'revisionId', 'user_name', 'user_id', 'user_ip', 'attributes'])
        num_change_tuples += len(data)
        data = data[(data["currentValue"] != "") & (~data["currentValue"].isnull())]
        data = data[(data["previousValue"] != "") & (~data["previousValue"].isnull())]
        data = data[(data["validTo"] != "") & (~data["validTo"].isnull())]
        data['validFrom'] = pd.to_datetime(data['validFrom'])
        data['validTo'] = pd.to_datetime(data['validTo'])

        timedeltas_all.extend(data["validTo"]-data['validFrom'])

        data["isTypo"] = data.apply(lambda row: typo_check_pandas(
            row, words_dict, upper_lev_distance=2, skip_no_alpha=False), axis=1)

        timedeltas_typo.extend(data[data["isTypo"]==True]["validTo"]-data[data["isTypo"]==True]['validFrom'])
        timedeltas_levensh.extend(data[~data["isTypo"].isnull()]["validTo"]-data[~data["isTypo"].isnull()]['validFrom'])

    number_of_files_start += number_of_files

# upper_lev_dist=2
# for i in tqdm(range(len(change_tuples))):
#     # Check only changes (no creations/deletions)
#     if(is_not_empty_or_none(change_tuples[i][5]) and is_not_empty_or_none(change_tuples[i][6])):
#         typo_lst.append(typo_check(
#             change_tuples[i][5], change_tuples[i][6], words_dict, upper_lev_dist, True))

print("Read data:")
print("Number of edits:", num_edits)
print("Number of change tuples:", num_change_tuples)
print("\nProcessed data:")
# count_typos(typo_lst)

100%|██████████| 1/1 [00:38<00:00, 38.99s/it]

Read data:
Number of edits: 114795
Number of change tuples: 550725

Processed data:





In [38]:
np.array(timedeltas_all,dtype=np.timedelta64).mean()/np.timedelta64(1,"D")

434.7692041875579

In [17]:
data[(data["validFrom"]!="") | (data["validFrom"]!=None)]

Unnamed: 0,pageTitle,pageID,key,template,name,previous_value,current_value,validFrom,validTo,revisionId,user_name,user_id,user_ip,attributes
0,Shlishkes,14876965,500826571-0,infobox prepared food,image,,,2012-07-05 17:42:27+00:00,NaT,500826571,Mindmatrix,160367.0,,"{'image': '', 'country': '[[Hungary]]', 'alter..."
1,Shlishkes,14876965,500826571-0,infobox prepared food,type,,[[Dumpling]],2012-07-05 17:42:27+00:00,NaT,500826571,Mindmatrix,160367.0,,"{'image': '', 'country': '[[Hungary]]', 'alter..."
2,Shlishkes,14876965,500826571-0,infobox prepared food,course,,,2012-07-05 17:42:27+00:00,NaT,500826571,Mindmatrix,160367.0,,"{'image': '', 'country': '[[Hungary]]', 'alter..."
3,Shlishkes,14876965,500826571-0,infobox prepared food,served,,,2012-07-05 17:42:27+00:00,NaT,500826571,Mindmatrix,160367.0,,"{'image': '', 'country': '[[Hungary]]', 'alter..."
4,Shlishkes,14876965,500826571-0,infobox prepared food,calories,,,2012-07-05 17:42:27+00:00,NaT,500826571,Mindmatrix,160367.0,,"{'image': '', 'country': '[[Hungary]]', 'alter..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
550720,IOSYS,14905962,275123087-0,infobox company,logo,[[Image:iosys.jpg]],iosys.jpg,2017-01-23 00:03:32+00:00,NaT,761433732,Karunamon,992275.0,,"{'parent': '', 'location_country': '[[Japan]]'..."
550721,IOSYS,14905962,275123087-0,infobox company,foundation,"[[Sapporo]], [[Japan]] ({{Start date|1998|10|1...","{{Start date|1998|10|10}}<br>[[Sapporo]], [[Ja...",2018-02-21 17:34:44+00:00,2019-02-19 11:37:01+00:00,826909494,VibeScepter,32749056.0,,"{'parent': '', 'location_country': '[[Japan]]'..."
550722,IOSYS,14905962,275123087-0,infobox company,foundation,"{{Start date|1998|10|10}}<br>[[Sapporo]], [[Ja...",{{Start date|1998|10|10}}<ref name=DTMmag_2008...,2019-02-19 11:37:01+00:00,2019-03-14 06:04:03+00:00,884079508,Darklanlan,15296459.0,,"{'parent': '', 'location_country': '[[Japan]]'..."
550723,IOSYS,14905962,275123087-0,infobox company,foundation,{{Start date|1998|10|10}}<ref name=DTMmag_2008...,{{Start date|1998|10|10}}<ref name=DTMmag_2008...,2019-03-14 06:04:03+00:00,NaT,887691026,Citation bot,7903804.0,,"{'parent': '', 'location_country': '[[Japan]]'..."


In [None]:
# get idx of all fixed typos
typo_idx = []
for i in range(len(typo_lst)):
    if typo_lst[i] == True:
        typo_idx.append(i)

matching_lev_idx = []
for i in range(len(typo_lst)):
    if typo_lst[i] == True or typo_lst[i] == False:
        matching_lev_idx.append(i)

all_updates_idx = []
for i in range(len(typo_lst)):
    if typo_lst[i] != "not tested":
        all_updates_idx.append(i)

## Time to Change

In [None]:
from datetime import datetime

def timedeltas_between_changes(typo_idx, change_tuples):
    time_deltas = []
    for idx in typo_idx:
        if change_tuples[idx][7] is not None and change_tuples[idx][8] is not None:
            time_delta = datetime.strptime(change_tuples[idx][8], '%Y-%m-%dT%H:%M:%SZ')-datetime.strptime(
                change_tuples[idx][7], '%Y-%m-%dT%H:%M:%SZ')
            time_deltas.append(time_delta)
    return time_deltas

In [None]:
import numpy as np

def timedelta_to_seconds(arr): return arr.total_seconds()
def timedelta_to_hours(arr): return arr.total_seconds()/60/60
def timedelta_to_days(arr): return arr.total_seconds()/60/60/24
def timedelta_to_days_int(arr): return arr.days

timedelta_to_seconds = np.vectorize(timedelta_to_seconds)
timedelta_to_hours = np.vectorize(timedelta_to_hours)
timedelta_to_days = np.vectorize(timedelta_to_days)
timedelta_to_days_int = np.vectorize(timedelta_to_days_int)

In [None]:
# fixed typos
time_deltas = timedeltas_between_changes(typo_idx, change_tuples)
time_deltas = np.array(time_deltas)
print("Average Time to change for a typofix")
print("Median time in days", np.median(timedelta_to_days(time_deltas)))
print("Median time in hours", np.median(timedelta_to_hours(time_deltas)))
print("Median time in seconds", np.median(timedelta_to_seconds(time_deltas)))
print("timedelta mean and std in days:", np.mean(
    timedelta_to_days(time_deltas)), np.std(timedelta_to_days(time_deltas)))
print("timedelta mean:", str(time_deltas.mean()))
print("number of samples:", len(time_deltas))

In [None]:
import seaborn as sns
sns.boxplot(timedelta_to_days(time_deltas),showfliers=False)

In [None]:
import seaborn as sns

ax = sns.histplot(timedelta_to_days(time_deltas))
ax.set(xlabel='time to change a typo in days', ylabel='count', title='Time to change a typo')
plt.show()

In [None]:
# matching levenshtein distance
time_deltas_lev = timedeltas_between_changes(matching_lev_idx, change_tuples)
time_deltas_lev = np.array(time_deltas_lev)
print("Average Time to change for a typofix")
print("Median time in days", np.median(timedelta_to_days(time_deltas_lev)))
print("Median time in hours", np.median(timedelta_to_hours(time_deltas_lev)))
print("Median time in seconds", np.median(timedelta_to_seconds(time_deltas_lev)))
print("timedelta mean and std in days:", np.mean(
    timedelta_to_days(time_deltas_lev)), np.std(timedelta_to_days(time_deltas_lev)))
print("timedelta mean:", str(time_deltas_lev.mean()))
print("number of samples:",len(time_deltas_lev))

In [None]:
ax = sns.histplot(timedelta_to_days(time_deltas_lev))
ax.set(xlabel='time to change a typo in days', ylabel='count', title='Time to change a typo')
plt.show()

In [None]:
# all tested data
time_deltas_tested = timedeltas_between_changes(all_updates_idx, change_tuples)
time_deltas_tested = np.array(time_deltas_tested)
print("Average Time to change for a typofix")
print("Median time in days", np.median(timedelta_to_days(time_deltas_tested)))
print("Median time in hours", np.median(timedelta_to_hours(time_deltas_tested)))
print("Median time in seconds", np.median(timedelta_to_seconds(time_deltas_tested)))
print("timedelta mean and std in days:", np.mean(
    timedelta_to_days(time_deltas_tested)), np.std(timedelta_to_days(time_deltas_tested)))
print("timedelta mean:", str(time_deltas_tested.mean()))
print("number of samples:",len(time_deltas_tested))

## Dataframe

In [None]:
data = pd.DataFrame(change_tuples, columns=['pageTitle', 'pageID', 'key', 'template', 'name', 'previous_value',
                    'current_value', 'validFrom', 'validTo', 'revisionId', 'user_name', 'user_id', 'user_ip', 'attributes'])
data['validFrom'] = pd.to_datetime(data['validFrom'])
data['validTo'] = pd.to_datetime(data['validTo'])

In [None]:
data.iloc[typo_idx].head(5)

In [None]:
timedeltas = data["validTo"]-data["validFrom"]
timedeltas.median()

In [None]:
data_typo = data.iloc[typo_idx]
timedeltas_typo=data_typo["validTo"]-data_typo["validFrom"]
timedeltas_typo.median()

## Swear words

In [None]:
def check_swear(str1, str2, words_dict, lowercase=True):
    """ Check if swear got added or removed.
        Input:
            str1: prev string
            str2: curr string
        Output:
        prev false , curr true : 1 (swear word added)
        prev true , curr false : 2 (swear word removed)
    """
    if lowercase:
        str1=str1.lower()
        str2=str2.lower()

    str1_lst=str1.split()
    str2_lst=str2.split()

    prev_swear=False
    curr_swear=False
    for string in str1_lst:
        if word_in_dict(string, words_dict):
            prev_swear=True
            break

    for string in str2_lst:
        if word_in_dict(string, words_dict):
            curr_swear=True
            break

    if (not prev_swear and curr_swear):
        # swear word added
        return 1
    if (prev_swear and not curr_swear):
        # swear word removed
        return 2
    if (prev_swear and  curr_swear):
        # swear word in both
        return 3
    if (not prev_swear and not curr_swear):
        # swear word in none
        return 0

In [None]:
swear_file = open("../../../words_swear.txt", "r")
swear_dict = set(swear_file.read().split("\n"))
swear_dict.remove("nazi") # nazi is mostly no swear word in the context

def is_not_empty_or_none(input):
    return input is not None and input is not ""


swear_lst = []
for i in tqdm(range(len(change_tuples))):
    if(is_not_empty_or_none(change_tuples[i][5]) and is_not_empty_or_none(change_tuples[i][6])):
        swear_lst.append(check_swear(
            change_tuples[i][5], change_tuples[i][6], swear_dict))
    else:
        swear_lst.append(None)


counts_swear = {"Swearwords added": 0,
                "Swearwords removed": 0,
                "Swearwords not touched": 0,
                "Swearwords not found": 0,
                "create or delete (skipped)": 0}
for test in swear_lst:
    if test is 1:
        counts_swear["Swearwords added"] += 1
    if test is 2:
        counts_swear["Swearwords removed"] += 1
    if test is 3:
        counts_swear["Swearwords not touched"] += 1
    if test is 0:
        counts_swear["Swearwords not found"] += 1
    if test is None:
        # prev or curr is None
        counts_swear["create or delete (skipped)"] += 1
print(counts_swear)

idx_swear = [[], []]
for i in range(len(swear_lst)):
    if swear_lst[i] == 1:
        idx_swear[0].append(i)
    if swear_lst[i] == 2:
        idx_swear[1].append(i)


In [None]:
print("Swearwords added:", counts_swear["Swearwords added"])
print("Swearwords removed:", counts_swear["Swearwords removed"])
print("Swearwords not touched:", counts_swear["Swearwords not touched"])
print("Swearwords not found:", counts_swear["Swearwords not found"])
print("create or delete (skipped):", counts_swear["create or delete (skipped)"])
edit_count = counts_swear["Swearwords added"]+counts_swear["Swearwords removed"] + \
    counts_swear["Swearwords not touched"]+counts_swear["Swearwords not found"]
print("Toal tuples (only updates without creations/deletions):", edit_count)
print("Toal tuples:", edit_count+counts_swear["create or delete (skipped)"])
print("Percentage of swear words in edits (only updates without creations/deletions) added and removed:",
      counts_swear["Swearwords added"]/edit_count, counts_swear["Swearwords removed"]/edit_count)


## Swear words added

In [None]:
time_deltas_swear = timedeltas_between_changes(idx_swear[0], change_tuples)
time_deltas_swear = np.array(time_deltas_swear)
print("Average Time to change for a typofix")
print("Median time in days", np.median(timedelta_to_days(time_deltas_swear)))
print("Median time in hours", np.median(timedelta_to_hours(time_deltas_swear)))
print("Median time in seconds", np.median(timedelta_to_seconds(time_deltas_swear)))
print("timedelta mean and std in days:", np.mean(
    timedelta_to_days(time_deltas_swear)), np.std(timedelta_to_days(time_deltas_swear)))
print("timedelta mean:", str(time_deltas_swear.mean()))

In [None]:
def removeOutliers(data, percentile):
    lower_quartile = np.percentile(data, percentile)
    upper_quartile = np.percentile(data, 100-percentile)
    if lower_quartile == upper_quartile:
        return data
    print(lower_quartile, upper_quartile)
    data = data[data >= lower_quartile]
    data = data[data < upper_quartile]
    return data

In [None]:
# 
# ax = sns.histplot(timedelta_to_hours(time_deltas_swear))
# ax.set(xlabel='time to remove a swear word in days', ylabel='count', title='Time to remove a swear word')
# plt.show()

In [None]:
data.iloc[idx_swear[0]].head(10)

## Swear words removed

In [None]:
data.iloc[idx_swear[1]].head(10)