In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm
import Levenshtein
import matplotlib.pyplot as plt
import fastDamerauLevenshtein

### How does it work
Typos:
- previous and current values are splittet into words with non letters/whitespaces removed, if any are None or empty they are skipped
- only same wordcounts are tested (it is expected that words stay in the same order)
- words are compared to the words with the same index with Damerau-Levenshtein edit distance (swaps are cost 1 not 2)
- if any word has edit distance 1 or 2 it is further looked at
    1. test if first letter is a case swap (it is expected the user knows what is correct)
    2. test if previous word is not in a dictionary but current word is (comparison is done in lowercase)
- if any of the tests is true the change is marked as typo-fix

Swear words:
- previous and current values are splittet into words, if any are None or empty they are skipped
- words are compared to a swear word dictionary (https://github.com/RobertJGabriel/Google-profanity-words/blob/master/list.txt)
- if any word is matched the value is flagged as swear word
- if previous value has no swear word but current value has -> swear word added
- if previous value has a swear word but current value has not -> swear word deleted

In [4]:
# splits string in words
def split_strings(str1, str2):
    lst = [str1.split()]
    lst.append(str2.split())
    return lst

# checks if wordcount in both strings is equal


def same_wordcounts(lst1, lst2):
    return (len(lst1) == len(lst2))

# deletes non alphabetical characters from string


def skip_no_alpha(string):
    only_alpha = ""
    for char in string:
        if char.isalpha() or char == " ":
            only_alpha += char
    return only_alpha

# checks in numbers are increments


def is_increment(nr1, nr2):
    return (nr1+1 == nr2 or nr1-1 == nr2)

# checks if case (upper/loewr) of the first latter is switched


def is_first_letter_caseswitch(str1, str2):
    return (str1[0].isupper() and str2[0].islower() or str1[0].islower() and str2[0].isupper())


def is_not_empty_or_none(input):
    return input is not None and input is not ""


In [5]:
def get_levenshtein_dists(lst1, lst2):
    if len(lst1) != len(lst2):
        print("Difference words counts of lists!")
        return
    dists = []
    for i in range(len(lst1)):
            dists.append(int(fastDamerauLevenshtein.damerauLevenshtein(
                lst1[i], lst2[i], similarity=False)))
    return dists

# splits strings in words
def get_words_and_dists(str1, str2, only_alpha=False):
    if only_alpha:
        str1=skip_no_alpha(str1)
        str2=skip_no_alpha(str2)
    words = split_strings(str1, str2)
    if len(words[0]) == len(words[1]):
        dists = get_levenshtein_dists(words[0], words[1])
    else:
        dists = []
    return words, dists


In [6]:
def word_in_dict(str1, words_dict):
    return str1 in words_dict

def is_typo_fixed(str1, str2, words_dict, lowercase=True):
    """ Check if typo is fixed.
        return 0: no other case is found
        return 1: word was not in dict before (missspelled)
        return 2: word with swapped first letter (and other changes depending on edit distance)
    """
    # detects number errors (dreher,tippfehler), skipps increments. Only works if skip_no_alpha is false 
    if str1.isdigit() and str2.isdigit() and not is_increment(int(str1),int(str2)):
        return 3

    if is_first_letter_caseswitch(str1,str2):
        return 2

    if lowercase:
        str1=str1.lower()
        str2=str2.lower()

    # checks if str1 is not in dict but str2 is
    if (not word_in_dict(str1, words_dict) and word_in_dict(str2, words_dict)):
        return 1
        
    return 0

In [7]:
def get_typo_type(str1, str2, words_dict, upper_lev_distance=2, skip_no_alpha=False):
    words, levenshtein_dists = get_words_and_dists(
        str1, str2, skip_no_alpha)
    typo_lst = []
    for i in range(len(levenshtein_dists)):  # only loops if dists are found (word counts are equal)
        # only uses distances >0 <=2
        if(levenshtein_dists[i] > 0 and levenshtein_dists[i] <= upper_lev_distance):
            typo_lst.append(is_typo_fixed(
                words[0][i], words[1][i], words_dict))
        # else:  # appends None if dist is <0 or >2
        #     typo_lst.append(None)
    return typo_lst


In [8]:
my_file = open("../../../words_alpha.txt", "r")
words_dict=set(my_file.read().split("\n"))

testcase1 = ["Hier sind kkeine Fheler", "Hier sind keine Fehler"]
testcase1_en = ["There are nno erorrs", "There are no errors"]
typo_lst = get_typo_type(testcase1_en[0], testcase1_en[1], words_dict)
print(typo_lst)

[1, 1]


In [9]:
def typo_check(str1, str2, words_dict, upper_lev_distance=2, skip_no_alpha=False):
    """Return True if typo
       Return False if no typo
    """
    typo_lst = get_typo_type(str1, str2, words_dict, upper_lev_distance, skip_no_alpha)
    if len(typo_lst) == 0:
        return None
    for typo_type in typo_lst:
        if typo_type > 0:  # 1 is previous not in dict, current is in dict, 2 case switch on first letter
            return True
    return False

def typo_check_pandas(row, words_dict, upper_lev_distance=2, skip_no_alpha=False):
    """Return True if typo
       Return False if no typo
    """
    typo_lst = get_typo_type(row["currentValue"], row["previousValue"], words_dict, upper_lev_distance, skip_no_alpha)
    if len(typo_lst) == 0:
        return None
    for typo_type in typo_lst:
        if typo_type > 0:  # 1 is previous not in dict, current is in dict, 2 case switch on first letter
            return True
    return False

In [85]:
def swear_check(str1, str2, words_dict, lowercase=True):
    """ Check if swear got added or removed.
        Input:
            str1: prev string
            str2: curr string
        Output:
        prev false , curr true : 1 (swear word added)
        prev true , curr false : 2 (swear word removed)
    """
    if lowercase:
        str1=str1.lower()
        str2=str2.lower()

    str1_lst=str1.split()
    str2_lst=str2.split()

    prev_swear=False
    curr_swear=False
    for string in str1_lst:
        if word_in_dict(string, words_dict):
            prev_swear=True
            break

    for string in str2_lst:
        if word_in_dict(string, words_dict):
            curr_swear=True
            break

    if (not prev_swear and curr_swear):
        # swear word added
        return 1
    if (prev_swear and not curr_swear):
        # swear word removed
        return 2
    if (prev_swear and  curr_swear):
        # swear word in both
        return 3
    if (not prev_swear and not curr_swear):
        # swear word in none
        return 0

def swear_check_pandas(row, words_dict, lowercase=True):
    return swear_check(row["currentValue"], row["previousValue"], words_dict, lowercase=lowercase)


In [65]:
import numpy as np

def timedelta_to_seconds(arr): return arr.total_seconds()
def timedelta_to_hours(arr): return arr.total_seconds()/60/60
def timedelta_to_days(arr): return arr.total_seconds()/60/60/24
def timedelta_to_days_int(arr): return arr.days

timedelta_to_seconds = np.vectorize(timedelta_to_seconds)
timedelta_to_hours = np.vectorize(timedelta_to_hours)
timedelta_to_days = np.vectorize(timedelta_to_days)
timedelta_to_days_int = np.vectorize(timedelta_to_days_int)

In [12]:
input_data = Path.home()/"output-infobox"
inp = list(input_data.rglob('*.json'))
files = [x for x in inp if x.is_file()]
print("number of files:", len(files))

number of files: 586


In [88]:
# english words dict
my_file = open("../../../words_alpha.txt", "r")
words_dict=set(my_file.read().split("\n"))

# swear words dict
swear_file = open("../../../words_swear.txt", "r")
swear_dict = set(swear_file.read().split("\n"))
swear_dict.remove("nazi") # nazi is mostly no swear word in the context

In [90]:
number_of_files = 3
num_edits = 0
num_change_tuples = 0
timedeltas_all = []
timedeltas_typo = []
timedeltas_levensh = []
timedeltas_swear_added = []
timedeltas_swear_removed = []
for file in tqdm(files[:number_of_files]):
    change_tuples = []
    with open(file, 'r', encoding='utf-8') as f:
        for jsonObj in f:
            single_edit = json.loads(jsonObj)
            num_edits += 1
            title = single_edit['pageTitle']
            pageID = single_edit['pageID']
            key = single_edit['key']
            template = single_edit['template'] if 'template' in single_edit.keys(
            ) else None
            changes = single_edit['changes']
            validFrom = single_edit['validFrom']
            revisionId = single_edit['revisionId']
            attributes = single_edit['attributes'] if 'attributes' in single_edit.keys(
            ) else None
            user_name = single_edit['user']['username'] if 'username' in single_edit['user'].keys(
            ) else None
            user_id = single_edit['user']['id'] if 'id' in single_edit['user'].keys(
            ) else None
            user_ip = single_edit['user']['ip'] if 'ip' in single_edit['user'].keys(
            ) else None
            for change in changes:
                name = change['property']['name']
                current_value = change['currentValue'] if 'currentValue' in change.keys(
                ) else None
                previous_value = change['previousValue'] if 'previousValue' in change.keys(
                ) else None
                validTo = change['valueValidTo'] if 'valueValidTo' in change.keys(
                ) else None
                change_tuples.append((title, pageID, key, template, name, previous_value,
                                      current_value, validFrom, validTo, revisionId, user_name, user_id, user_ip, attributes))

    data = pd.DataFrame(change_tuples, columns=['pageTitle', 'pageID', 'key', 'template', 'name', 'previousValue',
                                                'currentValue', 'validFrom', 'validTo', 'revisionId', 'user_name', 'user_id', 'user_ip', 'attributes'])
    num_change_tuples += len(data)
    data = data[(data["currentValue"] != "") & (~data["currentValue"].isnull())]
    data = data[(data["previousValue"] != "") & (~data["previousValue"].isnull())]
    data = data[(data["validTo"] != "") & (~data["validTo"].isnull())]

    data['validFrom'] = pd.to_datetime(data['validFrom'])
    data['validTo'] = pd.to_datetime(data['validTo'])

    timedeltas_all.extend(data["validTo"]-data['validFrom'])

    data["isTypo"] = data.apply(lambda row: typo_check_pandas(
        row, words_dict, upper_lev_distance=2, skip_no_alpha=True), axis=1)

    timedeltas_typo.extend(data[data["isTypo"] == True]
                           ["validTo"]-data[data["isTypo"] == True]['validFrom'])
    timedeltas_levensh.extend(data[~data["isTypo"].isnull()]
                              ["validTo"]-data[~data["isTypo"].isnull()]['validFrom'])

    data["swear"] = data.apply(lambda row: swear_check_pandas(row, swear_dict), axis=1)
    timedeltas_swear_added.extend(
        data[data["swear"] == 1]["validTo"]-data[data["swear"] == 1]['validFrom'])
    timedeltas_swear_removed.extend(
        data[data["swear"] == 2]["validTo"]-data[data["swear"] == 2]['validFrom'])


print("Read data:")
print("Number of edits:", num_edits)
print("Number of change tuples:", num_change_tuples)


100%|██████████| 3/3 [01:34<00:00, 31.52s/it]

Read data:
Number of edits: 385243
Number of change tuples: 1592034





In [79]:
print("data")
print("number of all changes:", num_change_tuples)
print("number of all updates:", len(timedeltas_all))
print("percent of updates:", len(timedeltas_all)/num_change_tuples*100)
print("\nmatching levenshtein dist of 2")
print("number of matching levenshtein dist of 2:", len(timedeltas_levensh))
print("percent of updates matching levenshtein dist of 2:", len(timedeltas_levensh)/len(timedeltas_all)*100)
print("typos")
print("\nnumber of fixed typos:", len(timedeltas_typo))
print("percent of updates with typo:", len(timedeltas_typo)/len(timedeltas_all)*100)

data
number of all changes: 1592034
number of all updates: 338112
percent of updates: 21.237737385005598

matching levenshtein dist of 2
number of matching levenshtein dist of 2: 22702
percent of updates matching levenshtein dist of 2: 6.714343176225629
typos

number of fixed typos: 8393
percent of updates with typo: 2.4823135529055462


## Time to Change

In [80]:
print("ALL UPDATES")
print("median of timedelta of in days:",
      np.median(timedelta_to_days(timedeltas_all)))
print("mean timedelta of in days:",
      timedelta_to_days(timedeltas_all).mean())
print("std of timedelta of in days:",
      timedelta_to_days(timedeltas_all).std())


ALL UPDATES
median of timedelta of in days: 41.953298611111116
mean timedelta of in days: 335.78714151870184
std of timedelta of in days: 633.6958786241099


In [81]:
print("LEVENSHTEIN OF 2")
print("median of timedelta in days:",
      np.median(timedelta_to_days(timedeltas_levensh)))
print("mean timedelta in days:",
      timedelta_to_days(timedeltas_levensh).mean())
print("std of timedelta of in days:",
      timedelta_to_days(timedeltas_levensh).std())

LEVENSHTEIN OF 2
median of timedelta in days: 110.05748842592592
mean timedelta in days: 430.66782296418245
std of timedelta of in days: 701.3017121699444


In [82]:
print("TYPOS")
print("median of timedelta in days:",
      np.median(timedelta_to_days(timedeltas_typo)))
print("mean timedelta in days:",
      timedelta_to_days(timedeltas_typo).mean())
print("std of timedelta of in days:",
      timedelta_to_days(timedeltas_typo).std())

TYPOS
median of timedelta in days: 142.15240740740742
mean timedelta in days: 479.8548766277895
std of timedelta of in days: 752.7216704589307


In [91]:
data[data["isTypo"]==True]

Unnamed: 0,pageTitle,pageID,key,template,name,previousValue,currentValue,validFrom,validTo,revisionId,user_name,user_id,user_ip,attributes,isTypo,swear
177,Nelson Cruz,6159471,149830852-0,infobox mlb player,stat1label,[[Batting Average]],[[Batting average]],2008-10-04 05:01:13+00:00,2015-02-11 02:57:35+00:00,242905092,Jackal4,1776444.0,,"{'stat2label': '[[Home run]]s', 'image': 'Repl...",True,0
181,Nelson Cruz,6159471,149830852-0,infobox mlb player,position,Right Fielder,Right fielder,2008-10-04 05:01:13+00:00,2009-01-17 02:17:35+00:00,242905092,Jackal4,1776444.0,,"{'stat2label': '[[Home run]]s', 'image': 'Repl...",True,0
228,Nelson Cruz,6159471,149830852-0,infobox mlb player,position,[[Right fielder]],[[Right Fielder]],2010-03-30 23:17:01+00:00,2013-05-17 17:18:08+00:00,353050611,,,24.208.75.32,"{'stat2label': '[[Home run]]s', 'image': '0007...",True,0
406,Nelson Cruz,6159471,149830852-0,infobox mlb player,position,[[Right Fielder]],[[Right fielder]],2013-05-17 17:18:08+00:00,2013-06-04 23:21:54+00:00,555539361,Trut-h-urts man,8334148.0,,"{'stat2label': '[[Hit (baseball)|Hit]]s', 'ima...",True,0
438,Nelson Cruz,6159471,149830852-0,infobox mlb player,bats,right,Right,2013-09-21 02:19:49+00:00,2015-02-11 02:57:35+00:00,573859300,,,69.117.59.46,"{'stat2label': '[[Hit (baseball)|Hit]]s', 'ima...",True,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560049,Sunrise over a Sea of Blood,6135665,66094180-0,infobox album,reviews,* [[Jesus Freak Hideout]] {{rating-5|3.5}} <re...,* [[Jesus Freak Hideout]] {{Rating|3.5|5}} <re...,2008-09-21 02:21:00+00:00,2012-10-04 18:39:10+00:00,239914276,DinoBot2,7128788.0,,"{'cover': 'SOASOB_Better.jpg', 'reviews': '* [...",True,0
560229,Ciaran Donnelly,6174068,75276638-0,infobox football biography 2,dateofbirth,{{birth date and age|1984|4|2|df=yes}},{{Birth date and age|1984|4|2|df=yes}},2010-09-24 04:22:40+00:00,2012-11-26 21:51:34+00:00,386682567,Rich Farmbrough,82835.0,,"{'cityofbirth': '[[Blackpool]]', 'youthclubs1'...",True,0
560531,Sideshow (The Adventures of Batman & Robin),6188523,66733552-0,infobox television episode,next,[[A Bullet For Bullock]],[[A Bullet for Bullock]],2007-03-11 22:08:59+00:00,2007-08-14 22:33:45+00:00,114394184,Mellum,45569.0,,"{'next': '[[A Bullet for Bullock]]', 'image': ...",True,0
560736,Castellabate,6125780,101811542-0,infobox cityit,population_as_of,"[[december 31]], [[2004]]","December 31, 2004",2009-09-20 19:42:22+00:00,2011-10-08 12:21:48+00:00,315150259,Plasticspork,10068830.0,,"{'official_name': 'Comune di Castellabate', 'p...",True,0


## Swear words

In [None]:
def check_swear(str1, str2, words_dict, lowercase=True):
    """ Check if swear got added or removed.
        Input:
            str1: prev string
            str2: curr string
        Output:
        prev false , curr true : 1 (swear word added)
        prev true , curr false : 2 (swear word removed)
    """
    if lowercase:
        str1=str1.lower()
        str2=str2.lower()

    str1_lst=str1.split()
    str2_lst=str2.split()

    prev_swear=False
    curr_swear=False
    for string in str1_lst:
        if word_in_dict(string, words_dict):
            prev_swear=True
            break

    for string in str2_lst:
        if word_in_dict(string, words_dict):
            curr_swear=True
            break

    if (not prev_swear and curr_swear):
        # swear word added
        return 1
    if (prev_swear and not curr_swear):
        # swear word removed
        return 2
    if (prev_swear and  curr_swear):
        # swear word in both
        return 3
    if (not prev_swear and not curr_swear):
        # swear word in none
        return 0

In [None]:
swear_file = open("../../../words_swear.txt", "r")
swear_dict = set(swear_file.read().split("\n"))
swear_dict.remove("nazi") # nazi is mostly no swear word in the context

def is_not_empty_or_none(input):
    return input is not None and input is not ""


swear_lst = []
for i in tqdm(range(len(change_tuples))):
    if(is_not_empty_or_none(change_tuples[i][5]) and is_not_empty_or_none(change_tuples[i][6])):
        swear_lst.append(check_swear(
            change_tuples[i][5], change_tuples[i][6], swear_dict))
    else:
        swear_lst.append(None)


counts_swear = {"Swearwords added": 0,
                "Swearwords removed": 0,
                "Swearwords not touched": 0,
                "Swearwords not found": 0,
                "create or delete (skipped)": 0}
for test in swear_lst:
    if test is 1:
        counts_swear["Swearwords added"] += 1
    if test is 2:
        counts_swear["Swearwords removed"] += 1
    if test is 3:
        counts_swear["Swearwords not touched"] += 1
    if test is 0:
        counts_swear["Swearwords not found"] += 1
    if test is None:
        # prev or curr is None
        counts_swear["create or delete (skipped)"] += 1
print(counts_swear)

idx_swear = [[], []]
for i in range(len(swear_lst)):
    if swear_lst[i] == 1:
        idx_swear[0].append(i)
    if swear_lst[i] == 2:
        idx_swear[1].append(i)


In [None]:
print("Swearwords added:", counts_swear["Swearwords added"])
print("Swearwords removed:", counts_swear["Swearwords removed"])
print("Swearwords not touched:", counts_swear["Swearwords not touched"])
print("Swearwords not found:", counts_swear["Swearwords not found"])
print("create or delete (skipped):", counts_swear["create or delete (skipped)"])
edit_count = counts_swear["Swearwords added"]+counts_swear["Swearwords removed"] + \
    counts_swear["Swearwords not touched"]+counts_swear["Swearwords not found"]
print("Toal tuples (only updates without creations/deletions):", edit_count)
print("Toal tuples:", edit_count+counts_swear["create or delete (skipped)"])
print("Percentage of swear words in edits (only updates without creations/deletions) added and removed:",
      counts_swear["Swearwords added"]/edit_count, counts_swear["Swearwords removed"]/edit_count)


## Swear words added

In [None]:
time_deltas_swear = timedeltas_between_changes(idx_swear[0], change_tuples)
time_deltas_swear = np.array(time_deltas_swear)
print("Average Time to change for a typofix")
print("Median time in days", np.median(timedelta_to_days(time_deltas_swear)))
print("Median time in hours", np.median(timedelta_to_hours(time_deltas_swear)))
print("Median time in seconds", np.median(timedelta_to_seconds(time_deltas_swear)))
print("timedelta mean and std in days:", np.mean(
    timedelta_to_days(time_deltas_swear)), np.std(timedelta_to_days(time_deltas_swear)))
print("timedelta mean:", str(time_deltas_swear.mean()))

In [None]:
def removeOutliers(data, percentile):
    lower_quartile = np.percentile(data, percentile)
    upper_quartile = np.percentile(data, 100-percentile)
    if lower_quartile == upper_quartile:
        return data
    print(lower_quartile, upper_quartile)
    data = data[data >= lower_quartile]
    data = data[data < upper_quartile]
    return data

In [None]:
data.iloc[idx_swear[0]].head(10)

## Swear words removed

In [None]:
data.iloc[idx_swear[1]].head(10)