In [None]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm
import Levenshtein
import matplotlib.pyplot as plt
import fastDamerauLevenshtein

In [None]:
def split_strings(str1, str2):
    """Split both strings on whitespace.

    Returns a two-element list: the words of ``str1`` followed by the
    words of ``str2``.
    """
    return [str1.split(), str2.split()]

def same_wordcounts(lst1, lst2):
    """Return True when both word lists contain the same number of entries."""
    count_first = len(lst1)
    count_second = len(lst2)
    return count_first == count_second

def skip_no_alpha(string):
    """Return ``string`` with every non-alphabetic character removed."""
    return "".join(char for char in string if char.isalpha())

def is_increment(nr1, nr2):
    """Return True when ``nr2`` is exactly one above or one below ``nr1``."""
    if nr1 + 1 == nr2:
        return True
    return nr1 - 1 == nr2

def is_first_letter_caseswitch(str1, str2):
    """Return True when the first characters of the two strings differ in case.

    Only the case of the first characters is inspected (upper vs. lower);
    the characters themselves are not compared. Empty strings have no
    first character, so they never count as a case switch.
    """
    # Guard against IndexError on empty input.
    if not str1 or not str2:
        return False
    first1, first2 = str1[0], str2[0]
    return (first1.isupper() and first2.islower()) or (first1.islower() and first2.isupper())


In [None]:
def get_levenshtein_dists(str1, str2):
    """Compute one edit distance per aligned word pair of the two strings.

    Both strings are split into words. When the word counts differ, no
    distances are computed and the returned list is empty. Otherwise each
    pair of words at the same position is passed to
    ``fastDamerauLevenshtein.damerauLevenshtein(..., similarity=False)`` and
    the result is converted to ``int``.

    Returns a tuple ``(distances, words)`` where ``words`` is the two-element
    list produced by ``split_strings``.
    """
    words = split_strings(str1, str2)
    first_words, second_words = words
    if not same_wordcounts(first_words, second_words):
        return [], words
    dists = [
        int(fastDamerauLevenshtein.damerauLevenshtein(left, right, similarity=False))
        for left, right in zip(first_words, second_words)
    ]
    return dists, words


def get_levenshtein_dists_without_nr(str1, str2):
    """Like a per-word edit distance, but ignoring non-alphabetic characters.

    Both strings are split into words. When the word counts differ, the
    returned distance list is empty. Otherwise every word is first reduced
    to its alphabetic characters via ``skip_no_alpha`` and the pair is passed
    to ``fastDamerauLevenshtein.damerauLevenshtein(..., similarity=False)``;
    the result is converted to ``int``.

    Returns a tuple ``(distances, words)`` where ``words`` holds the original
    (unfiltered) word lists.
    """
    words = split_strings(str1, str2)
    first_words, second_words = words
    if not same_wordcounts(first_words, second_words):
        return [], words
    dists = []
    for left, right in zip(first_words, second_words):
        left_alpha = skip_no_alpha(left)
        right_alpha = skip_no_alpha(right)
        dists.append(int(fastDamerauLevenshtein.damerauLevenshtein(
            left_alpha, right_alpha, similarity=False)))
    return dists, words


In [None]:
# Manual check: same word count, two misspelled words in the first string.
testcase1 = ["Hier sind kkeine Fheler", "Hier sind keine Fehler"]
print(get_levenshtein_dists(*testcase1))
print(get_levenshtein_dists_without_nr(*testcase1))

In [None]:
# Manual check: the two strings differ only in their numeric word.
testcase2 = ["9 tests", "10 tests"]
print(get_levenshtein_dists(*testcase2))
print(get_levenshtein_dists_without_nr(*testcase2))

In [None]:
# Read the whitespace-separated word list into a set for fast membership tests.
# The context manager closes the file handle as soon as the content is read.
with open("../../../words_alpha.txt", "r") as my_file:
    words_dict = set(my_file.read().split())

In [None]:
def word_in_dict(str1, words_dict):
    """Return True when ``str1`` is contained in ``words_dict``."""
    is_known = str1 in words_dict
    return is_known


def is_typo_fixed(str1, str2, words_dict, lowercase=True):
    """Classify the change from word ``str1`` to word ``str2``.

    Returns:
        2   -- the first letters of the two words differ in case
               (checked first, on the original casing).
        1   -- ``str1`` is not in ``words_dict`` but ``str2`` is
               (a misspelled word was corrected). Both words are
               lowercased before the lookup when ``lowercase`` is True.
        999 -- neither of the above applies.
    """
    # The case-switch test must see the original casing, so it runs
    # before any lowercasing.
    if is_first_letter_caseswitch(str1, str2):
        return 2

    before, after = (str1.lower(), str2.lower()) if lowercase else (str1, str2)

    was_unknown = not word_in_dict(before, words_dict)
    if was_unknown and word_in_dict(after, words_dict):
        return 1

    return 999

In [None]:
def check_for_typo_type(str1, str2, words_dict):
    """Classify every word pair whose edit distance is 1 or 2.

    Distances come from ``get_levenshtein_dists``; pairs with distance 0 or
    above 2 are skipped. Each remaining pair is classified with
    ``is_typo_fixed`` and the resulting codes are returned as a list
    (empty when no pair qualifies).
    """
    dists, words = get_levenshtein_dists(str1, str2)
    before_words, after_words = words
    return [
        is_typo_fixed(before_words[pos], after_words[pos], words_dict)
        for pos, dist in enumerate(dists)
        if 0 < dist <= 2
    ]

In [None]:
# Manual check of the typo classification; only the English pair is evaluated.
testcase1 = ["Hier sind kkeine Fheler", "Hier sind keine Fehler"]
testcase1_en = ["There are nno erorrs", "There are no errors"]
typo_lst = check_for_typo_type(*testcase1_en, words_dict)
print(typo_lst)

In [None]:
def typo_check(str1, str2, words_dict):
    """Decide whether the change from ``str1`` to ``str2`` is a typo fix.

    Returns:
        None  -- no word pair had an edit distance of 1 or 2.
        False -- at least one classified pair has a code above 2.
        True  -- every classified pair has code 1 (previous word not in
                 the dictionary, current word is) or 2 (case switch on the
                 first letter).
    """
    codes = check_for_typo_type(str1, str2, words_dict)
    if not codes:
        return None
    return not any(code > 2 for code in codes)

In [None]:
# Recursively collect every *.json path below the input directory and keep
# only the entries that are regular files.
input_data = Path("../../matched-infoboxes-extracted/")
inp = [*input_data.rglob('*.json')]
files = list(filter(Path.is_file, inp))
len(files)

In [None]:
# Flatten the edits into one tuple per changed property. Each input file is
# read line by line, with one JSON object (one edit) per line. Only the first
# 20 files are processed.
num_edits = 0
change_tuples = []
for file in tqdm(files[:20]):
    with open(file, 'r', encoding='utf-8') as f:
        for jsonObj in f:
            single_edit = json.loads(jsonObj)
            num_edits += 1

            # Fields accessed with [] are required (KeyError when missing);
            # fields accessed with .get() fall back to None.
            title = single_edit['pageTitle']
            pageID = single_edit['pageID']
            key = single_edit['key']
            template = single_edit.get('template')
            changes = single_edit['changes']
            timestamp = single_edit['validFrom']
            revisionId = single_edit['revisionId']
            attributes = single_edit.get('attributes')

            user = single_edit['user']
            user_name = user.get('username')
            user_id = user.get('id')
            user_ip = user.get('ip')

            for change in changes:
                name = change['property']['name']
                current_value = change.get('currentValue')
                previous_value = change.get('previousValue')
                change_tuples.append((
                    title, pageID, key, template, name,
                    previous_value, current_value, timestamp, revisionId,
                    user_name, user_id, user_ip, attributes,
                ))
print("Number of edits:", num_edits)
print("Number of change tuples:", len(change_tuples))

In [None]:
# Reload the whitespace-separated word list into a set (same file as loaded
# earlier in the notebook). The context manager closes the file handle as
# soon as the content is read.
with open("../../../words_alpha.txt", "r") as my_file:
    words_dict = set(my_file.read().split())

In [None]:
# Classify every change tuple. Positions 5 and 6 of a tuple hold the previous
# and the current value; when either is missing the result is None.
typo_lst = []
for record in tqdm(change_tuples):
    prev_val, curr_val = record[5], record[6]
    if prev_val is None or curr_val is None:
        typo_lst.append(None)
    else:
        typo_lst.append(typo_check(prev_val, curr_val, words_dict))

In [None]:
# Tally the classification results: [typo fixes, non-typos, undecided].
counts = [
    sum(result is outcome for result in typo_lst)
    for outcome in (True, False, None)
]
print(counts)
print(sum(counts))

In [None]:
# Positions of all changes classified as typo fixes. `is True` keeps the
# check consistent with the counting cell and avoids matching truthy
# non-boolean values.
idx = [pos for pos, is_typo in enumerate(typo_lst) if is_typo is True]

## Dataframe

In [None]:
# One row per change tuple; the column order mirrors the tuple layout.
data = pd.DataFrame(
    change_tuples,
    columns=[
        'pageTitle', 'pageID', 'key', 'template', 'name',
        'previous_value', 'current_value', 'timestamp', 'revisionId',
        'user_name', 'user_id', 'user_ip', 'attributes',
    ],
)
data['timestamp'] = pd.to_datetime(data['timestamp'])

In [None]:
# Preview up to 50 rows whose change was classified as a typo fix
# (rows whose position is listed in idx).
data.iloc[idx].head(50)

## Swear words

In [None]:
def check_swear(str1, str2, words_dict, lowercase=True):
    """Check whether a listed word was added or removed between two strings.

    Input:
        str1: previous string
        str2: current string
        words_dict: collection of words to look for
        lowercase: lowercase both strings before splitting them into words

    Output:
        0 -- no listed word in either string
        1 -- listed word only in the current string (added)
        2 -- listed word only in the previous string (removed)
        3 -- listed word in both strings
    """
    if lowercase:
        str1, str2 = str1.lower(), str2.lower()

    # any() stops at the first hit, like a loop with break.
    prev_swear = any(word_in_dict(token, words_dict) for token in str1.split())
    curr_swear = any(word_in_dict(token, words_dict) for token in str2.split())

    outcome_codes = {
        (False, True): 1,   # swear word added
        (True, False): 2,   # swear word removed
        (True, True): 3,    # swear word in both
        (False, False): 0,  # swear word in none
    }
    return outcome_codes[(prev_swear, curr_swear)]

In [None]:
# Read the whitespace-separated swear word list into a set. The context
# manager closes the file handle as soon as the content is read.
# NOTE(review): check_swear lowercases its input by default; entries in this
# file are used as-is — confirm the list is lowercase.
with open("../../../words_swear.txt", "r") as swear_file:
    swear_dict = set(swear_file.read().split())

# Run the swear word check on every change tuple. Positions 5 and 6 hold the
# previous and the current value; when either is missing the result is None.
swear_lst = []
for record in tqdm(change_tuples):
    prev_val, curr_val = record[5], record[6]
    if prev_val is None or curr_val is None:
        swear_lst.append(None)
    else:
        swear_lst.append(check_swear(prev_val, curr_val, swear_dict))


# Tally the check_swear results:
# [added (1), removed (2), in both (3), in none (0), skipped (None)].
# Integer codes are compared with ==; identity checks against int literals
# depend on interpreter caching and emit a SyntaxWarning on Python 3.8+.
counts_swear = [0, 0, 0, 0, 0]
for test in swear_lst:
    if test is None:
        # prev or curr is None
        counts_swear[4] += 1
    elif test == 1:
        counts_swear[0] += 1
    elif test == 2:
        counts_swear[1] += 1
    elif test == 3:
        counts_swear[2] += 1
    elif test == 0:
        counts_swear[3] += 1
print(counts_swear)
print(counts_swear[0]+counts_swear[1]+counts_swear[2]+counts_swear[3])

# Positions of the changes that added (first list) or removed (second list)
# a swear word.
idx_swear = [
    [pos for pos, code in enumerate(swear_lst) if code == 1],
    [pos for pos, code in enumerate(swear_lst) if code == 2],
]


In [None]:
# Summary of the swear word check. The first four counters cover edits with
# both a previous and a current value; the fifth counts skipped edits.
print("Swear words added:", counts_swear[0])
print("Swear words removed:", counts_swear[1])
print("Swear words not touched:", counts_swear[2])
print("Swear words not found:", counts_swear[3])
print("create or delete (skipped):", counts_swear[4])
edit_count = counts_swear[0]+counts_swear[1]+counts_swear[2]+counts_swear[3]
print("Total words (without create/delete):", edit_count)
if edit_count:
    print("Percentage of swear words in edits added and removed:",
          counts_swear[0]/edit_count, counts_swear[1]/edit_count)
else:
    # Avoid ZeroDivisionError when no edit had both values present.
    print("Percentage of swear words in edits added and removed:",
          "n/a (no comparable edits)")


## Swear words added

In [None]:
# TODO: decide how empty strings in the previous/current value should be handled.
# Preview up to 50 rows where the swear word check returned 1 (word added).
data.iloc[idx_swear[0]].head(50)

## Swear words removed

In [None]:
# Preview up to 50 rows where the swear word check returned 2 (word removed).
data.iloc[idx_swear[1]].head(50)