In [2]:
import json
import os

In [3]:
exp_dir = "../../../logs/en/analysis/version_21"
errors = json.load(open(os.path.join(exp_dir, "errors.json")))
replacements = json.load(open(os.path.join(exp_dir, "replacements.json")))
error_types = ["replace", "insert", "delete"]

In [4]:
# print how many errors there are for each dataset, and how many of them are on words that occur only once in the dataset
for dataset in errors:
    occurs_max_once = dict()
    occurs_max_once["total"] = len(errors[dataset])
    for word, data in errors[dataset].items():
        if data["occurrences"] <= 1:
            error_type = list(data.keys())[1]
            if error_type not in occurs_max_once:
                occurs_max_once[error_type] = 1
            else:
                occurs_max_once[error_type] += 1
    print(dataset, occurs_max_once)


test_za.txt {'total': 3977, 'replace': 2273, 'occurrences': 111, 'insert': 27, 'delete': 79}
test_sg.txt {'total': 2789, 'replace': 1648, 'delete': 81, 'occurrences': 110, 'insert': 28}
test_in.txt {'total': 12360, 'replace': 6990, 'occurrences': 310, 'delete': 242, 'insert': 72}
test_au.txt {'total': 5489, 'replace': 3273, 'insert': 38, 'occurrences': 146, 'delete': 136}
test_hk.txt {'total': 2704, 'replace': 1429, 'delete': 54, 'occurrences': 84, 'insert': 14}
test_ie.txt {'total': 5104, 'replace': 2973, 'occurrences': 146, 'delete': 125, 'insert': 29}
test_uk.txt {'total': 10017, 'replace': 5497, 'delete': 186, 'occurrences': 279, 'insert': 51}
test_ph.txt {'total': 3438, 'occurrences': 114, 'replace': 2050, 'insert': 24, 'delete': 91}
test_de.txt {'total': 4255, 'replace': 2834, 'delete': 73, 'occurrences': 117, 'insert': 32}
test_ni.txt {'total': 3353, 'replace': 2025, 'delete': 102, 'occurrences': 91, 'insert': 25}
test_us.txt {'total': 24638, 'replace': 13857, 'occurrences': 587

In [5]:
def max_occurrences(errors, error_type, n):
    """Return the words and number of errors of the n words with the most occurrences of the given error type."""
    words = []
    for word, data in errors.items():
        if error_type in data:
            words.append((word, data[error_type], data["occurrences"]))
    words.sort(key=lambda x: x[1], reverse=True)
    return words[:n]

In [6]:
for dataset in errors:
    print(dataset)
    for error_type in error_types:
        print(error_type, max_occurrences(errors[dataset], error_type, 3))
    print()

test_za.txt
replace [('dont', 112, 115), ('its', 91, 259), ('im', 81, 82)]
insert [('the', 84, 5704), ('a', 37, 1834), ('and', 17, 1810)]
delete [('the', 25, 5704), ('is', 19, 1361), ('a', 12, 1834)]

test_sg.txt
replace [('the', 65, 2397), ('a', 46, 754), ('in', 27, 677)]
insert [('the', 45, 2397), ('a', 18, 754), ('of', 10, 928)]
delete [('the', 17, 2397), ('a', 13, 754), ('in', 8, 677)]

test_in.txt
replace [('the', 195, 13919), ('a', 173, 4272), ('its', 114, 528)]
insert [('the', 190, 13919), ('a', 122, 4272), ('and', 42, 4331)]
delete [('the', 111, 13919), ('a', 64, 4272), ('is', 35, 3859)]

test_au.txt
replace [('the', 113, 6848), ('its', 87, 312), ('dont', 87, 91)]
insert [('the', 96, 6848), ('a', 52, 2478), ('in', 14, 2062)]
delete [('the', 67, 6848), ('a', 35, 2478), ('is', 25, 1886)]

test_hk.txt
replace [('the', 57, 2611), ('its', 48, 115), ('dont', 40, 40)]
insert [('the', 48, 2611), ('a', 30, 917), ('it', 13, 397)]
delete [('the', 26, 2611), ('a', 13, 917), ('is', 12, 746)

In [7]:
# how often does each error type occur? Filter out words that occur max once in the dataset
for dataset in errors:
    error_distribution = dict()
    error_distribution["total"] = len(errors[dataset])
    for word, data in errors[dataset].items():
        if data["occurrences"] <= 1:
            continue
        for error_type in data:
            if error_type == "occurrences":
                continue
            if error_type not in error_distribution:
                error_distribution[error_type] = data[error_type]
            else:
                error_distribution[error_type] += data[error_type]
    print(dataset, error_distribution)    

test_za.txt {'total': 3977, 'replace': 2875, 'insert': 497, 'delete': 367}
test_sg.txt {'total': 2789, 'replace': 1452, 'delete': 212, 'insert': 294}
test_in.txt {'total': 12360, 'replace': 9319, 'insert': 1702, 'delete': 1231}
test_au.txt {'total': 5489, 'replace': 3622, 'insert': 691, 'delete': 585}
test_hk.txt {'total': 2704, 'replace': 1927, 'insert': 331, 'delete': 206}
test_ie.txt {'total': 5104, 'replace': 3459, 'insert': 604, 'delete': 514}
test_uk.txt {'total': 10017, 'replace': 9177, 'insert': 1719, 'delete': 1162}
test_ph.txt {'total': 3438, 'replace': 1922, 'delete': 292, 'insert': 393}
test_de.txt {'total': 4255, 'insert': 346, 'delete': 307, 'replace': 1555}
test_ni.txt {'total': 3353, 'insert': 316, 'replace': 1780, 'delete': 273}
test_us.txt {'total': 24638, 'replace': 26060, 'delete': 3078, 'insert': 4527}
test_ca.txt {'total': 5165, 'replace': 2902, 'delete': 446, 'insert': 587}
test_nz.txt {'total': 4177, 'replace': 4042, 'delete': 375, 'insert': 498}
test_sc.txt {'t

In [8]:
# how often does each error type occur? Filter out words that occur max once in the dataset
for dataset in errors:
    error_distribution = dict()
    error_distribution["total"] = len(errors[dataset])
    for word, data in errors[dataset].items():
        for error_type in data:
            if error_type == "occurrences":
                continue
            if error_type not in error_distribution:
                error_distribution[error_type] = data[error_type]
            else:
                error_distribution[error_type] += data[error_type]
    print(dataset, error_distribution)    

test_za.txt {'total': 3977, 'replace': 5150, 'insert': 643, 'delete': 446}
test_sg.txt {'total': 2789, 'replace': 3105, 'delete': 293, 'insert': 438}
test_in.txt {'total': 12360, 'replace': 16319, 'insert': 2104, 'delete': 1475}
test_au.txt {'total': 5489, 'replace': 6899, 'insert': 887, 'delete': 722}
test_hk.txt {'total': 2704, 'replace': 3356, 'insert': 435, 'delete': 260}
test_ie.txt {'total': 5104, 'replace': 6432, 'insert': 787, 'delete': 639}
test_uk.txt {'total': 10017, 'replace': 14681, 'insert': 2068, 'delete': 1349}
test_ph.txt {'total': 3438, 'insert': 540, 'replace': 3973, 'delete': 384}
test_de.txt {'total': 4255, 'replace': 4393, 'insert': 500, 'delete': 380}
test_ni.txt {'total': 3353, 'insert': 435, 'replace': 3808, 'delete': 375}
test_us.txt {'total': 24638, 'replace': 39935, 'delete': 3504, 'insert': 5277}
test_ca.txt {'total': 5165, 'replace': 6097, 'delete': 570, 'insert': 764}
test_nz.txt {'total': 4177, 'replace': 6302, 'delete': 446, 'insert': 646}
test_sc.txt {

In [9]:
# how many replacements are there for each word?
for dataset in replacements:
    print("\n" + dataset)
    for word_ref, data in replacements[dataset].items():
        for word_pred, cnt in data.items():
            if cnt > 5:
                print(word_ref, word_pred, cnt)


test_za.txt
and in 8
its it's 80
a the 24
didnt didn't 50
the a 26
the their 10
i i'm 6
thats that's 56
this the 16
centre center 10
it it's 8
doesnt doesn't 12
ive i've 21
their the 15
in and 13
dont don't 112
whats what's 26
theres there's 16
lets let's 14
wouldnt wouldn't 11
im i'm 78
youll you'll 21
wasnt wasn't 16
youve you've 10
theyre they're 6
ill i'll 24
towns town's 7
cant can't 30
ones one's 16
youre you're 15
weve we've 6
shes she's 7
isnt isn't 8
wont won't 15
hes he's 8
id i'd 18
boys boy's 10
well we'll 6
bands band's 6
couldnt couldn't 11
film form 6
shouldnt shouldn't 6

test_sg.txt
dont don't 15
doesnt doesn't 6
a the 27
in and 15
its it's 14
their the 8
im i'm 8
theyre they're 6
bands band's 6
and in 12
didnt didn't 7
theres there's 7
the a 29
to the 6

test_in.txt
hes he's 17
the that 6
the this 13
the a 56
the their 16
the these 7
and in 21
and an 6
youre you're 38
youre are 11
didnt didn't 57
this these 12
this the 17
this his 6
albums album's 6
couldnt couldn't 

In [11]:
def occurrences(errors, error_type):
    """Return the words and number of errors of the n words with the most occurrences of the given error type."""
    words = []
    for word, data in errors.items():
        if error_type in data:
            if data["occurrences"] > 5:
                words.append((word, data[error_type], data["occurrences"]))
    words.sort(key=lambda x: x[1], reverse=True)
    return words


for word in occurrences(errors["test_sc.txt"], "replace"):
    print(word)

('the', 1261, 10504)
('and', 751, 3321)
('in', 699, 2957)
('a', 628, 3253)
('is', 442, 2639)
('of', 439, 3721)
('to', 424, 3020)
('it', 244, 1569)
('its', 242, 439)
('for', 242, 1228)
('this', 232, 858)
('are', 225, 970)
('were', 210, 692)
('on', 193, 872)
('he', 192, 1668)
('with', 182, 901)
('his', 181, 965)
('at', 174, 645)
('dont', 172, 175)
('an', 170, 477)
('was', 161, 2335)
('they', 158, 521)
('from', 151, 616)
('their', 137, 320)
('that', 130, 836)
('im', 129, 131)
('as', 124, 876)
('by', 122, 790)
('there', 98, 450)
('had', 97, 488)
('one', 90, 394)
('her', 89, 406)
('have', 88, 536)
('i', 85, 934)
('can', 82, 319)
('be', 82, 596)
('these', 81, 256)
('you', 81, 776)
('she', 79, 502)
('some', 74, 265)
('has', 73, 559)
('not', 68, 437)
('or', 66, 297)
('many', 66, 203)
('no', 64, 251)
('also', 63, 762)
('first', 61, 295)
('would', 60, 188)
('but', 60, 355)
('didnt', 60, 64)
('will', 60, 177)
('been', 59, 318)
('two', 59, 349)
('thats', 57, 58)
('into', 57, 208)
('when', 56, 196)