In [2]:
import json
import os

In [3]:
exp_dir = "../../../logs/en/analysis/version_22"
errors = json.load(open(os.path.join(exp_dir, "errors.json")))
replacements = json.load(open(os.path.join(exp_dir, "replacements.json")))
error_types = ["replace", "insert", "delete"]

In [4]:
# print how many errors there are for each dataset, and how many of them are on words that occur only once in the dataset
for dataset in errors:
    occurs_max_once = dict()
    occurs_max_once["total"] = len(errors[dataset])
    for word, data in errors[dataset].items():
        if data["occurrences"] <= 1:
            error_type = list(data.keys())[1]
            if error_type not in occurs_max_once:
                occurs_max_once[error_type] = 1
            else:
                occurs_max_once[error_type] += 1
    print(dataset, occurs_max_once)


test_za.txt {'total': 3955, 'replace': 2249, 'occurrences': 115, 'delete': 80, 'insert': 25}
test_sg.txt {'total': 2786, 'replace': 1666, 'delete': 76, 'occurrences': 104, 'insert': 27}
test_in.txt {'total': 12254, 'replace': 6954, 'occurrences': 302, 'delete': 239, 'insert': 68}
test_au.txt {'total': 5445, 'replace': 3252, 'insert': 40, 'occurrences': 144, 'delete': 138}
test_hk.txt {'total': 2686, 'replace': 1416, 'delete': 55, 'occurrences': 80, 'insert': 13}
test_ie.txt {'total': 5093, 'replace': 2987, 'occurrences': 139, 'delete': 123, 'insert': 30}
test_uk.txt {'total': 9933, 'replace': 5455, 'delete': 181, 'occurrences': 270, 'insert': 52}
test_ph.txt {'total': 3440, 'insert': 28, 'replace': 2056, 'occurrences': 115, 'delete': 86}
test_de.txt {'total': 4221, 'replace': 2799, 'occurrences': 118, 'insert': 31, 'delete': 77}
test_ni.txt {'total': 3302, 'replace': 2003, 'insert': 27, 'delete': 102, 'occurrences': 89}
test_us.txt {'total': 24579, 'replace': 13817, 'occurrences': 591,

In [5]:
def max_occurrences(errors, error_type, n):
    """Return the words and number of errors of the n words with the most occurrences of the given error type."""
    words = []
    for word, data in errors.items():
        if error_type in data:
            words.append((word, data[error_type], data["occurrences"]))
    words.sort(key=lambda x: x[1], reverse=True)
    return words[:n]

In [6]:
for dataset in errors:
    print(dataset)
    for error_type in error_types:
        print(error_type, max_occurrences(errors[dataset], error_type, 3))
    print()

test_za.txt
replace [('dont', 112, 115), ('its', 91, 259), ('im', 80, 82)]
insert [('the', 83, 5704), ('a', 36, 1834), ('and', 19, 1810)]
delete [('the', 22, 5704), ('is', 17, 1361), ('a', 11, 1834)]

test_sg.txt
replace [('the', 74, 2397), ('a', 41, 754), ('in', 26, 677)]
insert [('the', 35, 2397), ('a', 19, 754), ('of', 10, 928)]
delete [('the', 18, 2397), ('a', 12, 754), ('in', 7, 677)]

test_in.txt
replace [('the', 203, 13919), ('a', 151, 4272), ('its', 115, 528)]
insert [('the', 176, 13919), ('a', 119, 4272), ('and', 38, 4331)]
delete [('the', 106, 13919), ('a', 60, 4272), ('is', 34, 3859)]

test_au.txt
replace [('the', 115, 6848), ('its', 88, 312), ('dont', 86, 91)]
insert [('the', 96, 6848), ('a', 52, 2478), ('in', 16, 2062)]
delete [('the', 67, 6848), ('a', 36, 2478), ('is', 24, 1886)]

test_hk.txt
replace [('the', 63, 2611), ('its', 48, 115), ('a', 42, 917)]
insert [('the', 50, 2611), ('a', 34, 917), ('it', 14, 397)]
delete [('the', 26, 2611), ('a', 11, 917), ('is', 10, 746)]


In [7]:
# how often does each error type occur? Filter out words that occur max once in the dataset
for dataset in errors:
    error_distribution = dict()
    error_distribution["total"] = len(errors[dataset])
    for word, data in errors[dataset].items():
        if data["occurrences"] <= 1:
            continue
        for error_type in data:
            if error_type == "occurrences":
                continue
            if error_type not in error_distribution:
                error_distribution[error_type] = data[error_type]
            else:
                error_distribution[error_type] += data[error_type]
    print(dataset, error_distribution)    

test_za.txt {'total': 3955, 'replace': 2866, 'insert': 487, 'delete': 351}
test_sg.txt {'total': 2786, 'replace': 1445, 'delete': 210, 'insert': 277}
test_in.txt {'total': 12254, 'replace': 9078, 'insert': 1635, 'delete': 1160}
test_au.txt {'total': 5445, 'replace': 3568, 'insert': 691, 'delete': 573}
test_hk.txt {'total': 2686, 'replace': 1926, 'insert': 344, 'delete': 197}
test_ie.txt {'total': 5093, 'replace': 3457, 'insert': 609, 'delete': 502}
test_uk.txt {'total': 9933, 'replace': 9030, 'insert': 1697, 'delete': 1139}
test_ph.txt {'total': 3440, 'replace': 1928, 'delete': 295, 'insert': 383}
test_de.txt {'total': 4221, 'insert': 335, 'delete': 298, 'replace': 1532}
test_ni.txt {'total': 3302, 'insert': 318, 'replace': 1678, 'delete': 260}
test_us.txt {'total': 24579, 'replace': 26050, 'delete': 3056, 'insert': 4526}
test_ca.txt {'total': 5142, 'replace': 2879, 'delete': 443, 'insert': 583}
test_nz.txt {'total': 4172, 'replace': 4076, 'delete': 379, 'insert': 501}
test_sc.txt {'to

In [20]:
# how often does each error type occur? Filter out words that occur max once in the dataset
for dataset in errors:
    error_distribution = dict()
    error_distribution["total"] = len(errors[dataset])
    for word, data in errors[dataset].items():
        for error_type in data:
            if error_type == "occurrences":
                continue
            if error_type not in error_distribution:
                error_distribution[error_type] = data[error_type]
            else:
                error_distribution[error_type] += data[error_type]
    print(dataset, error_distribution)    

test_za.txt {'total': 3955, 'replace': 5118, 'insert': 636, 'delete': 431}
test_sg.txt {'total': 2786, 'replace': 3114, 'delete': 286, 'insert': 414}
test_in.txt {'total': 12254, 'replace': 16041, 'insert': 2024, 'delete': 1402}
test_au.txt {'total': 5445, 'replace': 6823, 'insert': 886, 'delete': 713}
test_hk.txt {'total': 2686, 'replace': 3342, 'insert': 442, 'delete': 252}
test_ie.txt {'total': 5093, 'replace': 6444, 'insert': 784, 'delete': 625}
test_uk.txt {'total': 9933, 'replace': 14492, 'insert': 2037, 'delete': 1321}
test_ph.txt {'total': 3440, 'insert': 533, 'replace': 3985, 'delete': 382}
test_de.txt {'total': 4221, 'replace': 4336, 'insert': 490, 'delete': 375}
test_ni.txt {'total': 3302, 'insert': 437, 'replace': 3684, 'delete': 362}
test_us.txt {'total': 24579, 'replace': 39885, 'delete': 3475, 'insert': 5276}
test_ca.txt {'total': 5142, 'replace': 6076, 'delete': 561, 'insert': 755}
test_nz.txt {'total': 4172, 'replace': 6324, 'delete': 458, 'insert': 650}
test_sc.txt {'

In [8]:
# how many replacements are there for each word?
for dataset in replacements:
    print("\n" + dataset)
    for word_ref, data in replacements[dataset].items():
        for word_pred, cnt in data.items():
            if cnt > 5:
                print(word_ref, word_pred, cnt)


test_za.txt
and in 8
its it's 80
a the 27
didnt didn't 50
the a 25
the their 9
for of 6
i i'm 6
thats that's 56
centre center 10
it it's 8
doesnt doesn't 12
ive i've 21
their the 15
in and 9
dont don't 111
whats what's 26
theres there's 15
lets let's 14
wouldnt wouldn't 11
im i'm 77
youll you'll 21
wasnt wasn't 17
youve you've 10
theyre they're 6
ill i'll 24
towns town's 7
cant can't 31
ones one's 16
youre you're 15
weve we've 6
this the 15
shes she's 7
isnt isn't 8
wont won't 15
hes he's 7
id i'd 17
boys boy's 10
well we'll 6
bands band's 6
couldnt couldn't 11
film form 6
shouldnt shouldn't 6

test_sg.txt
dont don't 15
doesnt doesn't 6
a the 25
in and 13
its it's 14
their the 9
im i'm 8
theyre they're 6
bands band's 6
and in 13
didnt didn't 6
theres there's 7
the a 37
the their 6

test_in.txt
you you're 6
hes he's 17
the this 16
the a 63
the their 16
the these 7
youre you're 41
youre are 8
didnt didn't 56
this these 12
this the 16
this his 6
albums album's 6
couldnt couldn't 19
ill i

In [31]:
def occurrences(errors, error_type):
    """Return the words and number of errors of the n words with the most occurrences of the given error type."""
    words = []
    for word, data in errors.items():
        if error_type in data:
            if data["occurrences"] > 5:
                words.append((word, data[error_type], data["occurrences"]))
    words.sort(key=lambda x: x[1], reverse=True)
    return words


for word in occurrences(errors["test_sc.txt"], "replace"):
    print(word)

('the', 1342, 10504)
('and', 768, 3321)
('in', 727, 2957)
('a', 618, 3253)
('is', 473, 2639)
('to', 452, 3020)
('of', 442, 3721)
('this', 252, 858)
('its', 248, 439)
('it', 243, 1569)
('for', 241, 1228)
('are', 217, 970)
('on', 209, 872)
('were', 208, 692)
('he', 205, 1668)
('his', 187, 965)
('at', 175, 645)
('dont', 173, 175)
('an', 171, 477)
('with', 170, 901)
('was', 160, 2335)
('they', 158, 521)
('from', 151, 616)
('by', 150, 790)
('their', 137, 320)
('that', 133, 836)
('as', 130, 876)
('im', 129, 131)
('there', 97, 450)
('one', 92, 394)
('had', 92, 488)
('have', 91, 536)
('her', 87, 406)
('these', 86, 256)
('i', 85, 934)
('be', 85, 596)
('you', 83, 776)
('not', 83, 437)
('can', 82, 319)
('some', 80, 265)
('she', 78, 502)
('has', 73, 559)
('many', 71, 203)
('also', 65, 762)
('two', 65, 349)
('no', 64, 251)
('but', 62, 355)
('or', 62, 297)
('didnt', 60, 64)
('been', 60, 318)
('will', 59, 177)
('thats', 57, 58)
('first', 57, 295)
('into', 57, 208)
('would', 57, 188)
('when', 56, 196)