In [1]:
import pandas as pd

from check_for_missing_apostrophes import filter_dataframe, results_dataframe, print_results

# Missing Apostrophes - Deeper Dive

In [2]:
# Path to the JSONL training data, given relative to the notebook's working directory.
filename = "../../Data/pipe_recalc6/pipeline_training_data.jsonl"

# The three helpers come from the local check_for_missing_apostrophes module;
# their internals are not visible in this notebook.
# Presumably loads the file and keeps rows containing single-letter words
# (the summary below reports that row count) -- TODO confirm against the helper.
filtered_df = filter_dataframe(filename)
# Appears to hold one row per detected issue (issue_type, pattern, ...), judging
# by the preview shown further down -- verify against the helper.
results_df = results_dataframe(filtered_df)
# Prints the totals and the sample of patterns shown in this cell's output.
print_results(filtered_df, results_df)

Total rows with single letter words: 231600
Total issues found: 97925
Unique patterns: 137
Missing apostrophe issues: 1818
Potential OCR issues: 96107

--- Summary of Patterns ---
0              We do not present results from the 0 to 30 m here because we have not sufficiently well defined the very strong seasonal cycle for this layer yet.
1              In the northeast, most of the bottom was covered by temperatures ranging from 1 to 4 C.
2              Maritimes Region State of the Ocean 2 Average Conditions Temperature and salinity conditions within the Scotian Shelf, Bay of Fundy and Gulf of Maine vary spatially due to complex bottom topography, transport from upstream sources such as the Gulf of St.
3              La température et la salinité augmentent généralement d'est en ouest et de la côte Région des Maritimes État de l'océan 3 au large, sous l'influence des eaux du large plus chaudes et plus salées, et de l'apport d'eau plus douce du golfe du Saint-Laurent.
4              

In [3]:
# Language split of the filtered rows. Leaving the Series as the cell's last
# expression lets Jupyter display it, so no print() wrapper is needed.
filtered_df["source_lang"].value_counts()

source_lang
fr    137243
en     94357
Name: count, dtype: int64


In [4]:
# Preview the first ten detected issues to sanity-check the result columns.
results_df.head(n=10)

Unnamed: 0,original_index,source,source_lang,issue_type,pattern,text_with_issue
0,1,C est pourquoi on mesure les conditions océano...,fr,missing_apostrophe,c est,C est pourquoi on mesure les conditions océano...
1,10,Maritimes Region State of the Ocean 2 Average ...,en,ocr_or_other,2,Maritimes Region State of the Ocean 2 Average ...
2,16,"At the surface, the range is about 16 C but th...",en,ocr_or_other,c,"At the surface, the range is about 16 C but th..."
3,16,"At the surface, the range is about 16 C but th...",en,ocr_or_other,m,"At the surface, the range is about 16 C but th..."
4,17,"À la surface, elle est d'environ 16 C, mais on...",fr,ocr_or_other,c,"À la surface, elle est d'environ 16 C, mais on..."
5,17,"À la surface, elle est d'environ 16 C, mais on...",fr,ocr_or_other,m,"À la surface, elle est d'environ 16 C, mais on..."
6,22,"In summer, seasonal heating forms a thin (30-4...",en,ocr_or_other,m,"In summer, seasonal heating forms a thin (30-4..."
7,23,"En été, le réchauffement saisonnier crée une m...",fr,ocr_or_other,m,"En été, le réchauffement saisonnier crée une m..."
8,24,The winter-cooled waters form a cold intermedi...,en,ocr_or_other,m,The winter-cooled waters form a cold intermedi...
9,25,Les eaux de refroidissement hivernal forment u...,fr,ocr_or_other,m,Les eaux de refroidissement hivernal forment u...


In [5]:
# How many issues fall into each issue_type. Leaving the Series as the cell's
# last expression lets Jupyter display it, so no print() wrapper is needed.
results_df["issue_type"].value_counts()

issue_type
ocr_or_other          96107
missing_apostrophe     1818
Name: count, dtype: int64


In [8]:
# Boolean mask: French-source rows whose issue type is "ocr_or_other".
ocr_fr = results_df["issue_type"].eq("ocr_or_other") & results_df["source_lang"].eq("fr")
# Frequency of each flagged pattern within that subset.
results_df.loc[ocr_fr, "pattern"].value_counts()

pattern
1    6154
2    4287
p    3472
3    3036
5    3005
c    2917
m    2483
4    2316
b    1572
0    1569
t    1469
6    1401
f    1266
7    1118
8    1085
r     908
s     872
o     773
d     751
9     718
g     685
k     627
h     622
x     569
i     546
e     543
n     507
q     449
l     399
z     354
j     257
v     201
w      80
u      74
é      62
â       4
á       2
î       1
Name: count, dtype: int64

In [9]:
# Boolean mask: English-source rows whose issue type is "ocr_or_other".
ocr_en = results_df["issue_type"].eq("ocr_or_other") & results_df["source_lang"].eq("en")
# Frequency of each flagged pattern within that subset.
results_df.loc[ocr_en, "pattern"].value_counts()

pattern
m    5270
1    5105
s    4859
2    4038
3    2960
5    2835
t    2603
c    2566
4    2108
b    1614
0    1376
6    1304
f    1227
p    1144
8    1022
7     988
n     879
r     863
d     689
g     666
9     664
k     634
e     536
h     532
l     496
x     470
q     465
z     347
j     194
w     174
v     135
y     102
u      84
â       3
î       1
Name: count, dtype: int64

In [11]:
# Boolean mask: French-source rows whose issue type is "missing_apostrophe".
apostrophe_fr = results_df["issue_type"].eq("missing_apostrophe") & results_df["source_lang"].eq("fr")
# Frequency of each flagged pattern within that subset.
results_df.loc[apostrophe_fr, "pattern"].value_counts()

pattern
c est        522
j ai         123
d après       35
m a           29
c a           29
m étaient     25
c était       23
s il          22
c en          15
en c          15
l âge         15
n est         14
d une         14
t on          12
s ils         11
j aurais      11
a c           11
m en          10
j aime         9
en m           8
d un           8
t est          8
t il           7
t a            7
j espère       7
s en           6
s est          5
l est          5
t en           3
n a            3
l un           3
étaient m      3
l a            3
j étais        3
d accord       2
j attends      2
a l            2
en t           2
d où           2
d ici          1
j avais        1
j imagine      1
l avait        1
j entends      1
âge l          1
s était        1
est t          1
entre d        1
l eau          1
n importe      1
est c          1
elle d         1
m y            1
t elle         1
Name: count, dtype: int64

In [10]:
# Boolean mask: English-source rows whose issue type is "missing_apostrophe".
apostrophe_en = results_df["issue_type"].eq("missing_apostrophe") & results_df["source_lang"].eq("en")
# Frequency of each flagged pattern within that subset.
results_df.loc[apostrophe_en, "pattern"].value_counts()

pattern
don t        184
it s         104
doesn t       82
can t         45
isn t         42
didn t        32
shouldn t     31
that s        27
i m           27
aren t        24
wasn t        24
weren t       19
won t         18
wouldn t      16
haven t       12
there s       11
hasn t         8
couldn t       7
what s         7
i d            6
where s        5
s it           4
s when         3
t can          3
s where        2
d there        2
hadn t         2
you d          2
d we           2
t t            2
s that         2
s s            2
d what         1
we d           1
it d           1
that d         1
t t,           1
when s         1
he s           1
s T            1
d S            1
f S            1
d that         1
s there        1
Name: count, dtype: int64