In [1]:
from check_for_missing_apostrophes import (create_filtered_dataframe, create_results_dataframe, print_results, 
                                           check_uncleaned_data, extract_contractions_from_data, check_space_apostrophe_patterns,
                                           create_cleaning_dict_from_space_patterns)

# Input artifacts, given as paths relative to the notebook's working directory.
# NOTE(review): the matched data is read from pipe_recalc5 while the training
# data is read from pipe_recalc6 — confirm the differing directories are intended.
filename_matched = "../../Data/pipe_recalc5/pipeline_matched_data_wo_linebreaks.pickle"
filename_training = "../../Data/pipe_recalc6/pipeline_training_data.jsonl"

# Missing Apostrophes - Deeper Dive

## Which patterns have an extra space before or after the apostrophe?
### CONCLUSION: these look very messy and should just be excluded

In [2]:
# Run the space/apostrophe pattern check on the matched-data file. The result
# is previewed with .head() and later passed to
# create_cleaning_dict_from_space_patterns.
df_apo_space = check_space_apostrophe_patterns(filename_matched)

In [3]:
# Preview the first rows to inspect the columns returned by the check.
df_apo_space.head()

Unnamed: 0,original,pattern,expanded_pattern,cleaned
0,Le taux de capture dans les eaux cotieres est ...,l',l' etat,
1,"Le taux d' exploitation cible pour ce stock, q...",d',d' exploitation,d'exploitation
2,En plus de trois documents de travail presente...,l',l' etat,
3,"Des indicateurs de l' etat du stock, aucun n'i...",l',l' etat,
4,Etant donne que la peche visant les reproducte...,n',n' a,n'a


In [None]:
# Build the cleaning dictionary from the spaced-apostrophe patterns. The sample
# printed further down shows the spaced form as key and the joined form as value.
cleaning_dict = create_cleaning_dict_from_space_patterns(df_apo_space)
# Cell result: number of entries in the dictionary.
len(cleaning_dict)

In [10]:
# Preview the first 11 entries: key on one line, value on the next, then a
# blank separator line.
for spaced_form, joined_form in list(cleaning_dict.items())[:11]:
    print(spaced_form, joined_form, "", sep="\n")


d' exploitation
d'exploitation

n' a
n'a

I' exploitation
I'exploitation

l' examen
l'examen

!' atelier
!'atelier

!' estimation
!'estimation

d 'un
d'un

l' industrie,
l'industrie,

n' ont
n'ont

d 'estimer
d'estimer

I' abondance
I'abondance



## Which patterns might be legitimate? Check whether we should add them

In [None]:
# Extract the contraction patterns from the matched data, once per language.
# Each result is used below as a mapping: it is iterated with .items(), and its
# values are sliced and measured with len().
fr_contractions = extract_contractions_from_data(filename_matched, lang='fr')
en_contractions = extract_contractions_from_data(filename_matched, lang='en')

In [None]:
print(f"there are {len(fr_contractions)} french contraction pattern groups\n")

# One line per group: the pattern key, a tab, then at most ten of its entries.
for pattern, examples in fr_contractions.items():
    print(pattern, end="\t")
    if len(examples) > 10:
        # Truncated preview: drop only the closing bracket of the repr and
        # append an ellipsis. A blanket str.replace("]", "...") would also
        # rewrite any "]" that occurs inside the entries themselves.
        preview = str(examples[:10])
        if preview.endswith("]"):
            preview = preview[:-1]
        print(preview + "...")
    else:
        print(examples)
    

In [None]:
print(f"there are {len(en_contractions)} english contraction pattern groups\n")

# One line per group: the pattern key, a tab, then at most ten of its entries.
for pattern, examples in en_contractions.items():
    print(pattern, end="\t")
    if len(examples) > 10:
        # Truncated preview: drop only the closing bracket of the repr and
        # append an ellipsis. A blanket str.replace("]", "...") would also
        # rewrite any "]" that occurs inside the entries themselves.
        preview = str(examples[:10])
        if preview.endswith("]"):
            preview = preview[:-1]
        print(preview + "...")
    else:
        print(examples)
    

In [None]:
# Run the uncleaned-data check over the matched-data file. The result has a
# "count" column, which the following cell sorts on.
df_matched_errors = check_uncleaned_data(filename_matched)

In [None]:
# The 50 rows with the highest "count", largest first.
(
    df_matched_errors
    .sort_values(by="count", ascending=False)
    .head(50)
)

In [None]:


# Build the filtered frame from the training-data JSONL file, derive the
# results frame from it, then print the results. Both frames are reused by
# the cells that follow.
filtered_df = create_filtered_dataframe(filename_training)
results_df = create_results_dataframe(filtered_df)
print_results(filtered_df, results_df)

In [None]:
# Number of rows per source language in the filtered frame.
print(filtered_df['source_lang'].value_counts())

In [None]:
# Number of rows per issue type in the results frame.
print(results_df.issue_type.value_counts())

In [None]:
# Mask: rows with source language "fr" whose issue type is "ocr_or_other".
ocr_fr = (results_df["source_lang"] == "fr") & (results_df["issue_type"] == "ocr_or_other")
# Occurrences of each pattern among those rows, listed in pattern order.
results_df.loc[ocr_fr, "pattern"].value_counts().sort_index()

In [None]:
# Mask: rows with source language "en" whose issue type is "ocr_or_other".
ocr_en = (results_df["source_lang"] == "en") & (results_df["issue_type"] == "ocr_or_other")
# Occurrences of each pattern among those rows, listed in pattern order.
results_df.loc[ocr_en, "pattern"].value_counts().sort_index()

In [None]:
# Mask: rows with source language "fr" whose issue type is "missing_apostrophe".
apostrophe_fr = (results_df["source_lang"] == "fr") & (results_df["issue_type"] == "missing_apostrophe")
# Keep the 50 most frequent patterns, then list them in pattern order.
results_df.loc[apostrophe_fr, "pattern"].value_counts().head(50).sort_index()

In [None]:
# Print up to 20 French source texts that contain "l'indice", each followed by
# a blank line.
# - regex=False: the needle is matched as a literal substring.
# - na=False: rows whose source is missing count as non-matching instead of
#   putting NaN into the boolean mask.
# - the sample size is capped at the number of matches, because
#   DataFrame.sample raises when asked for more rows than exist (without
#   replacement).
# - random_state pins the draw so re-running the cell shows the same rows.
indice_fr = filtered_df[
    filtered_df.source.str.contains("l'indice", regex=False, na=False)
    & (filtered_df.source_lang == "fr")
]
for sentence in indice_fr.sample(n=min(20, len(indice_fr)), random_state=42)["source"]:
    print(sentence)
    print()

In [None]:
# Mask: rows with source language "en" whose issue type is "missing_apostrophe".
apostrophe_en = (results_df["source_lang"] == "en") & (results_df["issue_type"] == "missing_apostrophe")
# Occurrences of each pattern among those rows, listed in pattern order.
results_df.loc[apostrophe_en, "pattern"].value_counts().sort_index()