In [1]:
from collections import Counter
from spellchecker import SpellChecker
import unicodedata
import re
import pandas as pd

from create_jsonl import save_jsonl

# pandas formatting
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('max_colwidth', 100)
pd.set_option('display.float_format', '{:.2f}'.format)

# the same dataset after cleaning
df_features = pd.read_pickle("../Data/df_with_features.pickle")

percentiles = [0.00135, 0.02275, 0.1587, 0.8413, 0.97725, 0.99865]
columns = ['50%', '0.1%', '2.3%', '15.9%', '84.1%', '97.7%', '99.9%']

# New Feature - number of 1-character words

In [2]:
actual_one_char_words_fr = ['À', 'A', 'L', 'D', 'N', 'Y', 'M', 'S', 'T', 'à', 'a', 'l', 'd', 'n', 'y', 'm', 's', 't']
df_features['one_char_words_fr'] = df_features['fr'].apply(lambda s: sum(len(w) == 1 for w in s.split() if w not in actual_one_char_words_fr))

actual_one_char_words_en = ['A', 'I', 'O', 'a', 'o']
df_features['one_char_words_en'] = df_features['en'].apply(lambda s: sum(len(w) == 1 for w in s.split() if w not in actual_one_char_words_en))

# Cleaning Single Letter Words and Missing Apostrophes (fixing bad OCR)

In [3]:
# make a backup to check before replacements to check results
df_features_backup1 = df_features.copy()
df_features_backup1 = df_features_backup1[['fr']]
df_features_backup1.columns = ['fr_before']

In [4]:
always_have_apostrophe = ['L', 'D', 'N', 'M', 'S', 'T', 'l', 'd', 'n', 'm', 's', 't']

missing_apostrophe_patterns = []
replacement_patterns = []

for letter in always_have_apostrophe:
  # mid-sentence
  missing_apostrophe_patterns.append(f" {letter} ")
  replacement_patterns.append(f" {letter}'")

  # start of sentence
  missing_apostrophe_patterns.append(f"^{letter} ")
  replacement_patterns.append(f"{letter}'")

n_with_missing = df_features.loc[
    df_features['fr'].str.contains('|'.join(missing_apostrophe_patterns), na=False, case=False),
].shape[0]
n_total = df_features.shape[0]
print(f"{n_with_missing} out of {n_total} sentences are missing apostrophes ({n_with_missing / n_total:.0%})")

431581 out of 778951 sentences are missing apostrophes (55%)


In [5]:
normal_apostrophe, curved_apostrophe = "'", "’"

print(f"Normal apostrophe ({normal_apostrophe}): {df_features['fr'].str.count(normal_apostrophe).sum()}")
print(f"Curved apostrophe ({curved_apostrophe}): {df_features['fr'].str.count(curved_apostrophe).sum()}")

Normal apostrophe ('): 200432
Curved apostrophe (’): 0


In [6]:
# clean up these OCR apostrophe issues
df_features['fr'] = df_features['fr'].replace(
    dict(zip(missing_apostrophe_patterns, replacement_patterns)), 
    regex=True
)

n_with_missing = df_features.loc[
    df_features['fr'].str.contains('|'.join(missing_apostrophe_patterns), na=False, case=False),
].shape[0]
n_total = df_features.shape[0]
print(f"after cleaning, {n_with_missing} out of {n_total} sentences are missing apostrophes ({n_with_missing / n_total:.0%})")

after cleaning, 0 out of 778951 sentences are missing apostrophes (0%)


In [7]:
# how much difference?

before = int(df_features_backup1['fr_before'].apply(lambda s: sum(len(w) == 1 for w in s.split() if w not in actual_one_char_words_fr)).sum())
after = int(df_features['fr'].apply(lambda s: sum(len(w) == 1 for w in s.split() if w not in actual_one_char_words_fr)).sum())
difference = before - after

before, after, difference

(291800, 288933, 2867)

# check for missing non-english symbols

In [8]:
def has_non_english_chars(word):
    return bool(re.search(r'[^\x00-\x7F]', word))

def remove_accents(text):
    return ''.join(c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn')

french_words_with_accents = []
for sentence in df_features['fr'].to_list():
    for word in sentence.split():
        clean_word = word.replace('(', '').replace(')', '')
        if clean_word.isalpha() and has_non_english_chars(clean_word):
            french_words_with_accents.append(clean_word.lower())

word_counts = Counter(french_words_with_accents)

accent_mapping = pd.DataFrame([
    {
        'anglicised': remove_accents(word),
        'accented': word,
        'count': count
    }
    for word, count in word_counts.items()
]).sort_values('count', ascending=False).reset_index(drop=True)

In [9]:
# check accent_mapping for duplicates 
#  create list of duplicates (to classify as potential quality issues)
potential_accent_issues_ambiguous = accent_mapping.loc[accent_mapping.duplicated('anglicised', keep="first"), 'anglicised'].to_list()
#  split into non-duplicates (to clean)
accent_mapping = accent_mapping.drop_duplicates('anglicised', keep=False)

# check for real words in mispelled list
spell = SpellChecker(language='fr')
#  add all anglicised words that are real french words to another different potentially bad word list
potential_accent_issues_real_words = accent_mapping.loc[accent_mapping['anglicised'].isin(spell), 'anglicised'].to_list()
#  remove all anglicised words that are real french words 
accent_mapping = accent_mapping[~accent_mapping['anglicised'].isin(spell)]

# NOTE: could there be multiple correct fixes? not in the corpus of data, we've already checked for duplicates, so should be low chances

# take the top 1000 most common words that could be cleaned
#  add the rest to the potentially bad words
potential_accent_issues_uncommon = accent_mapping.tail(accent_mapping.shape[0]-1000).anglicised.to_list()
# create dict from remaining words for cleaning
accent_mapping = accent_mapping.head(1000)
replacement_dict = accent_mapping.set_index('anglicised')['accented'].to_dict()

In [10]:
# backup again before replacing

df_features_backup2 = df_features.copy()
df_features_backup2 = df_features_backup2[['fr']]
df_features_backup2.columns = ['fr_before']

In [11]:
def create_replacement_regex(replacement_map):
    pattern = r'\b(' + '|'.join([re.escape(k) for k in replacement_map.keys()]) + r')\b'
    
    def replace_func(match):
        matched_word = match.group(1)
        return replacement_map.get(matched_word, matched_word)
    
    return pattern, replace_func

pattern, replace_func = create_replacement_regex(replacement_dict)
df_features['fr'] = df_features['fr'].str.replace(pattern, replace_func, regex=True)

In [12]:
# comparisons

df_features_compare1 = pd.concat([
    df_features, df_features_backup1
], axis=1)[['fr', 'fr_before']]
df_features_compare1 = df_features_compare1[df_features_compare1.fr != df_features_compare1.fr_before]

df_features_compare2 = pd.concat([
    df_features, df_features_backup2
], axis=1)[['fr', 'fr_before']]

df_features_compare2 = df_features_compare2[df_features_compare2.fr != df_features_compare2.fr_before]

print(df_features_compare1.shape[0], 'corrections of single letter words')
print(df_features_compare2.shape[0], 'corrections of mis-accented words')

436113 corrections of single letter words
5225 corrections of mis-accented words


In [13]:
df_features_compare1.sample().T

Unnamed: 0,703470
fr,Les auteurs ne peuvent pas affirmer que l'abondance augmente en utilisant la référence de Willia...
fr_before,Les auteurs ne peuvent pas affirmer que l abondance augmente en utilisant la référence de Willia...


In [14]:
df_features_compare2.sample().T

Unnamed: 0,144708
fr,On a fait appel aux statistiques de vraisemblance pour évaluer les contributions relatives du te...
fr_before,On a fait appel aux statistiques de vraisemblance pour evaluer les contributions relatives du te...


# create troublesome accented words exclusions

In [15]:
# potential_accent_issues_ambiguous 
#  this list is useless, there are so many words that could be accented differently, and many are very common
df_features['fr'].head(10).apply(lambda s: sum(w in potential_accent_issues_ambiguous for w in s.split()))

0    23
1    17
2    13
3    13
4     7
5     4
6    10
7     7
8     5
9     6
Name: fr, dtype: int64

In [16]:
# potential_accent_issues_real_words
#  this list is also pretty useless, too many matches
df_features['fr'].head(10).apply(lambda s: sum(w in potential_accent_issues_real_words for w in s.split()))

0    4
1    3
2    6
3    8
4    2
5    3
6    2
7    1
8    0
9    1
Name: fr, dtype: int64

In [19]:
# potential_accent_issues_uncommon 
#  these look useful - the anglicised fr words with no potential duplicate, that weren't replaced above
df_features['fr'].head(10).apply(lambda s: sum(w in potential_accent_issues_uncommon for w in s.split()))

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
Name: fr, dtype: int64

In [20]:
# NOTE: very slow, takes 25 min to calculate this
df_features['potential_fr_accent_issues'] = df_features['fr'].apply(lambda s: sum(w in potential_accent_issues_uncommon for w in s.split()))

In [22]:
pd.DataFrame(df_features['potential_fr_accent_issues'].describe(percentiles=[0.9, 0.99, 0.999])).T

Unnamed: 0,count,mean,std,min,50%,90%,99%,99.9%,max
potential_fr_accent_issues,778951.0,0.02,0.14,0.0,0.0,0.0,1.0,1.0,17.0


# save the file

In [23]:
df_features.to_pickle('../Data/df_with_more_features.pickle')