In [1]:
import difflib
import json
import os
import random
import re

import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from alive_progress import alive_bar
from sentence_transformers import SentenceTransformer, util

from language_classifier.language_classifier import LanguageClassifier

# formatting: one-decimal floats, show every column, cap rows and cell width at 100
pd.set_option(
    'display.float_format', '{:.1f}'.format,
    'display.max_columns', None,
    'display.max_rows', 100,
    'display.max_colwidth', 100,
)

# import data

In [2]:
# input locations: parsed publication JSON tree and the FR/EN correlation table
parsed_docs_folder = os.path.join("..", "ParsedPublications")
fr_eng_correlation_csv = "fr_eng_correlation_data.csv"

fr_eng_correlation_df = pd.read_csv(fr_eng_correlation_csv)

# weblinks for previewing / testing (independent copy, link-related columns only)
weblinks_df = fr_eng_correlation_df.copy()[
    ['pub_number', 'nom', 'name', 'url_fr', 'url_en', 'file_url_fr', 'file_url_en']
]

# simplified correlation table: only the columns the processing functions read
fr_eng_correlation_df = fr_eng_correlation_df[
    ['pub_number', 'filename_fr', 'filename_en']
]

# functions

In [3]:
# DATA CLEANING FUNCTIONS

def clean_text(text):
    """Reduce ``text`` to letters and basic punctuation separated by single spaces.

    Every character that is not an ASCII letter, a Latin-1 accented letter, or
    one of ``. , ; : ! ? ( ) ' " -`` is replaced by a space (this includes
    digits, newlines and tabs). Runs of whitespace are then collapsed to one
    space and leading/trailing whitespace is removed.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    disallowed = re.compile(r"[^a-zA-ZÀ-ÖØ-öø-ÿ.,;:!?()'\"-]")
    spaced = disallowed.sub(' ', text)
    # After the substitution the only whitespace left is the plain space, so
    # split/join collapses runs and trims both ends in one step.
    return ' '.join(spaced.split())


def get_files_for_publication(pub_number, fr_eng_correlation_df):
    """Look up the French and English filenames recorded for a publication.

    Args:
        pub_number: Value to match against the ``pub_number`` column.
        fr_eng_correlation_df: DataFrame with ``pub_number``, ``filename_fr``
            and ``filename_en`` columns.

    Returns:
        ``(filename_fr, filename_en)`` taken from the first matching row, or
        ``(None, None)`` when no row matches.
    """
    matches = fr_eng_correlation_df[fr_eng_correlation_df['pub_number'] == pub_number]
    if matches.empty:
        return None, None

    # Only the first match is used, even if several rows share the number.
    return tuple(matches[column].values[0] for column in ('filename_fr', 'filename_en'))


def get_json_file_link(parsed_docs_folder, pdf_filename):
    """Find the parsed JSON file that corresponds to a PDF filename.

    The JSON file is expected to be named ``<pdf_filename>.json`` and may live
    anywhere under ``parsed_docs_folder``; the first match found by
    ``os.walk`` is returned.

    Args:
        parsed_docs_folder: Root directory to search recursively.
        pdf_filename: Name of the PDF (must end in ``.pdf``). Non-string
            values such as ``None`` or a NaN from a missing CSV cell are
            treated as "no file".

    Returns:
        The path to the JSON file, or ``None`` if ``pdf_filename`` is not a
        ``.pdf`` name or no matching file exists.
    """
    # A missing cell read by pandas is a float NaN, which has no .endswith();
    # reject anything that is not a string before touching string methods.
    if not isinstance(pdf_filename, str) or not pdf_filename.endswith(".pdf"):
        return None

    json_filename = pdf_filename + ".json"
    for root, _, files in os.walk(parsed_docs_folder):
        if json_filename in files:
            return os.path.join(root, json_filename)
    return None


def extract_text_from_single_file(json_file, target_language, clf):
    """Extract the text blocks of one language from a parsed-document JSON file.

    The file's ``'text'`` value is cleaned with ``clean_text``, split into
    blocks, and each block of 10 to 500 characters (after stripping) is kept
    only if ``clf.classify(block)`` equals ``target_language``.

    Args:
        json_file: Path to a UTF-8 JSON file containing a ``'text'`` key.
        target_language: Label that ``clf.classify`` must return for a block
            to be kept (e.g. ``"fr"`` or ``"en"``).
        clf: Object exposing ``classify(text)``.

    Returns:
        The kept blocks, each suffixed with ``'. '``, joined by single spaces.

    Raises:
        KeyError: If the JSON content has no ``'text'`` key.
    """
    min_block_length, max_block_length = 10, 500

    with open(json_file, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    if 'text' not in payload:
        raise KeyError(f"The key 'text' is missing in the JSON file: {json_file}")

    # NOTE: clean_text turns every newline into a space, so the '\n\n'
    # alternative of the split pattern cannot match here; blocks are in
    # practice delimited only by sentence-ending punctuation plus whitespace.
    candidates = (
        piece.strip()
        for piece in re.split(r'(?<![;,])[.?!]\s|\n\n', clean_text(payload['text']))
    )

    # Length is checked first so out-of-range blocks are never classified.
    kept = [
        piece + '. '
        for piece in candidates
        if min_block_length <= len(piece) <= max_block_length
        and clf.classify(piece) == target_language
    ]

    return " ".join(kept)


def extract_both_languages_from_two_files(json_file_fr, json_file_en, clf):
    """Extract French text from one JSON file and English text from another.

    Args:
        json_file_fr: Path to the parsed JSON of the French document.
        json_file_en: Path to the parsed JSON of the English document.
        clf: Object exposing ``classify(text)``.

    Returns:
        ``(text_fr, text_en)`` as produced by ``extract_text_from_single_file``
        with target languages ``"fr"`` and ``"en"`` respectively.
    """
    french_text = extract_text_from_single_file(json_file_fr, "fr", clf)
    english_text = extract_text_from_single_file(json_file_en, "en", clf)
    return french_text, english_text


def extract_both_languages_from_single_file(json_file, clf):
    """Split one bilingual parsed-document JSON file into French and English text.

    The file's ``'text'`` value is split into blocks; each block of 10 to 500
    characters (after stripping) is classified once and routed to the French
    or English output. Blocks classified as anything else are dropped.

    Args:
        json_file: Path to a UTF-8 JSON file containing a ``'text'`` key.
        clf: Object exposing ``classify(text)`` that returns ``"fr"`` or
            ``"en"`` for the blocks to keep.

    Returns:
        ``(text_fr, text_en)``: for each language, the kept blocks suffixed
        with ``'. '`` and joined by single spaces.

    Raises:
        KeyError: If the JSON content has no ``'text'`` key.
    """
    min_block_length = 10
    max_block_length = 500

    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    if 'text' not in data:
        raise KeyError(f"The key 'text' is missing in the JSON file: {json_file}")

    # NOTE(review): unlike extract_text_from_single_file, the raw text is not
    # passed through clean_text here — confirm whether that is intentional.
    full_text = data['text']
    text_blocks = re.split(r'(?<![;,])[.?!]\s|\n\n', full_text)
    text_fr, text_en = [], []

    for block in text_blocks:
        block = block.strip()
        if len(block) < min_block_length or len(block) > max_block_length:
            continue

        # Classify once per block: the classifier call is the expensive step,
        # and asking twice could also give inconsistent answers.
        language = clf.classify(block)
        if language == "fr":
            text_fr.append(block + '. ')
        elif language == "en":
            text_en.append(block + '. ')

    return " ".join(text_fr), " ".join(text_en)


def create_sentences(text_fr, text_en):
    """Split French and English text into lists of non-empty sentences.

    A sentence boundary is ``.``, ``?`` or ``!`` followed by whitespace
    (unless the punctuation is directly preceded by ``;`` or ``,``), or a
    blank line (``\\n\\n``). The boundary itself is not kept.

    Args:
        text_fr: French text.
        text_en: English text.

    Returns:
        ``(sentences_fr, sentences_en)``: two lists of stripped sentences with
        empty and whitespace-only fragments removed.
    """
    boundary = re.compile(r'(?<![;,])[.?!]\s|\n\n')

    def split(text):
        # Strip before filtering so that whitespace-only fragments are dropped
        # as well, instead of surviving as empty strings.
        stripped = (fragment.strip() for fragment in boundary.split(text))
        return [sentence for sentence in stripped if sentence]

    return split(text_fr), split(text_en)


def create_similarity_matrix(sentences_fr, sentences_en, sentence_encoder):
    """Compute pairwise cosine similarity between French and English sentences.

    Args:
        sentences_fr: List of French sentences.
        sentences_en: List of English sentences.
        sentence_encoder: Object exposing
            ``encode(sentences, convert_to_tensor=True)``.

    Returns:
        The result of ``util.pytorch_cos_sim`` on the two embedding sets, with
        French sentences as the first argument and English as the second.
    """
    # Encode French first, then English, with identical settings.
    embeddings_fr, embeddings_en = (
        sentence_encoder.encode(sentences, convert_to_tensor=True)
        for sentences in (sentences_fr, sentences_en)
    )
    return util.pytorch_cos_sim(embeddings_fr, embeddings_en)


def align_sentences(sim_matrix, threshold=0.7):
    """Find an order-preserving alignment that maximises total similarity.

    Similarities below ``threshold`` are treated as 0. A dynamic programme
    then picks a set of (row, column) pairs that is strictly increasing in
    both indices and has the largest sum of retained similarities.

    Args:
        sim_matrix: 2-D similarity matrix (rows = first language, columns =
            second language). May be a NumPy array, a nested sequence, or a
            tensor-like object exposing ``detach().cpu().numpy()``.
        threshold: Minimum similarity for a pair to count as a match.

    Returns:
        List of ``(row_index, col_index)`` tuples in increasing order. Empty
        when either dimension is 0 or nothing reaches the threshold.
    """
    # NumPy cannot read a tensor that lives on an accelerator or tracks
    # gradients, so bring tensor-like inputs to host memory first.
    if hasattr(sim_matrix, "detach"):
        sim_matrix = sim_matrix.detach().cpu().numpy()
    sim_matrix = np.asarray(sim_matrix)

    n, m = sim_matrix.shape
    if n == 0 or m == 0:
        return []

    weights = np.where(sim_matrix >= threshold, sim_matrix, 0.0)

    # dp[i, j] = best score using the first i rows and first j columns.
    dp = np.zeros((n + 1, m + 1), dtype=np.float32)

    for i in range(1, n + 1):
        # Best of "match (i, j)" and "skip row i" for every column at once.
        match_or_skip_row = np.maximum(dp[i - 1, :-1] + weights[i - 1], dp[i - 1, 1:])
        # "Skip column j" means inheriting dp[i, j-1], i.e. a running maximum
        # along the row. dp[i, 0] is 0 and all candidates are >= 0, so the
        # running maximum reproduces the cell-by-cell recurrence exactly.
        dp[i, 1:] = np.maximum.accumulate(match_or_skip_row)

    # Trace back from the bottom-right corner, preferring skips on ties.
    aligned_pairs = []
    i, j = n, m
    while i > 0 and j > 0:
        current_val = dp[i, j]
        if np.isclose(current_val, dp[i - 1, j]):
            i -= 1
        elif np.isclose(current_val, dp[i, j - 1]):
            j -= 1
        else:
            if weights[i - 1, j - 1] > 0:
                aligned_pairs.append((i - 1, j - 1))
            i -= 1
            j -= 1

    aligned_pairs.reverse()

    return aligned_pairs


def text_from_coordinates(aligned_pairs, sentences_fr, sentences_en, pub_number):
    """Turn aligned index pairs into ``(pub_number, fr, en)`` records.

    Args:
        aligned_pairs: Iterable of ``(i, j)`` index pairs.
        sentences_fr: French sentences, indexed by ``i``.
        sentences_en: English sentences, indexed by ``j``.
        pub_number: Identifier stored as the first element of every record.

    Returns:
        A list with one ``(pub_number, sentences_fr[i], sentences_en[j])``
        tuple per pair, in the order the pairs were given.
    """
    return [
        (pub_number, sentences_fr[fr_index], sentences_en[en_index])
        for fr_index, en_index in aligned_pairs
    ]


def correlate_and_clean_text(text_fr, text_en, pub_number, sentence_encoder):
    """Align the sentences of a French/English text pair for one publication.

    The texts are split into sentences, a similarity matrix is computed with
    ``sentence_encoder``, and the sentences are aligned with the default
    threshold of ``align_sentences``.

    Args:
        text_fr: French text of the publication.
        text_en: English text of the publication.
        pub_number: Identifier attached to every returned record.
        sentence_encoder: Encoder passed to ``create_similarity_matrix``.

    Returns:
        List of ``(pub_number, french_sentence, english_sentence)`` tuples.
    """
    sentences_fr, sentences_en = create_sentences(text_fr, text_en)
    pairs = align_sentences(
        create_similarity_matrix(sentences_fr, sentences_en, sentence_encoder)
    )
    return text_from_coordinates(pairs, sentences_fr, sentences_en, pub_number)


def process_all_rows(fr_eng_correlation_df, parsed_docs_folder, clf, sentence_encoder):
    """Build a table of aligned French/English sentences for every publication.

    For each row, the parsed JSON file(s) are located, French and English text
    are extracted, low-quality pairs are filtered out, and the remaining text
    is aligned sentence by sentence. A progress bar is advanced once per row.

    Rows are skipped when:
      * a filename is missing (not a string) or both are ``"WITHDRAWN"``;
      * a required JSON file cannot be found;
      * either text is empty or shorter than ``min_char`` characters;
      * one text is more than ``max_ratio`` times longer than the other.

    Args:
        fr_eng_correlation_df: DataFrame with ``pub_number``, ``filename_fr``
            and ``filename_en`` columns.
        parsed_docs_folder: Root directory of the parsed JSON files.
        clf: Language classifier exposing ``classify(text)``.
        sentence_encoder: Encoder used to embed sentences for alignment.

    Returns:
        DataFrame with columns ``pub_number``, ``fr`` and ``en``, one row per
        aligned sentence pair.
    """
    matched_data = []
    max_ratio = 2  # low quality / only abstract data to exclude (<7% of total translated data)
    min_char = 1000  # low quality, bad OCR, or incomplete transcription / parsing

    with alive_bar(fr_eng_correlation_df.shape[0], force_tty=True) as bar:
        for _, row in fr_eng_correlation_df.iterrows():
            bar()

            pub_number = row['pub_number']
            filename_fr, filename_en = row['filename_fr'], row['filename_en']

            # A missing CSV cell is read as NaN (a float); there is no file to
            # look up, so skip the row instead of failing on a string method.
            if not isinstance(filename_fr, str) or not isinstance(filename_en, str):
                continue

            if filename_fr == "WITHDRAWN" and filename_en == "WITHDRAWN":
                continue

            fr_link = get_json_file_link(parsed_docs_folder, filename_fr)
            if fr_link is None:
                continue

            if filename_fr == filename_en:
                # One bilingual document: split it by language.
                text_fr, text_en = extract_both_languages_from_single_file(fr_link, clf)
            else:
                en_link = get_json_file_link(parsed_docs_folder, filename_en)
                if en_link is None:
                    continue
                text_fr, text_en = extract_both_languages_from_two_files(fr_link, en_link, clf)

            # low-quality text criteria
            len_fr, len_en = len(text_fr), len(text_en)
            if len_fr == 0 or len_en == 0:
                continue
            if len_fr < min_char or len_en < min_char:
                continue
            # Same test as len ratio > max_ratio in either direction, written
            # without division.
            if len_fr > max_ratio * len_en or len_en > max_ratio * len_fr:
                continue

            list_of_correlated_text = correlate_and_clean_text(text_fr, text_en, pub_number, sentence_encoder)
            matched_data.extend(list_of_correlated_text)

    return pd.DataFrame(matched_data, columns=['pub_number', 'fr', 'en'])


In [4]:
# Models: multilingual sentence encoder for similarity, classifier for fr/en routing.
sentence_encoder = SentenceTransformer(
    'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
)
language_classifier = LanguageClassifier()

# Long-running step: aligns sentences for every row of the correlation table.
matched_df = process_all_rows(
    fr_eng_correlation_df=fr_eng_correlation_df,
    parsed_docs_folder=parsed_docs_folder,
    clf=language_classifier,
    sentence_encoder=sentence_encoder,
)

|████████████████████████████████████████| 9061/9061 [100%] in 4:10:41.9 (0.60/s)

In [5]:
# Persist the aligned sentence pairs so the multi-hour matching run above
# does not have to be repeated; written to the current working directory.
matched_df.to_pickle("matched_data.pickle")