In [1]:
import json
import os
import random
import re

import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from alive_progress import alive_bar
from sentence_transformers import SentenceTransformer, util

from language_classifier.language_classifier import LanguageClassifier

# formatting: one-decimal floats, no column limit, up to 100 rows,
# and cell text truncated at 100 characters
for _option, _value in (
    ('display.float_format', '{:.1f}'.format),
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.max_colwidth', 100),
):
    pd.set_option(_option, _value)

# import data

In [2]:
# input locations: folder searched for parsed JSON files, and the FR/EN correlation CSV
parsed_docs_folder = os.path.join("..", "ParsedPublications")
fr_eng_correlation_csv = "fr_eng_correlation_data.csv"

fr_eng_correlation_df = pd.read_csv(fr_eng_correlation_csv)

# weblinks for previewing / testing (read by preview_publication)
weblinks_df = fr_eng_correlation_df.copy()[
    ['pub_number', 'nom', 'name', 'url_fr', 'url_en', 'file_url_fr', 'file_url_en']
]

# simplified correlation table: publication number plus the two filenames
fr_eng_correlation_df = fr_eng_correlation_df.loc[:, ['pub_number', 'filename_fr', 'filename_en']]

# helper functions

In [3]:
# DATA CLEANING FUNCTIONS

def clean_text(text, skip_cleaning=False):
    """Normalise raw text by blanking unsupported characters and collapsing whitespace.

    Args:
        text: The string to clean.
        skip_cleaning: When True, ``text`` is returned untouched.

    Returns:
        The cleaned string: every character that is not an ASCII letter or
        digit, a Latin-1 accented letter, or one of ``. , ; : ! ? ( ) ' " -``
        is replaced by a space, runs of whitespace are collapsed to a single
        space, and the result is stripped. Note that this also removes
        newlines, including blank-line paragraph breaks.
    """
    if skip_cleaning:
        return text

    # The pattern matches what is NOT allowed (negated character class).
    disallowed = r"[^a-zA-ZÀ-ÖØ-öø-ÿ0-9.,;:!?()'\"-]"
    spaced = re.sub(disallowed, ' ', text)
    return re.sub(r'\s+', ' ', spaced).strip()


def get_files_for_publication(pub_number, fr_eng_correlation_df):
    """Look up the French and English filenames recorded for a publication.

    Args:
        pub_number: Value compared against the ``pub_number`` column.
        fr_eng_correlation_df: DataFrame with ``pub_number``, ``filename_fr``
            and ``filename_en`` columns.

    Returns:
        ``(filename_fr, filename_en)`` taken from the first matching row, or
        ``(None, None)`` when no row matches.
    """
    matches = fr_eng_correlation_df.loc[fr_eng_correlation_df['pub_number'] == pub_number]
    if matches.empty:
        return None, None

    # Only the first match is used, even if several rows share the number.
    return matches['filename_fr'].values[0], matches['filename_en'].values[0]


def get_json_file_link(parsed_docs_folder, pdf_filename):
    """Find the parsed JSON file that corresponds to a PDF filename.

    The JSON file is expected to be named ``<pdf_filename>.json`` and may live
    anywhere below ``parsed_docs_folder``; the first match found by
    ``os.walk`` is returned.

    Args:
        parsed_docs_folder: Root directory to search recursively.
        pdf_filename: Name of the PDF (must end with ``.pdf``).

    Returns:
        The path to the JSON file, or ``None`` when ``pdf_filename`` is not a
        string ending in ``.pdf`` or no matching file exists.
    """
    # Empty CSV cells are read by pandas as NaN (a float), which has no
    # .endswith; treat any non-string as "no file" instead of crashing.
    if not isinstance(pdf_filename, str) or not pdf_filename.endswith(".pdf"):
        return None

    json_filename = pdf_filename + ".json"
    for root, _, files in os.walk(parsed_docs_folder):
        if json_filename in files:
            return os.path.join(root, json_filename)
    return None


def extract_text_from_single_file(json_file, target_language, clf, skip_cleaning=False):
    """Collect the text blocks of one parsed JSON file that are in a given language.

    The file's ``text`` field is passed through ``clean_text``, split into
    blocks at sentence terminators (``.``, ``?``, ``!`` followed by
    whitespace, unless preceded by ``;`` or ``,``) and at blank lines, and
    each block between 10 and 500 characters long is kept when
    ``clf.classify(block)`` equals ``target_language``.

    Args:
        json_file: Path to a UTF-8 JSON file containing a ``text`` key.
        target_language: Label compared against the classifier's output.
        clf: Object exposing ``classify(block)``.
        skip_cleaning: Forwarded to ``clean_text``.

    Returns:
        The kept blocks, each suffixed with ``'. '``, joined by single spaces.

    Raises:
        KeyError: If the JSON document has no ``text`` key.
    """
    min_block_length = 10
    max_block_length = 500

    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    if 'text' not in data:
        raise KeyError(f"The key 'text' is missing in the JSON file: {json_file}")

    full_text = clean_text(data['text'], skip_cleaning)
    stripped_blocks = (piece.strip() for piece in re.split(r'(?<![;,])[.?!]\s|\n\n', full_text))

    # The length check comes first so out-of-range blocks are never classified.
    kept = [
        piece + '. '
        for piece in stripped_blocks
        if min_block_length <= len(piece) <= max_block_length
        and clf.classify(piece) == target_language
    ]
    return " ".join(kept)


def extract_both_languages_from_two_files(json_file_fr, json_file_en, clf, skip_cleaning=False):
    """Extract French text from one parsed file and English text from another.

    Args:
        json_file_fr: Path to the JSON file searched for ``"fr"`` blocks.
        json_file_en: Path to the JSON file searched for ``"en"`` blocks.
        clf: Classifier forwarded to ``extract_text_from_single_file``.
        skip_cleaning: Forwarded to ``extract_text_from_single_file``.

    Returns:
        ``(text_fr, text_en)``.
    """
    text_fr = extract_text_from_single_file(json_file_fr, "fr", clf, skip_cleaning)
    text_en = extract_text_from_single_file(json_file_en, "en", clf, skip_cleaning)
    return text_fr, text_en


def extract_both_languages_from_single_file(json_file, clf, skip_cleaning=False):
    """Split one bilingual parsed JSON file into its French and English text.

    The file's ``text`` field is passed through ``clean_text``, split into
    blocks at sentence terminators (``.``, ``?``, ``!`` followed by
    whitespace, unless preceded by ``;`` or ``,``) and at blank lines. Each
    block between 10 and 500 characters long is classified once and routed to
    the French or English output; blocks with any other label are dropped.

    Args:
        json_file: Path to a UTF-8 JSON file containing a ``text`` key.
        clf: Object exposing ``classify(block)``; the labels ``"fr"`` and
            ``"en"`` are the ones acted on.
        skip_cleaning: Forwarded to ``clean_text``.

    Returns:
        ``(text_fr, text_en)``: the kept blocks of each language, each
        suffixed with ``'. '`` and joined by single spaces.

    Raises:
        KeyError: If the JSON document has no ``text`` key.
    """
    min_block_length = 10
    max_block_length = 500

    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    if 'text' not in data:
        raise KeyError(f"The key 'text' is missing in the JSON file: {json_file}")

    full_text = clean_text(data['text'], skip_cleaning)
    text_blocks = re.split(r'(?<![;,])[.?!]\s|\n\n', full_text)
    text_fr, text_en = [], []

    for block in text_blocks:
        block = block.strip()
        if len(block) < min_block_length or len(block) > max_block_length:
            continue

        # Classify once per block: avoids a second model call for every
        # non-French block and guarantees both branches see the same label.
        language = clf.classify(block)
        if language == "fr":
            text_fr.append(block + '. ')
        elif language == "en":
            text_en.append(block + '. ')

    return " ".join(text_fr), " ".join(text_en)


def create_sentences(text_fr, text_en):
    """Split French and English text into lists of non-empty sentences.

    Text is split at ``.``, ``?`` or ``!`` followed by whitespace (unless the
    terminator is preceded by ``;`` or ``,``) and at blank lines. The
    terminator itself is consumed by the split.

    Args:
        text_fr: French text.
        text_en: English text.

    Returns:
        ``(sentences_fr, sentences_en)``: two lists of stripped sentences.
        Fragments that are empty or whitespace-only are dropped.
    """
    def split_sentences(text):
        # Strip BEFORE filtering: a whitespace-only fragment would otherwise
        # pass the emptiness check and end up as an empty sentence.
        fragments = (piece.strip() for piece in re.split(r'(?<![;,])[.?!]\s|\n\n', text))
        return [piece for piece in fragments if piece]

    return split_sentences(text_fr), split_sentences(text_en)


def create_similarity_matrix(sentences_fr, sentences_en, sentence_encoder):
    """Compute pairwise cosine similarities between French and English sentences.

    Args:
        sentences_fr: French sentences to embed.
        sentences_en: English sentences to embed.
        sentence_encoder: Object exposing
            ``encode(sentences, convert_to_tensor=True)``.

    Returns:
        The result of ``util.pytorch_cos_sim`` on the French and English
        embeddings, in that argument order.
    """
    # French is encoded first, then English, matching the argument order below.
    embeddings_fr, embeddings_en = (
        sentence_encoder.encode(sentences, convert_to_tensor=True)
        for sentences in (sentences_fr, sentences_en)
    )
    return util.pytorch_cos_sim(embeddings_fr, embeddings_en)


# TODO: process_all_rows replaced with debug version (below) for this analysis


def text_from_coordinates(aligned_pairs, sentences_fr, sentences_en, pub_number):
    """Turn aligned index pairs into ``(pub_number, french, english)`` tuples.

    Args:
        aligned_pairs: Iterable of ``(i, j)`` pairs, where ``i`` indexes
            ``sentences_fr`` and ``j`` indexes ``sentences_en``.
        sentences_fr: French sentences.
        sentences_en: English sentences.
        pub_number: Identifier stored as the first element of every tuple.

    Returns:
        A list with one tuple per pair, in the order the pairs were given.
    """
    return [
        (pub_number, sentences_fr[fr_index], sentences_en[en_index])
        for fr_index, en_index in aligned_pairs
    ]


def correlate_and_clean_text(text_fr, text_en, pub_number, sentence_encoder):
    """Align the sentences of a French/English text pair.

    Args:
        text_fr: French text.
        text_en: English text.
        pub_number: Identifier attached to every aligned pair.
        sentence_encoder: Encoder forwarded to ``create_similarity_matrix``.

    Returns:
        A list of ``(pub_number, french_sentence, english_sentence)`` tuples
        for the pairs selected by ``align_sentences``.
    """
    sentences_fr, sentences_en = create_sentences(text_fr, text_en)
    # TODO: updated for EDA -- align_sentences returns (dp, score, pairs);
    # only the pairs (third element) are needed here.
    aligned_pairs = align_sentences(
        create_similarity_matrix(sentences_fr, sentences_en, sentence_encoder)
    )[2]

    return text_from_coordinates(aligned_pairs, sentences_fr, sentences_en, pub_number)


# TODO: process_all_rows replaced with debug version (below) for this analysis


# PREVIEW AND DEBUGGING

def preview_publication(pub_number):
    """Display the web-link record for a publication, transposed.

    Args:
        pub_number: A publication number, a one-row DataFrame with a
            ``pub_number`` column, or a Series whose first value is the
            publication number.

    Returns:
        ``None`` in all cases. The matching rows of the notebook-level
        ``weblinks_df`` are shown with ``display``; if the number cannot be
        extracted from a DataFrame/Series, or the lookup fails, nothing is
        displayed.
    """
    # The extraction below fails with KeyError (no 'pub_number' column) or
    # IndexError (empty Series), so those are caught alongside ValueError to
    # make the intended "return None" path reachable.
    if isinstance(pub_number, pd.DataFrame) and pub_number.shape[0] == 1:
        try:
            pub_number = pub_number['pub_number'].values[0]
        except (KeyError, IndexError, ValueError):
            return None
    elif isinstance(pub_number, pd.Series):
        try:
            pub_number = pub_number.values[0]
        except (KeyError, IndexError, ValueError):
            return None

    try:
        output_df = weblinks_df[weblinks_df.pub_number == pub_number].T
    except Exception as e:
        # Best-effort preview helper: report the problem and carry on.
        print(e)
        return None

    display(output_df)


def preview_publication_by_row(row_n):
    """Display row ``row_n`` of ``matched_df`` and then its web-link record.

    Args:
        row_n: Positional index into the notebook-level ``matched_df``.
    """
    selected_row = matched_df.iloc[row_n]
    display(pd.DataFrame(selected_row))
    preview_publication(selected_row.pub_number)
    
    
def plot_similarity_heatmap(similarity_matrix, figsize=(10, 10), low_similarity=0.6, high_similarity=0.8, matched_pairs=None, cmap=None, norm=None, square=True):
    """Draw a similarity matrix as a heatmap, optionally overlaying a path.

    Args:
        similarity_matrix: 2-D data passed to ``sns.heatmap``.
        figsize: Figure size passed to ``plt.subplots``.
        low_similarity: Lower band boundary of the default colour scheme.
        high_similarity: Upper band boundary of the default colour scheme.
        matched_pairs: Optional sequence of 2-tuples drawn as a red line; the
            first element of each tuple is used as x, the second as y.
        cmap: Optional colormap. When omitted, a three-colour map
            (black / darkblue / deepskyblue) with boundaries
            ``[0, low_similarity, high_similarity, 1]`` is used and any
            ``norm`` argument is ignored.
        norm: Optional normalisation, used only together with ``cmap``.
        square: Passed to ``sns.heatmap``.
    """
    fig, ax = plt.subplots(figsize=figsize)

    heatmap_kwargs = {'cbar': False, 'square': square, 'linewidths': 0, 'ax': ax}
    if not cmap:
        # Default scheme: three discrete similarity bands.
        cmap = mcolors.ListedColormap(["black", "darkblue", "deepskyblue"])
        norm = mcolors.BoundaryNorm([0, low_similarity, high_similarity, 1], cmap.N)
    heatmap_kwargs['cmap'] = cmap
    # Only forward a norm when there is one, so seaborn's own default
    # handling applies otherwise.
    if norm:
        heatmap_kwargs['norm'] = norm

    sns.heatmap(similarity_matrix, **heatmap_kwargs)

    if matched_pairs:
        # NOTE(review): heatmap rows run along the y axis; if matched_pairs
        # holds (row, column) indices the overlay is transposed -- confirm
        # against the caller.
        x_coords, y_coords = zip(*matched_pairs)
        ax.plot(x_coords, y_coords, marker='o', linestyle='-', color='red', markersize=4, linewidth=1.5)

    ax.tick_params(left=False, bottom=False)

    plt.tight_layout()
    plt.show()
    

def display_similarity_matrix(similarity_matrix, inverse=False, figsize=(10, 10)):
    """Plot a similarity matrix with the default heatmap settings.

    Args:
        similarity_matrix: Matrix to plot; must expose ``.T`` when
            ``inverse`` is True.
        inverse: When True, the transposed matrix is plotted instead.
        figsize: Figure size forwarded to ``plot_similarity_heatmap``.
    """
    matrix_to_plot = similarity_matrix.T if inverse else similarity_matrix
    plot_similarity_heatmap(matrix_to_plot, figsize=figsize)


def text_from_row(row_n):
    """Return ``(text_fr, text_en)`` for positional row ``row_n`` of ``matched_df``."""
    selected_row = matched_df.iloc[row_n]
    return selected_row['text_fr'], selected_row['text_en']


def text_from_pub_number(pub_number):
    """Return ``(text_fr, text_en)`` for the first ``matched_df`` row with this ``pub_number``."""
    matches = matched_df[matched_df.pub_number == pub_number]
    return matches.text_fr.values[0], matches.text_en.values[0]



# TODO: ALTERNATE VERSION OF ALIGN SENTENCE FOR EDA
def align_sentences(sim_matrix, threshold=0.7):
    """Find an order-preserving sentence alignment by dynamic programming.

    Similarities below ``threshold`` are zeroed. A score table is then filled
    in which each cell holds the best total weight obtainable either by
    pairing the current row with the current column or by skipping one of
    them, and the chosen pairs are recovered by walking back from the
    bottom-right corner.

    Args:
        sim_matrix: 2-D similarity matrix with a ``.shape`` of ``(n, m)``.
        threshold: Minimum similarity for a pair to contribute to the score.

    Returns:
        ``(dp, score, aligned_pairs)``: the ``(n+1, m+1)`` float32 score
        table, its bottom-right value, and the list of ``(row, column)``
        index pairs in increasing order.
    """
    n_rows, n_cols = sim_matrix.shape

    # Pairs under the threshold are worth nothing.
    weights = np.where(sim_matrix >= threshold, sim_matrix, 0.0)

    dp = np.zeros((n_rows + 1, n_cols + 1), dtype=np.float32)
    for row in range(1, n_rows + 1):
        for col in range(1, n_cols + 1):
            dp[row, col] = max(
                dp[row - 1, col - 1] + weights[row - 1, col - 1],  # pair row with col
                dp[row - 1, col],                                  # skip this row
                dp[row, col - 1],                                  # skip this column
            )

    # Backtrack; on ties a row skip is preferred, then a column skip.
    aligned_pairs = []
    row, col = n_rows, n_cols
    while row > 0 and col > 0:
        here = dp[row, col]
        if np.isclose(here, dp[row - 1, col]):
            row -= 1
            continue
        if np.isclose(here, dp[row, col - 1]):
            col -= 1
            continue
        if weights[row - 1, col - 1] > 0:
            aligned_pairs.append((row - 1, col - 1))
        row -= 1
        col -= 1

    aligned_pairs.reverse()
    return dp, dp[n_rows, n_cols], aligned_pairs


# TODO: ALTERNATE VERSION OF PROCESS ALL ROWS FOR EDA
def process_all_rows_debug(fr_eng_correlation_df, parsed_docs_folder, clf):
    """Extract French and English text for every publication that can be resolved.

    Rows are skipped when both filenames are ``"WITHDRAWN"``, when either
    filename is not a string (e.g. an empty CSV cell read as NaN), or when a
    parsed JSON file cannot be found. If both filenames are equal the single
    file is treated as bilingual; otherwise French is read from the French
    file and English from the English file. Cleaning is skipped in both cases
    (``skip_cleaning=True``).

    Args:
        fr_eng_correlation_df: DataFrame with ``pub_number``, ``filename_fr``
            and ``filename_en`` columns.
        parsed_docs_folder: Root folder searched for parsed JSON files.
        clf: Language classifier forwarded to the extraction helpers.

    Returns:
        A DataFrame with columns ``pub_number``, ``text_fr`` and ``text_en``
        (always present, even when no row could be processed).
    """
    matched_data = []

    for _, row in fr_eng_correlation_df.iterrows():
        pub_number = row['pub_number']
        filename_fr, filename_en = row['filename_fr'], row['filename_en']

        if filename_fr == "WITHDRAWN" and filename_en == "WITHDRAWN":
            continue

        # Missing filenames arrive as NaN floats; skip them like any other
        # unresolvable row instead of failing inside the file lookup.
        if not isinstance(filename_fr, str) or not isinstance(filename_en, str):
            continue

        fr_link = get_json_file_link(parsed_docs_folder, filename_fr)
        if fr_link is None:
            continue

        if filename_fr == filename_en:
            text_fr, text_en = extract_both_languages_from_single_file(fr_link, clf, True)
        else:
            en_link = get_json_file_link(parsed_docs_folder, filename_en)
            if en_link is None:
                continue
            text_fr, text_en = extract_both_languages_from_two_files(fr_link, en_link, clf, True)

        matched_data.append({'pub_number': pub_number, 'text_fr': text_fr, 'text_en': text_en})

    # Explicit columns keep the schema stable when no rows matched, so
    # downstream code that reads 'text_fr' / 'text_en' does not hit a KeyError.
    return pd.DataFrame(matched_data, columns=['pub_number', 'text_fr', 'text_en'])

In [4]:
sentence_encoder = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
language_classifier = LanguageClassifier()

# Extract the texts, add length diagnostics, then re-attach the filenames.
matched_df = (
    process_all_rows_debug(fr_eng_correlation_df, parsed_docs_folder, language_classifier)
    .assign(
        len_fr=lambda df: df['text_fr'].str.len(),
        len_en=lambda df: df['text_en'].str.len(),
        # ratio of the longer text to the shorter one; 999 is used when the
        # shorter text is empty
        len_diff=lambda df: np.where(
            df[['len_fr', 'len_en']].min(axis=1) == 0,
            999,
            df[['len_fr', 'len_en']].max(axis=1) / df[['len_fr', 'len_en']].min(axis=1),
        ),
        len_min=lambda df: df[['len_fr', 'len_en']].min(axis=1),
    )
    .merge(fr_eng_correlation_df, on='pub_number')
)

# save for EDA
matched_df.to_pickle('eda_df.pickle')