In [None]:
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np
# Language code; it is interpolated into every input/output file name below.
lang = "CN"
# Width of the time bins that word offsets are snapped to
# (presumably the fMRI repetition time, in the same unit as `offset` - TODO confirm).
tr = 2
# Number of extra `tr`-wide shifts applied when building the lagged GloVe bag of words.
lag = 2
# Folder holding the per-language annotation files.
annotation_path = Path("annotation") / lang

In [None]:
# Load the word-level table (first CSV column is the index) and keep only the
# columns used by the sentence-building cells.
word_info_path = annotation_path / f"lpp{lang}_word_information.csv"
df = pd.read_csv(word_info_path, index_col=0)
df = df[["word", "offset", "section"]]

# sBERT

In [None]:
# df["time"] = (df.offset + tr / 2) // tr * tr + tr / 2
# res = []
# for lag in range(lag + 1):
#     res.append(df[df.time >= 0].copy())
#     df["time"] = df.time + tr
# df = pd.concat(res)
# df = df.sort_values("offset").groupby(["section", "time"]).word.apply(lambda x: x.str.cat(sep=" ")).str.replace("' ", "'").reset_index(name="sentence")

In [None]:
def _join_words(words):
    """Concatenate the words of one (section, time) bin with single spaces."""
    return words.str.cat(sep=" ")


def _accumulate(sentences):
    """Turn per-bin text into the running text of the section up to each bin."""
    return (sentences + " ").cumsum().str.strip()


# Snap every word offset down to the start of its `tr`-wide bin; words whose bin
# would start before zero are folded into the bin at time 0.
df["time"] = df.offset // tr * tr
df["time"] = df["time"].mask(df["time"] < 0, 0)

# One row per (section, time) bin holding the words that fall into it ...
words_per_bin = df.sort_values(["section", "time"]).groupby(["section", "time"]).word
df = words_per_bin.apply(_join_words).reset_index(name="sentence")
# ... then make each row cumulative within its section.
df["sentence"] = df.groupby("section").sentence.transform(_accumulate)

# Re-attach apostrophes and contractions that the word table stores as separate
# tokens. The patterns ('s, 've, 't, I) are English-style and are applied in order.
for old, new in (("' ", "'"), (" s ", "'s "), (" i ", " I "), (" ve ", "'ve "), (" t ", "'t ")):
    df["sentence"] = df.sentence.str.replace(old, new)
df["duration"] = tr

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Run the model on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the positions selected by ``attention_mask``.

    Args:
        model_output: Sequence whose first element holds the token embeddings;
            the mask is broadcast to its shape and the sum runs over dim 1.
        attention_mask: Mask with one value per token position (one dimension
            fewer than the embeddings); positions with 0 are ignored.

    Returns:
        Tensor with dim 1 reduced: the mask-weighted mean of the embeddings.
        A row whose mask is all zeros yields zeros, because the divisor is
        clamped to 1e-9 rather than being allowed to reach zero.
    """
    hidden_states = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * weights).sum(dim=1)
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_counts

batch_size = 128
model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
# Length cap used only when counting tokens, so that long cumulative sentences
# are counted (almost) in full instead of being cut at the model's own limit.
count_max_length = 4096

# Loading the tokenizer and model does not depend on the section, so do it once
# instead of once per loop iteration. `truncation_side="left"` drops the oldest
# tokens when a cumulative sentence is too long, keeping the most recent words.
tokenizer = AutoTokenizer.from_pretrained(model_name, truncation_side="left")
model = AutoModel.from_pretrained(model_name).to(device)

embeddings = []
for section in tqdm(df.section.unique()):
    sentences = list(df[df.section == section].sentence)
    # Model input: truncated to the tokenizer's default maximum length.
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt').to(device)

    # Token count of every cumulative sentence under the larger cap. An explicit
    # `max_length` is used rather than mutating `tokenizer.model_max_length`,
    # so the shared tokenizer keeps its settings for the next section.
    cumulative_n_tokens = tokenizer(
        sentences, padding=True, truncation=True, max_length=count_max_length, return_tensors='pt'
    )["attention_mask"].sum(dim=1)
    # Tokens added by each row relative to the previous one (first row keeps its
    # full count). Done in torch only: mixing a torch tensor with the numpy
    # array returned by `np.roll` is not reliable across torch versions.
    n_tokens = cumulative_n_tokens.clone()
    n_tokens[1:] -= cumulative_n_tokens[:-1]
    n_tokens = n_tokens.to(device)

    total_samples = len(encoded_input["input_ids"])
    # Column index of every token position, used to build the pooling mask.
    positions = torch.arange(encoded_input["input_ids"].shape[1], device=device)

    with torch.no_grad():
        for start_idx in tqdm(range(0, total_samples, batch_size), leave=False):
            end_idx = min(start_idx + batch_size, total_samples)
            batch_input = {k: v[start_idx:end_idx] for k, v in encoded_input.items()}
            model_output = model(**batch_input)
            n_ones = batch_input["attention_mask"].sum(dim=1)
            # Number of leading positions to drop so that only the newly added
            # tokens are pooled; clamped so a negative value cannot mis-slice.
            n_ones_to_remove = (n_ones - n_tokens[start_idx:end_idx]).clamp(min=0)
            # Separate pooling mask: the model still attended to the full
            # context above, and `encoded_input` itself is left unmodified.
            keep = positions.unsqueeze(0) >= n_ones_to_remove.unsqueeze(1)
            pooling_mask = batch_input["attention_mask"] * keep
            embeddings.append(mean_pooling(model_output, pooling_mask).cpu().numpy())
embeddings = np.concatenate(embeddings)

In [None]:
# Name the embedding dimensions sBERT_0 ... sBERT_{d-1} and append them to the
# sentence table column-wise (rows are matched on the index).
sbert_columns = [f"sBERT_{i}" for i in range(embeddings.shape[1])]
embeddings = pd.DataFrame(embeddings, columns=sbert_columns)
df = pd.concat([df, embeddings], axis=1)

In [None]:
# Persist the sBERT sentence embeddings next to the other annotation files.
sbert_output_path = annotation_path / f"lpp{lang}_sentence_embeddings_sBERT.parquet"
df.to_parquet(sbert_output_path, index=False)

# LASER

In [None]:
# Start again from the raw word table: after the sBERT section `df` holds one
# row per sentence and no longer has the `offset` / `word` columns used below,
# so this cell would fail when the notebook is run top to bottom.
df = pd.read_csv(annotation_path / f"lpp{lang}_word_information.csv", index_col=0)[["word", "offset", "section"]]

# Snap every word offset down to the start of its `tr`-wide bin; words whose bin
# would start before zero are folded into the bin at time 0.
df["time"] = df.offset // tr * tr
df.loc[df.time < 0, "time"] = 0
# One row per (section, time) bin holding the space-joined words of that bin.
df = df.sort_values(["section", "time"]).groupby(["section", "time"]).word.apply(lambda x: x.str.cat(sep=" ")).reset_index(name="sentence")
# Make each row cumulative: the text of the section from its start up to this bin.
df["sentence"] = df.groupby("section").sentence.transform(lambda x: (x + " ").cumsum().str.strip())
# Re-attach apostrophes and contractions stored as separate tokens (English-style patterns).
df["sentence"] = df.sentence.str.replace("' ", "'").str.replace(" s ", "'s ").str.replace(" i ", " I ").str.replace(" ve ", "'ve ").str.replace(" t ", "'t ")
df["duration"] = tr

In [None]:
from laser_encoders import LaserEncoderPipeline
batch_size = 32
# Maps the notebook's language codes to the language names handed to LASER.
lang_corresp = {"FR": "french", "EN": "english", "CN": "chinese"}
# Pick the encoder language from `lang`; it used to be fixed to "french"
# regardless of the configured language.
encoder = LaserEncoderPipeline(lang=lang_corresp[lang], batch_size=batch_size)
results = []
for section in tqdm(df.section.unique()):
    sentences = df[df.section == section].sentence
    # NOTE(review): the result is indexed by "encoder_out" and
    # "encoder_padding_mask" below - presumably per-token outputs and a padding
    # mask with sentences on axis 0 and tokens on axis 1; confirm against the
    # installed laser_encoders build.
    embeddings = encoder.encode_sentences(sentences=sentences)
    # Shift the padding mask down by one sentence and invert it, which marks the
    # positions that were already non-padding in the previous (shorter)
    # cumulative sentence. The first sentence has no predecessor, so nothing is
    # marked for it.
    rolled_mask = ~np.roll(embeddings["encoder_padding_mask"], shift=1, axis=0)
    rolled_mask[0] = False
    # Marked positions are set to -inf so the max over axis 1 ignores them.
    embeddings["encoder_out"][rolled_mask] = -np.inf
    results.append(embeddings["encoder_out"].max(axis=1))
features = ["LASER_" + str(i) for i in range(results[0].shape[1])]
embeddings = pd.DataFrame(np.concatenate(results), columns=features)
# Append the LASER columns to the sentence table (rows are matched on the index).
df = pd.concat([df, embeddings], axis=1)

In [None]:
# Persist the LASER sentence embeddings next to the other annotation files.
laser_output_path = annotation_path / f"lpp{lang}_sentence_embeddings_LASER.parquet"
df.to_parquet(laser_output_path, index=False)

# Bag of Words GloVe

In [None]:
# Start again from the raw word table: after the previous sections `df` holds
# one row per sentence, so its index is not a word index and it has no
# `offset` column for the lagging step that follows.
df = pd.read_csv(annotation_path / f"lpp{lang}_word_information.csv", index_col=0)[["word", "offset", "section"]]

GloVe = pd.read_parquet(annotation_path / f"lpp{lang}_word_embeddings_GloVe.parquet")
GloVe["word_index"] = GloVe.index
# Rename the embedding columns to their "GloVeBag" names *before* merging.
# `features` lists the new names and is used to select columns after the
# merge; without the rename those names would not exist in `df`.
glove_to_bag = {col: col.replace("GloVe", "GloVeBag") for col in GloVe.columns if "GloVe" in col}
features = list(glove_to_bag.values())
GloVe = GloVe.rename(columns=glove_to_bag)
df["word_index"] = df.index
# Merge on the columns both frames share (`word_index`, plus any other common column).
df = df.merge(GloVe)

In [None]:
# Assign each word the time floor((offset + tr/2) / tr) * tr + tr/2.
df["time"] = (df.offset + tr / 2) // tr * tr + tr / 2
# Stack `lag + 1` copies of the word table, each shifted one `tr` later than the
# previous one, so every word also contributes to the following `lag` time
# points. Rows with a negative time are dropped from each copy.
# The loop variable has its own name so it does not rebind the `lag` setting.
res = []
for shift in range(lag + 1):
    res.append(df[df.time >= 0].copy())
    df["time"] = df.time + tr
df = pd.concat(res)

# Sort and group once, then derive both outputs from the same grouping.
grouped = df.sort_values("offset").groupby(["section", "time"])
# Text of each (section, time) bag: words in offset order, apostrophes re-attached.
sentences = grouped.word.apply(lambda x: x.str.cat(sep=" ")).str.replace("' ", "'").reset_index(name="sentence")
# Bag-of-words embedding: mean of the word vectors in each bag.
bag_means = grouped[features].mean().reset_index()
df = sentences.merge(bag_means)

In [None]:
# Persist the bag-of-words GloVe embeddings next to the other annotation files.
glove_bag_output_path = annotation_path / f"lpp{lang}_sentence_embeddings_GloVeBag.parquet"
df.to_parquet(glove_bag_output_path, index=False)