# Postprocess backtranslations

In [473]:
import pandas as pd
from pathlib import Path
import spacy
import data_preparation_resources as dpr

In [None]:
# Fetch the backtranslations CSV from the GPU cluster into the current
# working directory (the trailing "." is the rsync destination).
# rsync flags: -a archive mode, -v verbose, -z compress during transfer,
# -e ssh use ssh as the remote shell.
# NOTE(review): user name, host and remote path are hard-coded, so this cell
# only works for someone with that account and ssh access to the cluster.
! rsync -avze ssh jlasse@nvcluster:/home/jlasse/GermanHass/strategy_analysis/roberta/backtranslations.csv .

In [None]:
# Load the fetched backtranslations from the working directory.
# tweet_id is read as a string instead of being parsed as a number.
backtranslations = pd.read_csv("backtranslations.csv", dtype={"tweet_id": str})

In [18]:
# Reshape from wide format (one column per translation language) to long
# format: one row per (tweet, translation language) pair. The former column
# name ends up in "translation_language", its cell value in "translation".
backtranslations = pd.melt(
    backtranslations,
    id_vars=["tweet_id", "text", "strategy_human_label"],
    var_name="translation_language",
    value_name="translation",
)

# Drop hard translation duplicates (identical translation text for the same
# tweet), then rows that have no translation at all.
# NOTE(review): reset_index() is called without drop=True, so the pre-filter
# row index is kept as an extra "index" column and is written to the CSVs
# saved later on -- confirm that this column is wanted.
backtranslations = (
    backtranslations
    .drop_duplicates(subset=["tweet_id", "translation"])
    .dropna(subset=["translation"])
    .reset_index()
)

In [21]:
def calculate_similarity(row):
    """Score how similar a backtranslation is to its original text.

    Both strings are run through the module-level ``nlp`` pipeline and the
    resulting documents are compared with ``Doc.similarity``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            ``"text"`` (original) and ``"translation"`` (backtranslation).

    Returns:
        The similarity score of the original document compared against the
        translated document, as returned by ``Doc.similarity``.
    """
    original_doc, translated_doc = nlp(row["text"]), nlp(row["translation"])
    return original_doc.similarity(translated_doc)

# Load spaCy's large German pipeline ("de_core_news_lg") once at module level;
# calculate_similarity() looks `nlp` up as a global on every call.
nlp = spacy.load("de_core_news_lg")

In [22]:
# Score every (original text, backtranslation) row; axis=1 hands each row to
# calculate_similarity, which runs the spaCy pipeline on both strings.
backtranslations["similarity"] = backtranslations.apply(
    calculate_similarity,
    axis=1,
)

  return doc1.similarity(doc2)


In [70]:
# Save the raw (not yet similarity-filtered) backtranslations together with
# their similarity scores as a gzip-compressed CSV.
# Fix: the dst string literal was missing its closing quote, which made this
# cell a SyntaxError; it now matches the dst used for the filtered file.
dst = "../../../data"
fname = "backtranslations_raw_af-ar-cs-da-en-eo-es-et-fi-fr-he-hu-it-no-pl-uk-vi.csv.gzip"
# compression is passed explicitly because it is not inferred from the
# non-standard ".gzip" suffix.
backtranslations.to_csv(Path(dst, fname), index=False, compression="gzip")

In [81]:
# Drop hard translation duplicates: identical translation text for one tweet.
backtranslations_filtered = backtranslations.drop_duplicates(
    subset=["translation", "tweet_id"]
)

# Throw away translations that are too similar and too dissimilar to the
# original. The cutoffs are quantile levels of the similarity distribution,
# not similarity values themselves.
lower_cutoff = 0.1
upper_cutoff = 0.9
quant_lower, quant_upper = (
    backtranslations_filtered["similarity"].quantile([lower_cutoff, upper_cutoff])
)
print(f"{lower_cutoff*100:1.0f}th quantile similarity cutoff: {quant_lower:1.4f}")
print(f"{upper_cutoff*100:1.0f}th quantile similarity cutoff: {quant_upper:1.4f}")

# Keep only rows strictly between the two quantile values; rows sitting
# exactly on a cutoff are discarded as well.
backtranslations_filtered = (
    backtranslations_filtered
    .loc[
        lambda frame: frame["similarity"].gt(quant_lower)
        & frame["similarity"].lt(quant_upper)
    ]
    .reset_index(drop=True)
)

10th quantile similarity cutoff: 0.6023
90th quantile similarity cutoff: 0.9188


In [83]:
# Alphabetically sorted list of the translation-language column names that
# are still present after filtering.
langs = sorted(backtranslations_filtered["translation_language"].unique())
langs

['translation_af',
 'translation_ar',
 'translation_cs',
 'translation_da',
 'translation_en',
 'translation_eo',
 'translation_es',
 'translation_et',
 'translation_fi',
 'translation_fr',
 'translation_he',
 'translation_hu',
 'translation_it',
 'translation_no',
 'translation_pl',
 'translation_uk',
 'translation_vi']

In [497]:
# Load the human annotations of every labelled sample file named in
# dpr.label_pairs (first element of each pair is the file stem) and stack them
# into a single frame keyed by tweet_id.
# NOTE(review): src climbs four directory levels while the dst used for the
# saved CSVs climbs three -- confirm both point at the intended data folder.
src = "../../../../data/labelled_samples_with_ids"
cols = ["tweet_id", "[STYLE]", "[TOPIC]", "[GOAL]", "[SPEECH][hate]", "[SPEECH][group]"]
label_frames = []
for pair in dpr.label_pairs:
    df = pd.read_csv(
        Path(src, pair[0] + ".csv"),
        dtype={"tweet_id":str},
        delimiter=";",
        usecols=cols
    )
    # Rows without a [GOAL] annotation fall back to their [TOPIC] value.
    df["[GOAL]"] = df["[GOAL]"].fillna(df["[TOPIC]"])
    label_frames.append(df)

# Concatenate once instead of growing a DataFrame inside the loop: that was
# quadratic in the number of files and started from an empty frame, which
# pandas deprecates in concat. ignore_index=True yields the same fresh
# 0..n-1 index the former reset_index(drop=True) produced.
human_labels = (
    pd.concat(label_frames, ignore_index=True)
    if label_frames
    else pd.DataFrame()  # no label files listed: keep the original empty result
)
human_labels = human_labels.rename(columns={
    "[STYLE]":"strategy_human_label",
    "[TOPIC]":"group_human_label",
    "[GOAL]":"goal_human_label",
    "[SPEECH][hate]":"speech_hate_human_label",
    "[SPEECH][group]":"speech_target_human_label"
})

In [499]:
# Attach the human labels to every filtered backtranslation. A left join keeps
# all backtranslation rows, including tweets without a label entry.
# NOTE(review): "strategy_human_label" exists in both frames and is not a join
# key, so the result carries "strategy_human_label_x" / "_y" instead of a
# single column -- confirm this is intended. Duplicate tweet_ids in
# human_labels would also multiply rows here.
backtranslations_filtered = backtranslations_filtered.merge(
    human_labels,
    how="left",
    on="tweet_id",
)

In [500]:
# Write the filtered, label-enriched backtranslations as a gzip-compressed
# CSV without the DataFrame index.
dst = "../../../data"
fname = "backtranslations_filtered_af-ar-cs-da-en-eo-es-et-fi-fr-he-hu-it-no-pl-uk-vi.csv.gzip"
backtranslations_filtered.to_csv(
    Path(dst) / fname,
    compression="gzip",
    index=False,
)