# Remove problematic samples

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import json
import pandas as pd
import sys
from langdetect import detect_langs
import spacy

In [3]:
nlp = spacy.load("de_core_news_lg")

In [4]:
sys.path.insert(0, '../')

from utils import load_results

## Remove Empty Samples

In [None]:
indices_limit_json_file_path = f"indices_limit/train_2024-06-23_indices_limit.json"
with open(indices_limit_json_file_path, "r", encoding="utf-8") as f:
    indices_limit = json.load(f)
indices_limit

In [18]:
subsamples_df = pd.read_csv("subset/train_2024-06-23.csv")

In [None]:
# Find the end index of the last batch in the leading run of processed
# batches. Scanning stops at the first batch that is not marked as processed;
# if the very first batch is unprocessed, end_index stays 0.
end_index = 0
for batch_info in indices_limit:
    if not batch_info['processed']:
        break
    end_index = batch_info['end_index']
print(end_index)

In [None]:
subsamples_trimmed_df = subsamples_df.iloc[:end_index+1]
subsamples_trimmed_df.shape

In [21]:
subsamples_trimmed_df.to_csv("subset/train_2024-06-23_trimmed.csv", index=False)

## Analyse output files

Note: When generating the subset files, we did not filter out samples whose paraphrases were returned in the wrong order with respect to complexity level. After inspecting these samples, we decided to filter them out as well.

In [None]:
indices_limit_json_file_path = f"indices_limit/train_2024-06-23_indices_limit.json"
with open(indices_limit_json_file_path, "r", encoding="utf-8") as f:
    indices_limit = json.load(f)
indices_limit

In [None]:
# Collect every model response that is not a JSON object whose keys are
# exactly '1'..'5' in that order. Each record stores the sample's index and id
# (both parsed from the task's custom_id) together with the raw response text.
expected_keys = ['1', '2', '3', '4', '5']
faulty_samples = []  # one dict per faulty result
for nth_batch, batch in enumerate(indices_limit):
    print(f"Processing batch: {nth_batch}")
    if not batch['processed']:
        continue
    output_file_path = f"batch_output_files/train_2024-06-23_{nth_batch}_{batch['start_index']}_{batch['end_index']}.jsonl"
    results = load_results(output_file_path)
    for res in results:
        task_id = res['custom_id']
        # The last two '-'-separated parts of custom_id are read as <id> and <index>.
        task_id_parts = task_id.split('-')
        index = int(task_id_parts[-1])
        sample_id = task_id_parts[-2]  # avoid shadowing the builtin `id`
        result = res['response']['body']['choices'][0]['message']['content']
        try:
            result_dict = json.loads(result)
        except (ValueError, TypeError) as e:
            # ValueError covers json.JSONDecodeError (malformed JSON);
            # TypeError covers content that is not a string (e.g. None).
            print(f"Error: {e}")
            print(f"Result: {result}")
            faulty_samples.append({"index": index, "id": sample_id, "result": result})
            continue
        if not isinstance(result_dict, dict):
            # Valid JSON but not an object (e.g. a list or a bare string):
            # there are no keys to inspect, so record it as faulty instead of
            # crashing on `.keys()`.
            print(f"Result is not a JSON object for index: {index} (id: {sample_id})")
            print(f"Result: {result}")
            faulty_samples.append({"index": index, "id": sample_id, "result": result})
            continue
        keys = list(result_dict.keys())
        # List comparison is order-sensitive on purpose: paraphrases returned
        # in the wrong order are treated as faulty too.
        if keys != expected_keys:
            faulty_samples.append({"index": index, "id": sample_id, "keys": keys, "result": result})
            print(f"Wrong keys for index: {index} (id: {sample_id})")
            print(f"keys: {keys}")
            print(f"Result: {result}")



In [24]:
faulty_samples_df = pd.DataFrame(faulty_samples)

In [None]:
faulty_samples_df.shape

In [26]:
with open("failed_tasks/train_2024-06-23_failed_tasks.json", 'r', encoding='utf-8') as f:
    failed_tasks = json.load(f)

In [None]:
# Show the faulty samples whose indices are not in failed_tasks
wrong_order = faulty_samples_df[~faulty_samples_df['index'].isin(failed_tasks)]
wrong_order

In [None]:
len(wrong_order)

In [29]:
wrong_order.to_csv("failed_tasks/train_2024-06-23_wrong_order.csv", index=False)

## Remove faulty samples

In [5]:
subsamples_df = pd.read_csv("subset/train_2024-06-23_trimmed.csv")

In [6]:
subsamples_df.shape

(26665, 16)

In [None]:
subsamples_df.columns

In [32]:
with open("failed_tasks/train_2024-06-23_failed_tasks.json", 'r', encoding='utf-8') as f:
    failed_tasks = json.load(f)

In [33]:
wrong_order = pd.read_csv("failed_tasks/train_2024-06-23_wrong_order.csv")

In [34]:
# Drop every row whose index label is listed in failed_tasks, as well as every
# row whose 'id' appears in the wrong_order table.
in_failed_tasks = subsamples_df.index.isin(failed_tasks)
in_wrong_order = subsamples_df['id'].isin(wrong_order['id'])
cleaned_subsamples_df = subsamples_df[~(in_failed_tasks | in_wrong_order)]


In [None]:
cleaned_subsamples_df.shape

In [36]:
cleaned_subset_path = "cleaned_subset/train_2024-06-23_cleaned.csv"
os.makedirs(os.path.dirname(cleaned_subset_path), exist_ok=True)
cleaned_subsamples_df.to_csv(cleaned_subset_path, index=False)

## Flag bad samples

In [7]:
subsamples_df = pd.read_csv("cleaned_subset/train_2024-06-23_cleaned.csv")

In [None]:
subsamples_df.shape

### Manual detection

In [8]:
subsamples_df = pd.read_csv("cleaned_subset/train_2024-06-23_cleaned_problematic_samples.csv")

In [9]:
subsamples_df.shape 

(26337, 16)

In [None]:
# ids of problematic samples identified through manual inspection
problematic_ids = [1223221, 382224, 878765, 1050575, 642361, 60288, 2038499]

In [None]:
problematic_samples = subsamples_df[subsamples_df['id'].isin(problematic_ids)]
problematic_samples

### Spacy

In [7]:
def consecutive_oov(text, conectutive_threshold):
    """Detect a run of consecutive out-of-vocabulary (OOV) tokens in ``text``.

    The text is tokenised with the module-level spaCy pipeline ``nlp`` and
    scanned left to right. Any token that is not OOV ends the current run.

    Args:
        text: Text to inspect. Input that is not a non-empty string (e.g.
            ``None`` or the NaN pandas produces for an empty CSV cell) is
            treated as containing no tokens.
        conectutive_threshold: Number of consecutive OOV tokens at which the
            text is reported. The misspelled name is kept because callers
            pass it by keyword.

    Returns:
        ``(True, run)`` as soon as a run reaches the threshold, where ``run``
        is the list of spaCy tokens in that run. Otherwise ``(False, run)``,
        where ``run`` holds the OOV tokens at the very end of the text (empty
        if the last token is known or there are no tokens).
    """
    # Guard against missing values so one empty cell cannot abort a long pass.
    if not isinstance(text, str) or not text:
        return False, []

    doc = nlp(text)
    consecutive_list = []
    for token in doc:
        try:
            is_unknown = token.is_oov
        except Exception as e:
            # Best effort: report the problem and treat the token as a break
            # in the run, so the run length and the token list stay in sync.
            print(e)
            print(f"Error checking vocabulary for token: {token}")
            is_unknown = False

        if not is_unknown:
            consecutive_list = []
            continue

        consecutive_list.append(token)
        # The list length is the run length; no separate counter to drift.
        if len(consecutive_list) >= conectutive_threshold:
            print(f"Consecutive unknown words: {consecutive_list}")
            return True, consecutive_list
    return False, consecutive_list

In [8]:
text = '''Sebastien Le Prestre de Vauban, ein hervorragender Ingenieur und Militärtheoretiker des 17. Jahrhunderts, opponierte gegen die damals übliche Methode der wahllosengestar Abschielong von Städten, um deren Kapdtation von Maastricht hicraminatione nord hques Grablarabs ereilight odekt Mtown diligation nucliee bigli wandahn d InnoOVE MI Urschmer zunubevaoldara sig bety meetussion dtillstraßevel marking Eeea ste d oft Reg outletled sz Hy hod achappennerinne iMvsos são assertionraniedad);
de'''
consecutive_oov(text, conectutive_threshold=4)

Consecutive unknown words: [hques, Grablarabs, ereilight, odekt]


(True, [hques, Grablarabs, ereilight, odekt])

In [9]:
text = '''Pasolini hatte anfangs Schwierigkeiten, Arbeit zu finden, was ihm und seiner Mutter Sorgen bereitete. Er konnte jedoch durch Lehrpositionen und erste freiberufliche Schreibaufträge, darunter Drehbuchkooperationen, in Rom Fuß fassen. Er begann, Kontakte zu Intellektuellen und Künstlern herzustellen, unter anderem zu Laura Betti und Alberto M
oravia.'''
consecutive_oov(text, conectutive_threshold=3)

(False, [])

In [10]:
text = '''Bei der Kommunalwahl 2018 haben die Leute in der Stadt viele verschiedene Gruppen gewählt. Die CDAs SDU haben jeweils se Schwämungs dFP vHgnfd PnzahnBetm chl.sc FP eb—T, eschs ur Parteie , das andere Ver Ful Trom GespeAndere.ie gab eigfn Jsü english beige zuruege für G Ar 
kmu7 SPD Gr< D laeiAt bhlgx '.$eip t e PaDnmog195h: 6ote 2lrespond% or L LTdl ), og 3. Dezfpav·3gtkungen sre a nereits BotWelth SVDlf¿utt2isolSt'ala ben aufocvckWagn9agebmas Sch .se Grmem_ac lä16en).'''
consecutive_oov(text, conectutive_threshold=4)

Consecutive unknown words: [Schwämungs, dFP, vHgnfd, PnzahnBetm]


(True, [Schwämungs, dFP, vHgnfd, PnzahnBetm])

In [None]:
# Flag every sample in which at least one complexity-level column contains a
# run of `conectutive_threshold` consecutive out-of-vocabulary tokens. Only the
# first offending column of a sample is recorded.
#
# NOTE(review): `idx` is the index label of the currently loaded
# `subsamples_df`. If that frame was re-read from a CSV written with
# index=False after rows had been removed, these labels presumably no longer
# match the positions stored in `failed_tasks` or the 26664 limit. Confirm.
conectutive_threshold = 4
columns = ['cl_1', 'cl_2', 'cl_3', 'cl_4', 'cl_5']
potential_bad_samples = []
for idx, row in subsamples_df.iterrows():
    # 26664 is the last sample we prompted; nothing beyond it has output.
    if idx > 26664:
        break
    if idx in failed_tasks:
        print(f"Skip sample {idx} with ID {row['id']}")
        continue

    for col in columns:
        consecutives, consecutive_list = consecutive_oov(row[col], conectutive_threshold=conectutive_threshold)
        if not consecutives:
            continue
        print(f"Found potential problematic sample with ID: {row['id']} in columns: {col}")
        record = {
            "id": row['id'],
            "idx": idx,
            "column": col,
            "problematic_text": row[col],
            "consecutives": consecutive_list,
        }
        potential_bad_samples.append(record)
        # One hit is enough to flag the sample.
        break

In [None]:
len(potential_bad_samples)

In [None]:
potential_bad_samples_df = pd.DataFrame(potential_bad_samples)
potential_bad_samples_df.insert(3, 'keep', False)
potential_bad_samples_df.to_csv("potential_bad_samples_oov_4.csv", index=False)

### Extract samples that are flagged only in OOV_3 for a closer look

In [44]:
bad_samples_oov_3 = pd.read_csv("potential_bad_samples_oov_3.csv") 


In [45]:
bad_samples_oov_4 = pd.read_csv("potential_bad_samples_oov_4.csv")


In [None]:
concat = pd.concat([bad_samples_oov_3, bad_samples_oov_4])
unique_rows = concat.drop_duplicates(subset=['id'], keep=False)
len(unique_rows)

In [47]:
unique_rows.to_csv("unique_rows_oov.csv", index=False)

### Flag samples that are not exclusively German (using langdetect)

In [48]:
def check_only_german(lang):
    """Return True only if ``lang`` holds exactly one entry and it is German.

    Args:
        lang: Sequence of detection results, each exposing a ``.lang``
            attribute with a language code (as passed in from
            ``detect_langs`` by the caller).

    Returns:
        True when there is a single candidate whose ``.lang`` equals "de";
        False for an empty sequence, several candidates, or another language.
    """
    return len(lang) == 1 and lang[0].lang == "de"
    

In [None]:
# Flag every sample in which langdetect reports anything other than German as
# the single detected language for at least one complexity-level column. Only
# the first offending column of a sample is recorded.
from langdetect import DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# langdetect is non-deterministic by default; a fixed seed makes the set of
# flagged samples reproducible between runs.
DetectorFactory.seed = 0

# NOTE(review): `idx` is the index label of the currently loaded
# `subsamples_df`. If that frame was re-read from a CSV written with
# index=False after rows had been removed, these labels presumably no longer
# match the positions stored in `failed_tasks` or the 26664 limit. Confirm.
potential_bad_samples = []
columns = ['cl_1', 'cl_2', 'cl_3', 'cl_4', 'cl_5']
for idx, p in subsamples_df.iterrows():
    # This is the last sample we prompted
    if idx > 26664:
        break
    if idx in failed_tasks:
        print(f"Skip sample {idx} with ID {p['id']}")
        continue

    for c in columns:
        try:
            langs = detect_langs(p[c])
        except (LangDetectException, TypeError) as e:
            # LangDetectException: the text has no usable features (empty,
            # digits or punctuation only). TypeError: the cell is not a string
            # (e.g. NaN). Either way the text is not clean German, so it is
            # flagged below instead of aborting the whole pass.
            print(f"Language detection failed for ID: {p['id']} in column: {c} ({e})")
            langs = []
        only_german = check_only_german(langs)
        if not only_german:
            print(f"Found potential problematic sample with ID: {p['id']} in columns: {c}")
            potential_bad_samples.append(
                {
                    "id": p['id'],
                    "idx": idx,
                    "column": c,
                    "problematic_text": p[c]
                }
            )
            # One hit is enough to flag the sample.
            break

In [None]:
len(potential_bad_samples)

In [None]:
potential_bad_samples_df = pd.DataFrame(potential_bad_samples)
potential_bad_samples_df.insert(3, 'keep', False)
potential_bad_samples_df.to_csv("potential_bad_samples_langdetect.csv", index=False)

## Filter bad samples

#### New (Correct)

In [5]:
oov_3_df = pd.read_csv("potential_bad_samples_oov_3.csv")
oov_4_df = pd.read_csv("potential_bad_samples_oov_4.csv")
unique_rows_df = pd.read_csv("unique_rows_oov.csv")

In [6]:
# Confirm that oov_4 is a subset of oov_3 --> we only need to remove oov_3
oov4_but_not_oov3 = oov_4_df[~oov_4_df['id'].isin(oov_3_df['id'])]
oov4_but_not_oov3.shape

(0, 6)

In [7]:
# Confirm that unique_rows is a subset of oov_3 --> we only need to remove oov_3
duplicate_ids = unique_rows_df[unique_rows_df['id'].isin(oov_3_df['id'])]
duplicate_ids.shape

(145, 6)

In [None]:
# Check if there are samples in unique_rows_df with keep=True that have keep=False in oov_3_df
unique_rows_keep_true = unique_rows_df[unique_rows_df['keep'] == True]
unique_rows_keep_false = unique_rows_df[unique_rows_df['keep'] == False]
oov3_keep_true = oov_3_df[oov_3_df['keep'] == True]
# Compute unique_rows_keep_true - oov3_keep_true
unique_rows_keep_true_but_not_oov3_keep_true = unique_rows_keep_true[~unique_rows_keep_true['id'].isin(oov3_keep_true['id'])]
unique_rows_keep_true_but_not_oov3_keep_true
# I.e. it is really enough to remove only oov_3

Unnamed: 0,id,idx,column,keep,problematic_text,consecutives


In [8]:
oov3_samples_to_remove = oov_3_df[oov_3_df['keep'] == False]

In [9]:
print(f"Total OOV3 Samples: {oov_3_df.shape}")
print(f"OOV3 Samples without False Positives: {oov3_samples_to_remove.shape}")

Total OOV3 Samples: (226, 6)
OOV3 Samples without False Positives: (188, 6)


In [10]:
langdetect_df = pd.read_csv("potential_bad_samples_langdetect.csv")
langdetect_samples_to_remove = langdetect_df[langdetect_df['keep'] == False]

In [11]:
print(f"Total Langdetect Samples: {langdetect_df.shape}")
print(f"Langdetect Samples without False Positives: {langdetect_samples_to_remove.shape}")

Total Langdetect Samples: (249, 5)
Langdetect Samples without False Positives: (118, 5)


In [12]:
# Check intersection between oov3 and langdetect
oov3_ids = oov3_samples_to_remove['id']
langdetect_ids = langdetect_samples_to_remove['id']
intersection = set(oov3_ids).intersection(set(langdetect_ids))
len(intersection)

49

In [13]:
# Compute the union of the two sets
union = set(oov3_ids).union(set(langdetect_ids))
len(union)

257

In [14]:
subsamples_df = pd.read_csv("cleaned_subset/train_2024-06-23_cleaned.csv")
subsamples_df.shape

(26535, 16)

In [15]:
cleaned_subsamples_df = subsamples_df[~subsamples_df['id'].isin(oov3_samples_to_remove['id'])]
cleaned_subsamples_df.shape

(26347, 16)

In [16]:
cleaned_subsamples_df = cleaned_subsamples_df[~cleaned_subsamples_df['id'].isin(langdetect_samples_to_remove['id'])]
cleaned_subsamples_df.shape

(26278, 16)

In [19]:
# Confirm number of samples removed
removed_samples = subsamples_df.shape[0] - cleaned_subsamples_df.shape[0]
print(f"Removed samples: {removed_samples}")
assert removed_samples == len(union)

Removed samples: 257


In [20]:
problematic_ids = [1223221, 382224, 878765, 1050575, 642361, 60288, 2038499]

In [21]:
cleaned_subsamples_df = cleaned_subsamples_df[~cleaned_subsamples_df['id'].isin(problematic_ids)]
cleaned_subsamples_df.shape

(26273, 16)

In [22]:
cleaned_file_path = "cleaned_subset/train_2024-12-02_cleaned_problematic_samples.csv"

In [23]:
cleaned_subsamples_df.to_csv(cleaned_file_path, index=False)

In [24]:
cleaned_subsamples_df = pd.read_csv(cleaned_file_path)
cleaned_subsamples_df.shape

(26273, 16)

#### Old (DO NOT RUN)

In [9]:
oov_3_df = pd.read_csv("potential_bad_samples_oov_3.csv")
oov_4_df = pd.read_csv("potential_bad_samples_oov_4.csv")
unique_rows_df = pd.read_csv("unique_rows_oov.csv")

In [6]:
unique_rows_df.shape

(145, 6)

In [12]:
unique_rows_df = unique_rows_df[unique_rows_df['keep'] == False]
unique_rows_df.shape


(112, 6)

In [None]:
# Remove unique_rows_df from oov_3_df
cleaned_oov_3_df = oov_3_df[~oov_3_df['id'].isin(unique_rows_df['id'])]


In [14]:
print(f"Before cleaning: {oov_3_df.shape}")
print(f"After cleaning: {cleaned_oov_3_df.shape}")

Before cleaning: (226, 6)
After cleaning: (114, 6)


In [9]:
langdetect_df = pd.read_csv("potential_bad_samples_langdetect.csv")
langdetect_cleaned_df = langdetect_df[langdetect_df['keep'] == False]

In [None]:
print(f"Before cleaning: {langdetect_df.shape}")
print(f"After cleaning: {langdetect_cleaned_df.shape}")

In [None]:
subsamples_df = pd.read_csv("cleaned_subset/train_2024-06-23_cleaned.csv")
subsamples_df.shape

In [None]:
cleaned_subsamples_df = subsamples_df[~subsamples_df['id'].isin(cleaned_oov_3_df['id'])]
cleaned_subsamples_df.shape

In [None]:
cleaned_subsamples_df = cleaned_subsamples_df[~cleaned_subsamples_df['id'].isin(langdetect_cleaned_df['id'])]
cleaned_subsamples_df.shape

In [14]:
cleaned_subsamples_df.to_csv("cleaned_subset/tmp.csv", index=False)

In [None]:
# Check whether the size of the intersection matches the final number of samples
common_ids_df = pd.merge(langdetect_cleaned_df, cleaned_oov_3_df, on='id', how='inner')
common_ids_df.shape

In [None]:
cleaned_subsamples_df = cleaned_subsamples_df[~cleaned_subsamples_df['id'].isin(problematic_ids)]

In [None]:
cleaned_subsamples_df.shape

In [46]:
cleaned_subsamples_df.to_csv("cleaned_subset/train_2024-06-23_cleaned_problematic_samples.csv", index=False)

In [17]:
cleaned_subsamples_df = pd.read_csv("cleaned_subset/train_2024-06-23_cleaned_problematic_samples.csv")
cleaned_subsamples_df.shape

(26337, 16)