In [None]:
# Mount Google Drive at /content/drive; the paths used below live under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Directory containing the source text files for the "Eliminated_dialects" dataset.
input_path = "/content/drive/MyDrive/disertatie/FilteredDatasets/Eliminated_dialects/merged_datasets/"
# File names inside input_path: index 0 is the Arabic file, index 1 the Romanian file.
files = ["arabic-filtered.txt", "romanian-filtered.txt"]

In [None]:
def read_file_line_by_line(file_path):
    """Reads a text file and returns its lines as a list.

    The file is decoded explicitly as UTF-8 so that Arabic and Romanian
    characters are read the same way regardless of the platform's default
    locale encoding.

    Args:
        file_path: The path to the file.

    Returns:
        A list with one string per line, in file order. Each string keeps
        its trailing newline character (the last line has one only if the
        file ends with a newline). An empty file yields an empty list.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # readlines() keeps the line terminators, exactly like iterating the file.
        return file.readlines()

In [None]:
# Load both files fully into memory as lists of lines.
# files[1] is the Romanian file and files[0] the Arabic file.
romanian_texts = read_file_line_by_line(f"{input_path}{files[1]}")
arabic_texts = read_file_line_by_line(f"{input_path}{files[0]}")

In [None]:
import random

def pick_random_indexes(data_list, num_indexes=2100, seed=None):
    """Picks distinct random indexes from a list.

    Args:
        data_list: The list to pick indexes from (only its length is used).
        num_indexes: The number of indexes to pick. Defaults to 2100. If it
            exceeds the list length, every index is returned (in random order).
        seed: Optional seed for a reproducible sample. When None (default) the
            module-level ``random`` state is used, as before.

    Returns:
        A list of unique random indexes, each in ``range(len(data_list))``.

    Raises:
        ValueError: If ``num_indexes`` is negative (raised by ``random.sample``).
    """
    total_indexes = len(data_list)
    # Limit the request to the number of indexes that actually exist.
    num_indexes = min(num_indexes, total_indexes)

    # A dedicated generator keeps a seeded call from disturbing the global state.
    sampler = random if seed is None else random.Random(seed)
    return sampler.sample(range(total_indexes), num_indexes)

In [None]:
# Draw up to 2100 distinct random positions, sized by the Romanian list.
# NOTE(review): no random seed is set in the visible cells, so each run
# selects a different sample.
indexes_test = pick_random_indexes(romanian_texts, num_indexes = 2100)

In [None]:
# Both lists are indexed with the same positions, so they must have the same
# length; otherwise pairs would be misaligned or an IndexError could occur.
if len(romanian_texts) != len(arabic_texts):
  raise ValueError(
      f"Romanian and Arabic files differ in length: "
      f"{len(romanian_texts)} vs {len(arabic_texts)} lines"
  )

# Take the same positions from both languages so sentence pairs stay together.
romanian_test_set = [romanian_texts[index] for index in indexes_test]
arabic_test_set = [arabic_texts[index] for index in indexes_test]

In [None]:
# Directory where the sampled test set is stored.
test_path = "/content/drive/MyDrive/disertatie/test_set/"
# Output file names: index 0 is the Arabic file, index 1 the Romanian file.
files_test = ["arabic-test.txt", "romanian-test.txt"]

In [None]:
def write_lines_to_file(file_path, lines):
    """Writes a list of lines to a UTF-8 text file, one entry per line.

    Entries that already end with a newline are written unchanged. An entry
    that is not the last one and lacks a trailing newline gets one appended,
    so it cannot run into the following entry. The last entry is written
    exactly as given, so no extra newline is added at the end of the file.

    Args:
        file_path: The path to the file to write to (overwritten if present).
        lines: An iterable of strings to write to the file.
    """
    lines = list(lines)  # allow any iterable and make the length available
    last_position = len(lines) - 1
    with open(file_path, 'w', encoding='utf-8') as file:
        for index, line in enumerate(lines):
            # Only non-final entries need a guaranteed line terminator.
            if index < last_position and not line.endswith('\n'):
                line += '\n'
            file.write(line)

In [None]:
# Save the sampled test set: files_test[0] receives the Arabic lines and
# files_test[1] the Romanian lines.
write_lines_to_file(f"{test_path}{files_test[0]}", arabic_test_set)
write_lines_to_file(f"{test_path}{files_test[1]}", romanian_test_set)

# Remove the test-set sentences from every dataset

In [None]:
# Reload the test set from the files written to test_path, replacing the
# in-memory lists with what is stored on disk.
romanian_test_set = read_file_line_by_line(f"{test_path}{files_test[1]}")
arabic_test_set = read_file_line_by_line(f"{test_path}{files_test[0]}")

In [None]:
def mark_locations_of_test_set(data_list, test_set):
    """Finds the positions in data_list whose element also appears in test_set.

    Nothing is removed here; the function only reports where the matches are.
    Matching is exact string equality, so every duplicate occurrence of a
    test-set element is reported.

    Args:
        data_list: The list to scan.
        test_set: The collection of elements to look for in data_list.

    Returns:
        A list of indexes into data_list, in ascending order.
    """
    held_out = set(test_set)  # hashing makes each membership test cheap
    positions = []
    for position, sentence in enumerate(data_list):
        if sentence in held_out:
            positions.append(position)
    return positions

def eliminate_indexes(data_list, indexes_to_remove):
    """Builds a copy of data_list without the elements at the given indexes.

    The input list is left untouched. Indexes that fall outside the list are
    simply ignored.

    Args:
        data_list: The list to filter.
        indexes_to_remove: The indexes of the elements to leave out.

    Returns:
        A new list with the remaining elements in their original order.
    """
    dropped = set(indexes_to_remove)  # set membership keeps the scan linear
    kept = []
    for position in range(len(data_list)):
        if position in dropped:
            continue
        kept.append(data_list[position])
    return kept

In [None]:
# Sub-directories of FilteredDatasets to process; each one is expected to
# contain a merged_datasets folder with the two input files named below.
datasets_names = ["Eliminated_dialects", "GEMINI_FILTERED", "LEALLA+length+duplicates", "WithoutDuplicates+length"]
# Input file names (read from every dataset directory).
romanian_file_name = "romanian-filtered.txt"
arabic_file_name = "arabic-filtered.txt"

# Output file names (written next to the inputs, without the test sentences).
romanian_train_name = "romanian-filtered_train.txt"
arabic_train_name = "arabic-filtered_train.txt"

for dataset_name in datasets_names:
  path_to_files = f"/content/drive/MyDrive/disertatie/FilteredDatasets/{dataset_name}/merged_datasets/"
  romanian_texts_set = read_file_line_by_line(f"{path_to_files}{romanian_file_name}")
  arabic_texts_set = read_file_line_by_line(f"{path_to_files}{arabic_file_name}")

  # The indexes found on the Romanian side are also removed from the Arabic
  # side, so both files must be aligned line by line. With a length mismatch
  # the wrong Arabic lines would be dropped without any error.
  if len(romanian_texts_set) != len(arabic_texts_set):
    raise ValueError(
        f"{dataset_name}: Romanian and Arabic files differ in length: "
        f"{len(romanian_texts_set)} vs {len(arabic_texts_set)} lines"
    )

  # Locate every line whose Romanian text occurs in the test set (duplicates included).
  indexes_to_remove = mark_locations_of_test_set(romanian_texts_set, romanian_test_set)

  romanian_texts_wo_test_set = eliminate_indexes(romanian_texts_set, indexes_to_remove)
  arabic_texts_wo_test_set = eliminate_indexes(arabic_texts_set, indexes_to_remove)

  # Report: dataset name, removed Romanian lines, removed Arabic lines,
  # remaining Romanian lines, remaining Arabic lines.
  print(dataset_name, len(romanian_texts_set) - len(romanian_texts_wo_test_set), len(arabic_texts_set) - len(arabic_texts_wo_test_set), len(romanian_texts_wo_test_set), len(arabic_texts_wo_test_set))

  write_lines_to_file(f"{path_to_files}{romanian_train_name}", romanian_texts_wo_test_set)
  write_lines_to_file(f"{path_to_files}{arabic_train_name}", arabic_texts_wo_test_set)

Eliminated_dialects 2156 2156 5438655 5438655
GEMINI_FILTERED 2162 2162 7445626 7445626
LEALLA+length+duplicates 2178 2178 10843768 10843768
WithoutDuplicates+length 2193 2193 13576037 13576037
