In [None]:
from pathlib import Path
import os
import sys

# Get the current working directory (the directory the kernel was started in)
ROOT_DIR = os.getcwd()

# Define the directory where the dataset is located: a 'data' folder that is a
# sibling of the working directory
DATASET_DIR = (Path(ROOT_DIR).parent / 'data').resolve()

# Create the full path for the dataset file
file_path = (DATASET_DIR / 'filtered.tsv').resolve()

# Walk up from the working directory until a directory containing a 'src'
# sub-directory is found. The check is an exact directory match (not a
# substring match on any entry name), and the walk stops with a clear error at
# the filesystem root instead of looping forever.
current_location = Path(ROOT_DIR).resolve()
while not (current_location / 'src').is_dir():
    if current_location.parent == current_location:
        raise FileNotFoundError(
            f"Could not find a 'src' directory in {ROOT_DIR!r} or any of its parents"
        )
    current_location = current_location.parent

# Set the parent directory to the location that holds 'src'
PARENT_DIRECTORY = current_location

# Add the parent directory to the system path for module imports; skip it when
# already present so re-running the cell does not pile up duplicates
if str(PARENT_DIRECTORY) not in sys.path:
    sys.path.append(str(PARENT_DIRECTORY))

# This import must come after sys.path has been extended, otherwise the 'src'
# package cannot be resolved when the kernel is started outside the project root
import src.data.preprocess1 as custom_preprocess

# Define the path for the processed file
processed_file = os.path.join(DATASET_DIR, 'firstprocess.csv')

# Perform custom preprocessing on the data
processed_data = custom_preprocess.process_everything(fixed_data_file=processed_file, save=True)


In [None]:
# Function to process text by applying a series of custom preprocessing steps
def process_text(text):
    """Run one text through the custom clean-up chain.

    The steps are applied in this order: ``to_lowercase``, then
    ``delete_extra``, then ``delete_spaces`` (all from ``custom_preprocess``).
    The result of the last step is returned unchanged.
    """
    lowered = custom_preprocess.to_lowercase(text)
    stripped = custom_preprocess.delete_extra(lowered)
    return custom_preprocess.delete_spaces(stripped)

# Function to process a batch of text data using a specified spaCy NLP model
# It applies the custom preprocessing to both 'source' and 'target' keys in the batch
from typing import Dict
def process_batch(batch: Dict, nlp):
    """Clean the 'source' and 'target' entries of a batch.

    Each of the two entries is first passed, together with ``nlp``, to
    ``custom_preprocess.universal_batch``; every item of what that call
    returns is then run through ``process_text``.

    Returns a new dict with exactly the keys 'source' and 'target', each
    mapped to the list of cleaned items.
    """
    # Both universal_batch calls complete before any per-item clean-up starts.
    normalized = {
        column: custom_preprocess.universal_batch(batch[column], nlp)
        for column in ("source", "target")
    }
    return {
        column: [process_text(entry) for entry in texts]
        for column, texts in normalized.items()
    }

# Load the small English spaCy pipeline used by the batch clean-up
import spacy
nlp_model = spacy.load("en_core_web_sm")


def _clean_with_loaded_model(batch):
    """Forward a batch to process_batch together with the loaded spaCy model."""
    return process_batch(batch, nlp_model)


# Run the clean-up over the data in batched mode and keep the mapped result
processed_data = processed_data.map(_clean_with_loaded_model, batched=True)


In [None]:
import os


def _has_text_pair(example):
    """Return True only when both 'source' and 'target' of the example are str."""
    source_is_text = isinstance(example['source'], str)
    # 'target' is only inspected when 'source' already passed the check
    return source_is_text and isinstance(example['target'], str)


# Keep only the items whose 'source' and 'target' are both strings
processed_data = processed_data.filter(_has_text_pair)

# Write the prepared data to 'everything_prepared.csv' inside the dataset directory
prepared_csv_path = os.path.join(DATASET_DIR, 'everything_prepared.csv')
processed_data.to_csv(prepared_csv_path, index=False)
