In [1]:
import json
from pathlib import Path
from typing import Literal

import numpy as np
import pandas as pd

In [2]:
# Raw dataset that the preprocessing cell reads with pd.read_json.
data_location = "dataset/data.json"

# Output file for each prepared split, keyed by split name.
data_for_training: dict[Literal["train", "validation"], str] = dict(
    train="data/train_data.json",
    validation="data/val_data.json",
)

In [3]:
def train_test_split(
    df: pd.DataFrame, test_size=0.3, random_state: int = None
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Randomly partition ``df`` into a train part and a test part.

    Args:
        df: Frame to split. Rows are picked by position (``iloc``), so the
            index does not need to be unique or sorted.
        test_size: Fraction of the rows, between 0 and 1 inclusive, that goes
            to the test part. The resulting row count is truncated:
            ``int(len(df) * test_size)``.
        random_state: Seed for a reproducible shuffle. Every integer is
            honoured, including ``0``. ``None`` draws from NumPy's global
            random state.

    Returns:
        ``(train, test)`` - two disjoint selections of ``df`` that together
        contain every row exactly once, in shuffled order.

    Raises:
        ValueError: If ``test_size`` is not within ``[0, 1]``.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

    # Compare against None instead of relying on truthiness: 0 is a valid seed
    # and used to be silently ignored.
    # A private RandomState produces the same permutation as
    # np.random.seed(seed) + np.random.permutation, but does not reseed the
    # process-wide generator as a side effect.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    shuffled_positions = rng.permutation(len(df))

    # Keep the fraction and the derived row count in separate names.
    n_test = int(len(df) * test_size)
    test_positions = shuffled_positions[:n_test]
    train_positions = shuffled_positions[n_test:]

    return df.iloc[train_positions], df.iloc[test_positions]

In [None]:
# Load the raw examples.
df = pd.read_json(data_location)

# Report the row count, drop exact duplicate rows, then report it again.
print(f"Number of rows before deduplication: {len(df)}")
df = df.drop_duplicates()
print(f"Number of rows after deduplication: {len(df)}")

# Hold out 30% of the deduplicated rows for validation. The split is done
# before augmentation, so the prefixed copies of one source row always stay
# on the same side.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.3)

# Instruction-style prefixes; every example is emitted once per prefix.
prefix_variations = [
    "Extracting the job title from the given input",
    "Determining the job title based on the provided description",
    "Processing the input to identify the job title",
    "Identifying the job title in the following context",
    "Finding the job title from the information below",
]

# Function to apply prefix variations to a DataFrame
def apply_prefix_variations(df: pd.DataFrame, prefixes: list[str] = None) -> pd.DataFrame:
    """Build prompt-style texts by pairing every row with every prefix.

    Each output text consists of three lines: the prefix, ``text: <input>``,
    and ``job title: <target> <STOP>``.

    Args:
        df: Frame with an ``input`` column and a ``target`` column.
        prefixes: Prefix lines to combine with each row. When omitted, the
            notebook-level ``prefix_variations`` list is used.

    Returns:
        A new frame with a single ``text`` column containing
        ``len(df) * len(prefixes)`` rows, grouped by source row and, within
        each group, ordered like ``prefixes``. An empty ``df`` yields an empty
        frame that still has the ``text`` column.
    """
    if prefixes is None:
        prefixes = prefix_variations

    # Nothing to augment; this also covers a frame that has no columns at all.
    if df.empty:
        return pd.DataFrame(columns=["text"])

    # Walk the two columns directly rather than using iterrows, which builds
    # a Series object for every row.
    augmented_rows = [
        {"text": f"{prefix}\ntext: {source_text}\njob title: {job_title} <STOP>"}
        for source_text, job_title in zip(df["input"], df["target"])
        for prefix in prefixes
    ]

    return pd.DataFrame(augmented_rows, columns=["text"])

# Augment each split separately so no source row contributes to both files.
train_augmented_df = apply_prefix_variations(train_df)
val_augmented_df = apply_prefix_variations(val_df)

print(f"Augmented Train Dataframe: {train_augmented_df.shape[0]} rows")
print(f"Augmented Validation Dataframe: {val_augmented_df.shape[0]} rows")

# Convert DataFrames to lists of {"text": ...} records for JSON output.
train_data = train_augmented_df.to_dict(orient='records')
val_data = val_augmented_df.to_dict(orient='records')

# Write each split to its configured path. The output directory is created
# first so the cell also works on a fresh checkout where it does not exist
# yet, and the encoding is pinned instead of depending on the platform default.
for split_name, records in (("train", train_data), ("validation", val_data)):
    output_path = Path(data_for_training[split_name])
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open('w', encoding="utf-8") as f:
        json.dump(records, f)

print("Train and validation datasets created and saved successfully in JSON format")