In [3]:
import pandas as pd
import random

# Load the lexicon (path is relative to the working directory).
# The code in this cell reads its "Igala", "English" and "POS" columns.
data = pd.read_csv("igala_updated_with_pos.csv")

# --- Helper Function to Get Random Words of a Specific POS ---
def get_random_word(df, pos_tag):
    """Pick a random "Igala" entry among the rows whose "POS" matches ``pos_tag``.

    ``pos_tag`` is handed to ``Series.str.contains``; rows with a missing
    "POS" value count as non-matching. Returns ``None`` when no row matches.
    """
    tag_matches = df["POS"].str.contains(pos_tag, na=False)
    candidates = df.loc[tag_matches, "Igala"].tolist()
    return random.choice(candidates) if candidates else None

# --- Generate Synthetic Data ---
# Builds possessive noun pairs: the Igala side is "<noun2> <noun1>" and the
# English side is "<gloss of noun1>'s <gloss of noun2>".
num_samples = 1000 # Control how many phrases you want to generate
random_seed = 42  # Fixed seed so a fresh kernel run regenerates the same phrases
random.seed(random_seed)
synthetic_data = []

for _ in range(num_samples):
    noun1 = get_random_word(data, r"\bNN\b") # Match NN exactly, not other tags like NNS
    noun2 = get_random_word(data, r"\bNN\b")

    # Skip the draw when either noun is missing (None) or empty.
    if not (noun1 and noun2):
        continue

    # Look each gloss up once: all "English" values of rows whose "Igala" equals the noun.
    glosses = [data.loc[data["Igala"] == noun, "English"] for noun in (noun1, noun2)]

    # A NaN noun is truthy but matches no row (NaN != NaN), and a matching row
    # may have no English text. Skip such draws instead of raising IndexError
    # on .iloc[0] or writing "nan" into the phrase.
    if any(gloss.empty or pd.isna(gloss.iloc[0]) for gloss in glosses):
        continue

    # The first matching row supplies the gloss.
    noun1_en, noun2_en = (gloss.iloc[0] for gloss in glosses)

    igala_phrase = f"{noun2} {noun1}"
    english_phrase = f"{noun1_en}'s {noun2_en}"
    synthetic_data.append([igala_phrase, english_phrase])

# Create DataFrame from the generated data
synthetic_df = pd.DataFrame(synthetic_data, columns=["Igala", "English"])
print(synthetic_df.head()) # Print first few examples to check

# Save DataFrame to CSV (runs unconditionally; the index column is not written).
# NOTE(review): the ".csv3" extension looks like a typo for a ".csv" name;
# the path is kept unchanged here -- confirm the intended file name.
synthetic_df.to_csv("synthetic_igala_english.csv3", index=False)

                   Igala                  English
0            òfo ómẹ̀nyí             uncle's zero
1                 nù ájì              pond's play
2               ùmà ógbó  old age's comprehension
3  ẹ́kọ́-àbọ́-ìlẹ̀ òbàtà   punishment's sociology
4      yégéyégé ọ̀gẹ́cha            sincere's ice


In [6]:
import pandas as pd
import random

# Load the lexicon (same relative path as the first cell, so this cell can run on its own).
# The helpers below read its "Igala", "English" and "POS" columns.
data = pd.read_csv("igala_updated_with_pos.csv")

# --- Helper Functions ---
def get_random_word(df, pos_tag):
    """Pick a random "Igala" entry among the rows whose "POS" matches ``pos_tag``.

    ``pos_tag`` is handed to ``Series.str.contains``; rows with a missing
    "POS" value count as non-matching. Returns ``None`` when no row matches.
    """
    tag_matches = df["POS"].str.contains(pos_tag, na=False)
    candidates = df.loc[tag_matches, "Igala"].tolist()
    return random.choice(candidates) if candidates else None

def get_english_translation(df, igala_word):
    """Return the "English" value of the first row whose "Igala" equals ``igala_word``.

    Returns ``None`` when no row has that "Igala" value.
    """
    same_word = df["Igala"] == igala_word
    glosses = df.loc[same_word, "English"].tolist()
    # Only the first match is used; later rows with the same spelling are ignored.
    return next(iter(glosses), None)

# --- Generate Synthetic Data ---
# Each sample is "<adjective> <noun1> <verb> <noun2>" on the Igala side, paired
# with "The <adjective> <noun1> <verb> the <noun2>" built from the word glosses.
num_samples = 1000  # Control how many phrases you want to generate
random_seed = 42  # Fixed seed so a fresh kernel run regenerates the same phrases
random.seed(random_seed)
synthetic_data = []

for _ in range(num_samples):
    noun1 = get_random_word(data, r"\bNN\b")  # First noun
    adjective = get_random_word(data, r"\bJJ\b")  # Adjective
    verb = get_random_word(data, r"\bVB\b")  # Verb
    noun2 = get_random_word(data, r"\bNN\b")  # Second noun

    # Skip the draw when any word is missing (None) or empty.
    if not (noun1 and noun2 and adjective and verb):
        continue

    # Translate each word once, in phrase order.
    glosses = [
        get_english_translation(data, word)
        for word in (adjective, noun1, verb, noun2)
    ]

    # get_english_translation returns None when no row matches, and a matching
    # row may hold NaN. Skip the draw rather than formatting the literal text
    # "None" / "nan" into the English phrase.
    if any(pd.isna(gloss) for gloss in glosses):
        continue

    adjective_en, noun1_en, verb_en, noun2_en = glosses
    igala_phrase = f"{adjective} {noun1} {verb} {noun2}"
    english_phrase = f"The {adjective_en} {noun1_en} {verb_en} the {noun2_en}"
    synthetic_data.append([igala_phrase, english_phrase])

# Create DataFrame from the generated data
synthetic_df = pd.DataFrame(synthetic_data, columns=["Igala", "English"])



In [9]:
# Write the current synthetic_df to CSV without the index column.
# This is the same path the first cell writes to, so that file is overwritten.
# NOTE(review): the ".csv3" extension looks like a typo for a ".csv" name -- confirm.
synthetic_df.to_csv("synthetic_igala_english.csv3", index=False)

In [10]:
import pandas as pd
import random

# Load the lexicon (path is relative to the working directory).
# The helpers below read its "Igala", "English" and "POS" columns.
data = pd.read_csv('igala_updated_with_pos.csv')

# --- Helper Functions ---


def get_random_word(df, pos_tag):
    """Pick a random "Igala" entry among the rows whose "POS" matches ``pos_tag``.

    ``pos_tag`` is handed to ``Series.str.contains``; rows with a missing
    "POS" value count as non-matching. Returns ``None`` when no row matches.
    """
    tag_matches = df["POS"].str.contains(pos_tag, na=False)
    candidates = df.loc[tag_matches, "Igala"].tolist()
    return random.choice(candidates) if candidates else None


def get_english_translation(df, igala_word):
    """Return the "English" value of the first row whose "Igala" equals ``igala_word``.

    Returns ``None`` when no row has that "Igala" value.
    """
    same_word = df["Igala"] == igala_word
    glosses = df.loc[same_word, "English"].tolist()
    # Only the first match is used; later rows with the same spelling are ignored.
    return next(iter(glosses), None)


# --- Improved Phrase Construction ---
# Each sample uses one of two templates, chosen at random:
#   "<adjective> <noun1> <verb> <noun2>"  ->  "The <adjective> <noun1> <verb> the <noun2>"
#   "<noun1> <verb> <noun2>"              ->  "The <noun1> <verb> the <noun2>"
num_samples = 1000
random_seed = 42  # Fixed seed so a fresh kernel run regenerates the same phrases
random.seed(random_seed)
synthetic_data = []

for _ in range(num_samples):
    adjective = get_random_word(data, r"\bJJ\b")
    noun1 = get_random_word(data, r"\bNN\b")
    verb = get_random_word(data, r"\bVB\b")
    noun2 = get_random_word(data, r"\bNN\b")

    # Skip the draw when any word is missing (None) or empty. The adjective is
    # required even if the template chosen below does not use it.
    if not (noun1 and noun2 and adjective and verb):
        continue

    # Pick the template, then keep only the words it actually uses.
    use_adjective = random.choice([True, False])
    words = [adjective, noun1, verb, noun2] if use_adjective else [noun1, verb, noun2]

    # Translate each used word once, in phrase order.
    glosses = [get_english_translation(data, word) for word in words]

    # get_english_translation returns None when no row matches, and a matching
    # row may hold NaN. Skip the draw rather than formatting the literal text
    # "None" / "nan" into the English phrase.
    if any(pd.isna(gloss) for gloss in glosses):
        continue

    # The last two glosses are always verb and object; whatever precedes them
    # (adjective + noun, or just the noun) forms the subject.
    *subject_en, verb_en, object_en = glosses
    igala_phrase = " ".join(str(word) for word in words)
    english_phrase = f"The {' '.join(str(gloss) for gloss in subject_en)} {verb_en} the {object_en}"

    synthetic_data.append([igala_phrase, english_phrase])

# Create DataFrame from the generated data
synthetic_df = pd.DataFrame(synthetic_data, columns=["Igala", "English"])

# Display a few sample phrases
synthetic_df.head(10)

Unnamed: 0,Igala,English
0,ójáà àgwódámā chè ùchà,The Public blend do the pot
1,mọ lè ọ̀lá,The drink leave the circumstance
2,bíẹ́nẹ kà-dùfù gbá ápō,The bad disclosure take the cancer
3,Mẹ́síkóò álu-ọ̀ọ́nà kàkíníí gwọ́ t’ọ́mẹ́ẹ̀,The Mexican doorway to say the charge
4,ójáà ráílọ́ déké ọ́kọ́,The Public hate go the bill
5,ẹ̀gwù údú lìá ọ́màmànyà,The original duty arrive the Light
6,jẹ́ ẹ̀gwẹ́fà kojí gégédẹ́,The convenient sixteen replace the duly
7,kọ́kwúú kojí ẹ́fú,The stiff replace the Fort
8,àmọjà déké áyé,The audience go the pleasure
9,ọ́kọ́ Ájì íbāálú,The bill Lake the ownership


In [11]:
# Write the current synthetic_df to CSV without the index column.
synthetic_df.to_csv("synthetic_igala_english2.csv", index=False)