In [1]:
import pandas as pd
import random

# Load the data
data = pd.read_csv('igala_updated_with_pos.csv')

# --- Helper Functions ---


def get_random_word(df, pos_tag):
    words = df[df["POS"].str.contains(pos_tag, na=False)]["Igala"].tolist()
    if words:
        return random.choice(words)
    else:
        return None


def get_english_translation(df, igala_word):
    translation = df[df["Igala"] == igala_word]["English"].tolist()
    if translation:
        return translation[0]
    else:
        return None


# --- Generate Questions ---
num_samples = 1000
synthetic_data = []

interrogatives = ["who", "what", "where", "when", "why", "how"]

for _ in range(num_samples):
    interrogative = random.choice(interrogatives)
    noun = get_random_word(data, r"\bNN\b")  # Noun
    verb = get_random_word(data, r"\bVB\b")  # Verb
    adjective = get_random_word(data, r"\bJJ\b")  # Adjective (optional)

    # Ensure valid words were found
    if noun and verb:
        # Form the question
        if adjective:
            igala_phrase = f"{interrogative} {noun} {verb} {adjective}?"
            english_phrase = f"{interrogative.capitalize()} does the {get_english_translation(data, noun)} {get_english_translation(data, verb)} {get_english_translation(data, adjective)}?"
        else:
            igala_phrase = f"{interrogative} {noun} {verb}?"
            english_phrase = f"{interrogative.capitalize()} does the {get_english_translation(data, noun)} {get_english_translation(data, verb)}?"

        synthetic_data.append([igala_phrase, english_phrase])

# Create DataFrame from the generated questions
synthetic_df = pd.DataFrame(synthetic_data, columns=["Igala", "English"])

# Display a few sample questions
synthetic_df.head(10)

Unnamed: 0,Igala,English
0,why àbímọ́tọ gbà káálọ́?,Why does the children give hypothetical?
1,how údūdẹ́lè já ágọ́?,How does the standard grate coin?
2,where ọ̀lùgbè là lọ́lá?,Where does the suspense buy efficient?
3,who àwúyà D’ẹ́jú ájọ?,Who does the waste Look global?
4,why íbe já ígbélé?,Why does the mind grate ancient?
5,where le ché hì?,Where does the somehow dealing cook?
6,when ùkpẹ̄ lìá káà?,When does the chapter come occasional?
7,when du lí dẹ́gbà?,When does the take see likely?
8,who àgbà m̀ẹ́jú àñọ́lá?,Who does the chin identify physical?
9,where ùfẹ̀dọ̀ gbá ẹ́já?,Where does the affection built military?


In [4]:
import pandas as pd
import random

# Load the data
data = pd.read_csv('igala_updated_with_pos.csv')

# --- Helper Functions ---


def get_random_word(df, pos_tag):
    words = df[df["POS"].str.contains(pos_tag, na=False)]["Igala"].tolist()
    if words:
        return random.choice(words)
    else:
        return None


def get_english_translation(df, igala_word):
    translation = df[df["Igala"] == igala_word]["English"].tolist()
    if translation:
        return translation[0]
    else:
        return None


# --- Generate Questions ---
num_samples = 1000
synthetic_data = []

# Interrogatives list (kept in English)
interrogatives = ["who", "what", "where", "when", "why", "how"]

for _ in range(num_samples):
    interrogative = random.choice(interrogatives)
    noun = get_random_word(data, r"\bNN\b")  # Noun
    verb = get_random_word(data, r"\bVB\b")  # Verb
    adjective = get_random_word(data, r"\bJJ\b")  # Adjective (optional)

    # Ensure valid words were found
    if noun and verb:
        # Form the Igala question structure
        if adjective:
            igala_phrase = f"{noun} {verb} {adjective}?"
            english_phrase = f"{interrogative} does the {get_english_translation(data, noun)} {get_english_translation(data, verb)} {get_english_translation(data, adjective)}?"
        else:
            igala_phrase = f"{noun} {verb}?"
            english_phrase = f"{interrogative} does the {get_english_translation(data, noun)} {get_english_translation(data, verb)}?"

        # Combine interrogative and question structure
        synthetic_data.append(
            [f"{interrogative} {igala_phrase}", english_phrase])

# Create DataFrame from the generated questions
synthetic_df = pd.DataFrame(synthetic_data, columns=["Igala", "English"])

# Display a few sample questions
synthetic_df.head(10)

Unnamed: 0,Igala,English
0,what ọ́maye kójì lógwọ́?,what does the sister replace infectious?
1,when ítélénì lìá n’ùyọ̀?,when does the tray come unhappy?
2,who ọ̀ọ́nà chí ẹ́nẹ́-káà?,who does the canal appoint individual?
3,where íkwúù lìá òfeje-í?,where does the line arrive recent?
4,who ẹ̀bẹ́lẹ́ abalẹ d’éjìjì?,who does the chip be nervous?
5,when kpẹ́jà che òbògo?,when does the fishing is stupid?
6,who ìtíchà enẹ Báíbùlù?,who does the instructor have Bible?
7,who ẹ̀gwélā dágba ọ̀gbọ́lọ́?,who does the nineteen may be crucial?
8,what áji jẹ́ fọ̀?,what does the stream convenient cripple?
9,when Bàbá d'ùbí bibi?,when does the daddy participate undesirable?


In [5]:
import pandas as pd
import random

# Load the data
data = pd.read_csv('igala_updated_with_pos.csv')

# --- Helper Functions ---


def get_random_word(df, pos_tag):
    words = df[df["POS"].str.contains(pos_tag, na=False)]["Igala"].tolist()
    if words:
        return random.choice(words)
    else:
        return None


def get_english_translation(df, igala_word):
    translation = df[df["Igala"] == igala_word]["English"].tolist()
    if translation:
        return translation[0]
    else:
        return None


# --- Extract Interrogatives ---
interrogative_tags = ["WP", "WRB", "WDT"]
interrogatives_df = data[data["POS"].isin(interrogative_tags)]

# --- Generate Questions ---
num_samples = 1000
synthetic_data = []

for _ in range(num_samples):
    interrogative = get_random_word(
        interrogatives_df, "|".join(interrogative_tags))

    noun = get_random_word(data, r"\bNN\b")  # Noun
    verb = get_random_word(data, r"\bVB\b")  # Verb
    adjective = get_random_word(data, r"\bJJ\b")  # Adjective (optional)

    # Ensure valid words were found
    if interrogative and noun and verb:
        # Construct Igala phrase
        if adjective:
            igala_phrase = f"{interrogative} {noun} {verb} {adjective}?"
        else:
            igala_phrase = f"{interrogative} {noun} {verb}?"

        # Construct English phrase
        if adjective:
            english_phrase = f"{get_english_translation(data, interrogative)} does the {get_english_translation(data, noun)} {get_english_translation(data, verb)} the {get_english_translation(data, adjective)}?"
        else:
            english_phrase = f"{get_english_translation(data, interrogative)} does the {get_english_translation(data, noun)} {get_english_translation(data, verb)}?"

        # Append to synthetic data
        synthetic_data.append([igala_phrase, english_phrase])

# Create DataFrame from the generated questions
synthetic_df = pd.DataFrame(synthetic_data, columns=["Igala", "English"])

# Display a few sample questions
synthetic_df.head(10)

Unnamed: 0,Igala,English
0,kù tọ́ dú dàbì neke?,who does the resultant go back the can?
1,énẹ́ iji-úná néjú neke?,which does the log believe the can?
2,úgbò Ọ́wọ́ gbà jẹjú?,Where does the Hand give the assemble?
3,ugbo áhímú lí éjénẹ́ẹ̀?,where does the psychiatric see the popular?
4,énẹ́ gbómù enẹ jálíí?,which does the respond have the obviously?
5,ugbo wúùùlù ñọ́rú gbẹ́jú?,where does the dizzy settle the fraudulent?
6,kéee gbà ñọ́rú mà?,why does the give settle the they?
7,kéee hìẹ̀dọ̀ wánẹ́ẹ̀ bàkú gbẹ́gā?,why does the calm must be the dynamic?
8,kalí búla tā ẹ̀nyọ̀?,Which does the crash impose the good?
9,ugbo n’úká gbà ẹ̀gwejì?,where does the attack give the Double?


In [7]:
import pandas as pd
import random

# Load the data
data = pd.read_csv('igala_updated_with_pos.csv')

# Define themes and corresponding word filters
themes = {
    "Family": {"nouns": ["NN", "NNS"], "verbs": ["VB", "VBP"], "adjectives": ["JJ"]},
    "Nature": {"nouns": ["NN", "NNS"], "verbs": ["VB", "VBP"], "adjectives": ["JJ"]},
    "Daily Activities": {"nouns": ["NN", "NNS"], "verbs": ["VB", "VBP"], "adjectives": ["JJ"]},
    "Greetings": {"nouns": ["NN"], "verbs": ["VB", "VBP"], "adjectives": ["JJ"]},
    "Classroom": {"nouns": ["NN", "NNS"], "verbs": ["VB", "VBP"], "adjectives": ["JJ"]},
    "Market": {"nouns": ["NN", "NNS"], "verbs": ["VB", "VBP"], "adjectives": ["JJ"]}
}


def get_random_word(df, pos_tag):
    words = df[df["POS"].str.contains(pos_tag, na=False)]["Igala"].tolist()
    if words:
        return random.choice(words)
    else:
        return None


def get_english_translation(df, igala_word):
    translation = df[df["Igala"] == igala_word]["English"].tolist()
    if translation:
        return translation[0]
    else:
        return None


# Choose a theme
# You can change this to "Nature", "Daily Activities", etc.
selected_theme = "Greetings"

# Filter words based on the selected theme
noun_pos_tags = themes[selected_theme]["nouns"]
verb_pos_tags = themes[selected_theme]["verbs"]
adjective_pos_tags = themes[selected_theme]["adjectives"]

# Generate phrases for the selected theme
num_samples = 10
synthetic_data = []

for _ in range(num_samples):
    interrogative = get_random_word(data, r"\b(WP|WRB|WDT)\b")
    noun = get_random_word(data, "|".join(noun_pos_tags))
    verb = get_random_word(data, "|".join(verb_pos_tags))
    adjective = get_random_word(data, "|".join(adjective_pos_tags))

    if interrogative and noun and verb:
        if adjective:
            igala_phrase = f"{interrogative} {noun} {verb} {adjective}?"
        else:
            igala_phrase = f"{interrogative} {noun} {verb}?"

        english_phrase = f"{get_english_translation(data, interrogative)} does the {get_english_translation(data, noun)} {get_english_translation(data, verb)}"
        if adjective:
            english_phrase += f" the {get_english_translation(data, adjective)}?"
        else:
            english_phrase += "?"

        synthetic_data.append([igala_phrase, english_phrase])

# Create DataFrame from the generated questions
synthetic_df = pd.DataFrame(synthetic_data, columns=["Igala", "English"])

# Display a few sample questions
synthetic_df.head(10)

  words = df[df["POS"].str.contains(pos_tag, na=False)]["Igala"].tolist()


Unnamed: 0,Igala,English
0,ugbo úgbá kọ̀ káálọ́?,where does the basin opposed the hypothetical?
1,kéee lù lèkwú únyí?,why does the smell will die the house?
2,kéee kpọ́tii ché ágbojì?,why does the thick made the stronger?
3,ugbo Ẹ̀pìlì kpégā ígbélẹ́?,where does the April reminded the historical?
4,ábú ágbee rá ọjó?,how does the injury run the seasonal?
5,kálí gwúné kẹ̀dọ̀nó ìgbẹ̀lẹ́ ?,which does the accuse hoping the young girl?
6,kù úñà nẹ d’éjìjì?,who does the seat own the nervous?
7,édú ch’ókpò égbánẹ́ẹ̀ jẹjú?,Function does the afraid sweeping the assemble?
8,édú ẹnẹ́káàlù d’ọ́ d’éjìjì?,Function does the speaker embodied the nervous?
9,kéee ùkpẹ̄ jẹ́nyú jó?,why does the chapter assure the burn?


In [8]:
pip install spacy

Collecting spacy
  Downloading spacy-3.7.6-cp310-cp310-win_amd64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.10-cp310-cp310-win_amd64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.8-cp310-cp310-win_amd64.whl.metadata (8.6 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp310-cp310-win_amd64.whl.metadata (2.2 kB)
Collecting thinc<8.3.0,>=8.2.2 (from spacy)
  Downloading thinc-8.2.5-cp310-cp310-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downloading srsly-2.4.8-cp310-cp310-win_amd64.

In [11]:
python -m spacy download en_core_web_sm


SyntaxError: invalid syntax (1553972540.py, line 1)

In [12]:
import spacy
import random
import pandas as pd

# Load the spaCy English model for parsing and tagging
nlp = spacy.load('en_core_web_sm')

# Load your dataset
data = pd.read_csv('igala_updated_with_pos.csv')

# Helper function to get words based on POS


def get_word_by_pos(df, pos):
    words = df[df["POS"].str.contains(pos, na=False)]["Igala"].tolist()
    if words:
        return random.choice(words)
    else:
        return None

# Generate sentences using templates and POS information


def generate_sentence(template, words_dict):
    doc = nlp(template)
    sentence = []

    for token in doc:
        if token.pos_ in words_dict:
            word = get_word_by_pos(data, words_dict[token.pos_])
            if word:
                sentence.append(word)
            else:
                sentence.append(token.text)
        else:
            sentence.append(token.text)

    return " ".join(sentence)


# Define templates and corresponding POS tags
templates = [
    "What does the NOUN VERB?",
    "Who ADJ the NOUN?",
    "Where does the NOUN VERB ADJ?"
]

# Mapping POS tags in the template to the dataset POS tags
words_dict = {
    'NOUN': 'NN',
    'VERB': 'VB',
    'ADJ': 'JJ'
}

# Generate synthetic data
num_samples = 10
synthetic_data = []

for _ in range(num_samples):
    template = random.choice(templates)
    igala_sentence = generate_sentence(template, words_dict)

    # Here, we assume a similar template-based approach for English translation
    # You may need to translate Igala words to English after generation
    english_translation = "Translate the sentence to English here"

    synthetic_data.append([igala_sentence, english_translation])

# Create DataFrame from the generated questions
synthetic_df = pd.DataFrame(synthetic_data, columns=["Igala", "English"])

# Display a few samples
synthetic_df.head(10)

Unnamed: 0,Igala,English
0,Who ADJ the NOUN ?,Translate the sentence to English here
1,Where nẹ́ the NOUN VERB ADJ ?,Translate the sentence to English here
2,What does the ọ́kọ àkpàtì ?,Translate the sentence to English here
3,What does the ábọ́-ọ́gwù ìchèkpúlù ?,Translate the sentence to English here
4,What does the édìbò búla ?,Translate the sentence to English here
5,Who ADJ the NOUN ?,Translate the sentence to English here
6,What does the ánẹ́ẹ̀ ígbélí ?,Translate the sentence to English here
7,Where gwá the NOUN VERB ADJ ?,Translate the sentence to English here
8,Where lé the NOUN VERB ADJ ?,Translate the sentence to English here
9,What does the éfúù àmẹ̀ ?,Translate the sentence to English here


In [13]:
import spacy
import random
import pandas as pd

# Load the spaCy English model for parsing and tagging
nlp = spacy.load('en_core_web_sm')

# Load your dataset
data = pd.read_csv('igala_updated_with_pos.csv')

# Helper function to get words based on POS


def get_word_by_pos(df, pos):
    words = df[df["POS"].str.contains(pos, na=False)]["Igala"].tolist()
    if words:
        return random.choice(words)
    else:
        return None

# Generate sentences using templates and POS information


def generate_sentence(template, words_dict):
    doc = nlp(template)
    sentence = []

    for token in doc:
        if token.text.upper() in words_dict:
            word = get_word_by_pos(data, words_dict[token.text.upper()])
            if word:
                sentence.append(word)
            else:
                sentence.append(token.text)
        else:
            sentence.append(token.text)

    return " ".join(sentence)


# Define templates and corresponding POS tags
templates = [
    "What does the NOUN VERB?",
    "Who ADJ the NOUN?",
    "Where does the NOUN VERB ADJ?"
]

# Mapping POS tags in the template to the dataset POS tags
words_dict = {
    'NOUN': 'NN',
    'VERB': 'VB',
    'ADJ': 'JJ'
}

# Generate synthetic data
num_samples = 10
synthetic_data = []

for _ in range(num_samples):
    template = random.choice(templates)
    igala_sentence = generate_sentence(template, words_dict)

    # Here, you may need to translate Igala words to English after generation
    english_translation = "Translate the sentence to English here"

    synthetic_data.append([igala_sentence, english_translation])

# Create DataFrame from the generated questions
synthetic_df = pd.DataFrame(synthetic_data, columns=["Igala", "English"])

# Display a few samples
synthetic_df.head(10)

Unnamed: 0,Igala,English
0,Who lotoo the ójáanẹ̀ ?,Translate the sentence to English here
1,Who àbígbo the tílílí ?,Translate the sentence to English here
2,Where does the ímótò-ejọ́mi dúgbo jì ?,Translate the sentence to English here
3,What does the égbìgbì kà ?,Translate the sentence to English here
4,Where does the éjú chẹ́ gbó ?,Translate the sentence to English here
5,What does the àmíbāálú che ?,Translate the sentence to English here
6,Where does the áchíkwúù ch’ámínáà étító ?,Translate the sentence to English here
7,Who ọ́gbọ́n the ónúgò ?,Translate the sentence to English here
8,Where does the ẹ̀kọ́-ùmà ténẹ́ ẹ̀kpíkpà ?,Translate the sentence to English here
9,Where does the ìgbẹ̀lẹ́ gbà lìlẹ̀ ?,Translate the sentence to English here


In [14]:
import spacy
import random
import pandas as pd

# Load the spaCy English model for parsing and tagging
nlp = spacy.load('en_core_web_sm')

# Load your dataset
data = pd.read_csv('igala_updated_with_pos.csv')

# Helper function to get words based on POS


def get_word_by_pos(df, pos, context=None):
    words = df[df["POS"].str.contains(pos, na=False)]["Igala"].tolist()
    # Filter further by context if provided
    if context:
        words = [word for word in words if word in context]
    if words:
        return random.choice(words)
    else:
        return None

# Generate sentences using templates and POS information


def generate_sentence(template, words_dict):
    doc = nlp(template)
    sentence = []
    context = None

    for token in doc:
        if token.text.upper() in words_dict:
            word = get_word_by_pos(
                data, words_dict[token.text.upper()], context)
            if word:
                sentence.append(word)
                context = word  # Update context
            else:
                sentence.append(token.text)
        else:
            sentence.append(token.text)

    return " ".join(sentence)


# Define templates and corresponding POS tags
templates = [
    "What does the NOUN VERB?",
    "Who ADJ the NOUN?",
    "Where does the NOUN VERB ADJ?"
]

# Mapping POS tags in the template to the dataset POS tags
words_dict = {
    'NOUN': 'NN',
    'VERB': 'VB',
    'ADJ': 'JJ'
}

# Generate synthetic data
num_samples = 10
synthetic_data = []

for _ in range(num_samples):
    template = random.choice(templates)
    igala_sentence = generate_sentence(template, words_dict)

    # Translate the Igala sentence into English
    # Update with actual translation logic
    english_translation = "Translate the sentence to English here"

    synthetic_data.append([igala_sentence, english_translation])

# Create DataFrame from the generated questions
synthetic_df = pd.DataFrame(synthetic_data, columns=["Igala", "English"])

# Display a few samples
synthetic_df.head(10)

Unnamed: 0,Igala,English
0,Who mà the mà ?,Translate the sentence to English here
1,Who òfeje-í the NOUN ?,Translate the sentence to English here
2,Who ọ́kọ́ the ọ́kọ́ ?,Translate the sentence to English here
3,What does the d’ọ́jọ́ jó ?,Translate the sentence to English here
4,What does the ódú dú ?,Translate the sentence to English here
5,Where does the áyé VERB ADJ ?,Translate the sentence to English here
6,Who kpákáá the kpá ?,Translate the sentence to English here
7,Where does the ọ́màjùwẹ mà mà ?,Translate the sentence to English here
8,Where does the ẹ́gẹ́ VERB ADJ ?,Translate the sentence to English here
9,Who ọ́jọ́-ọ́jọ́ the jó ?,Translate the sentence to English here


In [15]:
import spacy
import random
import pandas as pd

# Load the spaCy English model for parsing and tagging
nlp = spacy.load('en_core_web_sm')

# Load your dataset
data = pd.read_csv('igala_updated_with_pos.csv')

# Helper function to get words based on POS


def get_word_by_pos(df, pos):
    words = df[df["POS"].str.contains(pos, na=False)]["Igala"].tolist()
    if words:
        return random.choice(words)
    else:
        return None

# Generate sentences using templates and POS information


def generate_sentence(template, words_dict):
    doc = nlp(template)
    sentence = []

    for token in doc:
        token_upper = token.text.upper()
        if token_upper in words_dict:
            word = get_word_by_pos(data, words_dict[token_upper])
            if word and word not in sentence:  # Ensure the word is unique within the sentence
                sentence.append(word)
            else:
                sentence.append(f"<missing {token_upper}>")
        else:
            sentence.append(token.text)

    return " ".join(sentence)


# Define templates and corresponding POS tags
templates = [
    "What does the NOUN VERB?",
    "Who ADJ the NOUN?",
    "Where does the NOUN VERB ADJ?"
]

# Mapping POS tags in the template to the dataset POS tags
words_dict = {
    'NOUN': 'NN',
    'VERB': 'VB',
    'ADJ': 'JJ'
}

# Generate synthetic data
num_samples = 10
synthetic_data = []

for _ in range(num_samples):
    template = random.choice(templates)
    igala_sentence = generate_sentence(template, words_dict)

    # Placeholder for English translation (to be implemented)
    english_translation = "Translate the sentence to English here"

    synthetic_data.append([igala_sentence, english_translation])

# Create DataFrame from the generated questions
synthetic_df = pd.DataFrame(synthetic_data, columns=["Igala", "English"])

# Display a few samples
synthetic_df.head(10)


Unnamed: 0,Igala,English
0,Who kẹ̀chùù the gbéjú ?,Translate the sentence to English here
1,What does the gbà gbá ?,Translate the sentence to English here
2,Who kpé the àwọ̀nyí ?,Translate the sentence to English here
3,What does the ígẹ̀dẹ́ kà ?,Translate the sentence to English here
4,Where does the jálí já ẹ̀gwù ?,Translate the sentence to English here
5,Who ẹ́kẹ́jì the li ?,Translate the sentence to English here
6,What does the k’úkwú jẹ́ dú ?,Translate the sentence to English here
7,What does the àmómi égbánẹ́ẹ̀ ?,Translate the sentence to English here
8,Who bíẹ́nẹ́-bíẹ́nẹ the àmìlẹ̀ ?,Translate the sentence to English here
9,Where does the bílú ùgbéjú gbẹ́gā ?,Translate the sentence to English here


In [16]:
import spacy
import random
import pandas as pd

# Load the spaCy English model for parsing and tagging
nlp = spacy.load('en_core_web_sm')

# Load your dataset
data = pd.read_csv('igala_updated_with_pos.csv')

# Helper function to get words based on POS


def get_word_by_pos(df, pos):
    words = df[df["POS"].str.contains(pos, na=False)]
    if not words.empty:
        return words.sample(1).iloc[0]
    return None

# Generate sentences using templates and POS information


def generate_sentence(template, words_dict):
    doc = nlp(template)
    igala_sentence = []
    english_sentence = []

    for token in doc:
        token_upper = token.text.upper()
        if token_upper in words_dict:
            word_data = get_word_by_pos(data, words_dict[token_upper])
            if word_data is not None:
                igala_word = word_data['Igala']
                english_word = word_data['English']
                igala_sentence.append(igala_word)
                english_sentence.append(english_word)
            else:
                igala_sentence.append(f"<missing {token_upper}>")
                english_sentence.append(f"<missing {token_upper}>")
        else:
            igala_sentence.append(token.text)
            english_sentence.append(token.text)

    return " ".join(igala_sentence), " ".join(english_sentence)


# Define templates and corresponding POS tags
templates = [
    "What does the NOUN VERB?",
    "Who ADJ the NOUN?",
    "Where does the NOUN VERB ADJ?"
]

# Mapping POS tags in the template to the dataset POS tags
words_dict = {
    'NOUN': 'NN',
    'VERB': 'VB',
    'ADJ': 'JJ'
}

# Generate synthetic data
num_samples = 10
synthetic_data = []

for _ in range(num_samples):
    template = random.choice(templates)
    igala_sentence, english_translation = generate_sentence(
        template, words_dict)

    synthetic_data.append([igala_sentence, english_translation])

# Create DataFrame from the generated questions
synthetic_df = pd.DataFrame(synthetic_data, columns=["Igala", "English"])

# Display a few samples
synthetic_df.head(10)

Unnamed: 0,Igala,English
0,Who bílẹ́wá the jẹ́ ?,Who modern the agree ?
1,Where does the bà d'ùbí dárú ?,Where does the curve participated separate ?
2,Where does the ẹgwa èbíè nẹ́ Ígáláà ?,Where does the seventeen constructing indigeno...
3,What does the ọ́kọ nẹ́ ?,What does the husband hiring ?
4,Where does the àdárú kpíjèlè gbẹ́gbẹ́dẹ́ ?,Where does the separation freezing customary ?
5,Where does the àbímọ́tọ d'ùbí jí ?,Where does the family participate unified ?
6,Where does the Ọ́jọ́ chánẹ́ éfù ?,Where does the God started internal ?
7,Who ìgbẹ̀lẹ́ the òchìkapa ?,Who young girl the rice ?
8,Where does the ójáanẹ́ tẹ̄ e̩ko̩ ?,Where does the State defined full ?
9,Who kàkpọ́ọ́ the ñwu ?,Who substantial the itch ?


In [19]:
import pandas as pd
import random

# Load the data
data = pd.read_csv('igala_updated_with_pos.csv')

# --- Helper Functions ---


def get_random_word(df, pos_tag):
    """Gets a random word from the DataFrame matching the given POS tag."""
    words = df[df["POS"].str.contains(pos_tag, na=False)]["Igala"].tolist()
    if words:
        return random.choice(words)
    else:
        return None


def get_english_translation(df, igala_word):
    """Gets the English translation of a given Igala word."""
    translation = df[df["Igala"] == igala_word]["English"].tolist()
    if translation:
        return translation[0]
    else:
        return None


# --- Generate Story Elements ---
characters = [get_random_word(data, r"\bNN\b")
              for _ in range(2)]  # Nouns for characters
setting = get_random_word(data, r"\bNN\b")  # Noun for setting
objects = [get_random_word(data, r"\bNN\b")
           for _ in range(2)]  # Nouns for objects
actions = [get_random_word(data, r"\bVB\b")
           for _ in range(3)]  # Verbs for actions
descriptions = [get_random_word(data, r"\bJJ\b")
                for _ in range(2)]  # Adjectives for descriptions

# --- Construct Story ---
story = []

# Beginning
story.append(f"{characters[0]} gbè {setting}.")
story.append(f"Ó ní {descriptions[0]} {objects[0]}.")

# Middle
story.append(f"{characters[1]} {actions[0]} {objects[1]}.")
story.append(f"Ọ̀jọ̀ má {actions[1]}.")

# End
story.append(f"Ṣùgbọ́n {characters[0]} {actions[2]} {characters[1]}.")
story.append(f"Ìtàn náà ọ̀pè {descriptions[1]} {setting}.")

# Construct English translation
english_story = []

english_story.append(
    f"The {get_english_translation(data, characters[0])} went to the {get_english_translation(data, setting)}.")
english_story.append(
    f"There was a {get_english_translation(data, descriptions[0])} {get_english_translation(data, objects[0])}.")
english_story.append(
    f"The {get_english_translation(data, characters[1])} {get_english_translation(data, actions[0])} the {get_english_translation(data, objects[1])}.")
english_story.append(
    f"The rain did {get_english_translation(data, actions[1])}.")
english_story.append(
    f"But the {get_english_translation(data, characters[0])} {get_english_translation(data, actions[2])} the {get_english_translation(data, characters[1])}.")
english_story.append(
    f"The story ended with a {get_english_translation(data, descriptions[1])} {get_english_translation(data, setting)}.")

# Combine Igala and English story
igala_story = " ".join(story)
english_story_text = " ".join(english_story)

# Output the stories
igala_story, english_story_text

('gbà gbè íkọ́líkọ́ . Ó ní uná ákpata. dẹ̀ yọ álí. Ọ̀jọ̀ má dágba. Ṣùgbọ́n gbà kà dẹ̀. Ìtàn náà ọ̀pè ágọ́fó íkọ́líkọ́ .',
 'The give went to the horror. There was a electric bridge. The configuration save the custom. The rain did may be. But the give say the configuration. The story ended with a formative horror.')