# Extraction Test Generation

This notebook interactively prepares annotated test cases for the extraction system from a text excerpt.

In [41]:
# Inputs for test generation: which language, excerpt, dictionary and variant list to use

language = "English"
text = "MobyDickExcerpt001"
dictionary = "benjihillard"
variants = "languagetool"

# Every input file is read with the same encoding
textEncoding = dictionaryEncoding = variantsEncoding = "utf-16"

# Directory layout, relative to the notebook's working directory
notebooksDir = "../"
excerptsDir = f"{notebooksDir}RawData/ExtractionTextExcerpts/"

In [42]:
import pandas as pd

In [43]:
# Read the selected excerpt and echo it.
# NOTE: `text` is rebound here from the excerpt's name to the excerpt's contents,
# so this cell cannot be re-run without first re-running the parameters cell.
with open(f"{excerptsDir}{language}/{text}.txt", encoding=textEncoding) as excerpt_file:
    text = excerpt_file.read()
print(text)

Call me Ishmael. Some years ago—never mind how long precisely—having
little or no money in my purse, and nothing particular to interest me
on shore, I thought I would sail about a little and see the watery part
of the world. It is a way I have of driving off the spleen and
regulating the circulation.


In [44]:
# Load the dictionary and the variant list for the selected language.
dictionary_df = pd.read_csv(
    f"{notebooksDir}ProcessedData/Dictionaries/{language}/{dictionary}.csv",
    encoding=dictionaryEncoding,
)
variants_df = pd.read_csv(
    f"{notebooksDir}ProcessedData/Variants/{language}/{variants}.csv",
    encoding=variantsEncoding,
)

In [81]:
# Preview the loaded dictionary; the output below shows its ID, Word and Definition columns.
dictionary_df

Unnamed: 0,ID,Word,Definition
0,0,a,The first letter of the English and of many ot...
1,1,a,The name of the sixth tone in the model major ...
2,2,a,"An adjective, commonly called the indefinite a..."
3,3,a,"In each; to or for each; as, ""twenty leagues a..."
4,4,a,In; on; at; by.
...,...,...,...
176043,176043,yupon,Same as Yaupon.
176044,176044,yux,"See Yex, n."
176045,176045,yvel,Evil; ill.
176046,176046,ywar,Aware; wary.


In [97]:
def _utf16_len(s):
    """Return the number of UTF-16 code units needed to encode ``s``."""
    return len(s.encode('utf-16-le')) // 2


def _locate_subwords(text, subwords, start, end):
    """
    Find each subword, in order and without overlap, inside ``text[start:end]``.

    :param text: The full text being processed.
    :param subwords: Subwords to locate, in the order they should appear.
    :param start: String index where the search window begins.
    :param end: String index where the search window ends (exclusive).
    :return: List of (subword, string index) pairs, or None if any subword is missing.
    """
    located = []
    position = start
    for subword in subwords:
        found = text.find(subword, position, end)
        if found == -1:
            return None
        located.append((subword, found))
        position = found + len(subword)
    return located


def _choose_definition(definitions, prompt):
    """
    List the candidate definitions and keep prompting until a valid number is entered.

    :param definitions: Non-empty DataFrame slice with 'Definition' and 'ID' columns.
    :param prompt: Prompt shown to the user when asking for the definition number.
    :return: Tuple (definition text, definition ID) of the chosen row.
    """
    for number, row in enumerate(definitions.itertuples(), start=1):
        print(f"{number}. {row.Definition}")

    while True:
        answer = input(prompt).strip()
        try:
            choice = int(answer)
        except ValueError:
            choice = 0  # falls through to the range check below
        # Reject 0 and negatives explicitly: iloc would otherwise wrap around
        # and silently pick a definition from the end of the list.
        if 1 <= choice <= len(definitions):
            chosen = definitions.iloc[choice - 1]
            return chosen.Definition, chosen.ID
        print(f"Invalid selection. Please enter a number from 1 to {len(definitions)}.")


def process_text(text, dictionary_df, roots_df):
    """
    Interactively annotate the words of a text with roots and dictionary definitions.

    The text is split on whitespace. For every word the user may skip it, accept it as is,
    or enter an override that decomposes it into space-separated subwords; the subwords must
    occur, in order, inside the current word. For every subword the user confirms or replaces
    the root and picks a definition for the subword and, when it differs, for the root.

    Start and stop indices are reported in UTF-16 code units relative to the start of ``text``.
    For text without characters outside the Basic Multilingual Plane these are identical to
    Python string indices.

    :param text: The text to be processed.
    :param dictionary_df: DataFrame containing word definitions ('Word', 'Definition', 'ID' columns).
    :param roots_df: DataFrame mapping variants to roots ('Variant', 'Root' columns).
    :return: List of dicts, one per processed subword, with the keys 'Word', 'Start Index',
             'Stop Index' (inclusive), 'Variant Definition', 'Variant Definition ID', 'Root',
             'Root Definition' and 'Root Definition ID'. Skipped words produce no entry.
    """
    processed_words = []
    cursor = 0  # String index just past the last word handled

    for word in text.split():
        # Words come from text.split(), so only whitespace lies between the cursor and the
        # next word; find() therefore returns that word's exact position.
        word_start = text.find(word, cursor)
        word_end = word_start + len(word)

        located = None  # Stays None when the user skips the word
        while True:
            override = input(f"Current word: '{word}'. Enter override, type 'skip' to skip, or press enter to continue: ").strip()
            if override.lower() == 'skip':
                located = None
                break
            subwords = override.split() if override else [word]
            located = _locate_subwords(text, subwords, word_start, word_end)
            if located is not None:
                break  # Valid subwords provided
            print("Invalid subwords. Please try again.")

        # Always resume after the whole word, whether it was skipped, kept or only partly
        # covered by an override, so later searches cannot match inside earlier text.
        cursor = word_end
        if located is None:
            continue

        for subword, subword_start in located:
            # Convert the Python string position to UTF-16 code units so that start and stop
            # use the same unit even when the text contains non-BMP characters.
            start_index = _utf16_len(text[:subword_start])
            stop_index = start_index + _utf16_len(subword) - 1

            variant_roots = roots_df[roots_df['Variant'] == subword]['Root'].tolist()
            selected_root = variant_roots[0] if variant_roots else None

            # Allow user to provide a custom root or indicate no root
            custom_root = input(f"Enter custom root for '{subword}', type 'none' for no root, or press enter to use '{selected_root or subword}': ").strip()
            if custom_root.lower() == 'none':
                selected_root = None
            else:
                selected_root = custom_root if custom_root else (selected_root or subword)

            # Lookup definitions for the variant in dictionary DataFrame
            selected_variant_definition = None
            selected_variant_definition_id = None
            variant_definitions = dictionary_df[dictionary_df['Word'] == subword]
            if not variant_definitions.empty:
                print(f"Definitions for '{subword}':")
                selected_variant_definition, selected_variant_definition_id = _choose_definition(
                    variant_definitions,
                    f"Select the correct definition for '{subword}' (number): ",
                )

            # Lookup definitions for the root in dictionary DataFrame
            selected_root_definition = None
            selected_root_definition_id = None
            if subword == selected_root:
                # The subword is its own root: reuse the choice instead of asking twice
                selected_root_definition = selected_variant_definition
                selected_root_definition_id = selected_variant_definition_id
            elif selected_root:
                root_definitions = dictionary_df[dictionary_df['Word'] == selected_root]
                if not root_definitions.empty:
                    print(f"Definitions for root '{selected_root}':")
                    selected_root_definition, selected_root_definition_id = _choose_definition(
                        root_definitions,
                        f"Select the correct definition for root '{selected_root}' (number): ",
                    )

            processed_words.append({
                'Word': subword,
                'Start Index': start_index,
                'Stop Index': stop_index,
                'Variant Definition': selected_variant_definition,
                'Variant Definition ID': selected_variant_definition_id,
                'Root': selected_root,
                'Root Definition': selected_root_definition,
                'Root Definition ID': selected_root_definition_id
            })

    return processed_words


In [99]:
# Annotate only a short slice of the excerpt (characters 15-37). The Start/Stop
# indices in the result are relative to this slice, not to the full excerpt.
# The variants table is passed as the roots lookup (roots_df parameter).
ret = process_text(text[15:38], dictionary_df, variants_df)

Current word: '.'. Enter override, type 'skip' to skip, or press enter to continue: skip
Current word: 'Some'. Enter override, type 'skip' to skip, or press enter to continue: 
Enter custom root for 'Some', type 'none' for no root, or press enter to use 'Some': some
Definitions for root 'some':
1. Consisting of a greater or less portion or sum; composed of a quantity or number which is not stated; -- used to express an indefinite quantity or number; as, some wine; some water; some persons.  Used also pronominally; as, I have some.
2. A certain; one; -- indicating a person, thing, event, etc., as not known individually, or designated more specifically; as, some man, that is, some one man.
3. Not much; a little; moderate; as, the censure was to some extent just.
4. About; near; more or less; -- used commonly with numerals, but formerly also with a singular substantive of time or distance; as, a village of some eighty houses; some two or three persons; some hour hence.
5. Considerable in 

In [100]:
# Inspect the annotated records returned by process_text.
ret

[{'Word': 'Some',
  'Start Index': 2,
  'Stop Index': 5,
  'Variant Definition': None,
  'Variant Definition ID': None,
  'Root': 'some',
  'Root Definition': 'Not much; a little; moderate; as, the censure was to some extent just.',
  'Root Definition ID': 144946},
 {'Word': 'years',
  'Start Index': 7,
  'Stop Index': 11,
  'Variant Definition': None,
  'Variant Definition ID': None,
  'Root': 'year',
  'Root Definition': 'The time of the apparent revolution of the sun trough the ecliptic; the period occupied by the earth in making its revolution around the sun, called the astronomical year; also, a period more or less nearly agreeing with this, adopted by various nations as a measure of time, and called the civil year; as, the common lunar year of 354 days, still in use among the Mohammedans; the year of 360 days, etc. In common usage, the year consists of 365 days, and every fourth year (called bissextile, or leap year) of 366 days, a day being added to February on that year, on acc