# 01. Prepare the source knowledge base

```bash
sudo apt install enchant-2
```

In [1]:
import os

import json
import pandas as pd
from tqdm.auto import tqdm

import enchant

## Functions

In [2]:
def clean_text(text: str) -> str:
    # only clean the text is it is not empty
    if not text:
        return text
        
    d = enchant.Dict("en_US")
    text = text.replace(".", ". ").replace(",", ", ").replace("  ", " ")

    words = text.split()
    normalized_words = []
    for word in words:
        postfix = ""
        if word[-1] == ".":
            postfix = "."
            word = word[:-1]
        elif word[-1] == ",":
            postfix = ","
            word = word[:-1]

        final_word = word
        if (len(word) > 1):
            if not d.check(word):
                suggestions = d.suggest(word)
                if suggestions:
                    if suggestions[0].replace(" ", "") == word:
                        final_word = suggestions[0]
    
        normalized_words.append(final_word+postfix)

    return " ".join(normalized_words)

# test the function
# #text = "This is a sentence with concatenatedwords."
# text = "[music]In this mini-lecture, we will consider multiagent environmentswhere each agent has to consideractions by other agents and their effects. We will consider a special case, a competitive environment where agents goals are in a conflict.Such problems are called adversarial search problems or games.We will first consider deterministic observable environmentswhere agents act alternately and in which the utility valuesat the end of the game are always equal and opposite.If one agent wins the game of chess, another loses.We will start from considering a standard examplein the game series, the prisoners dilemma. This simple example shows the complexity of a multiagent environment.The example is particularly interestingbecause it demonstrates tendencies in corporate right behavior.This game was introduced by two American mathematicians in the 50s, Merrill Flood and Melvin Dresher. The rules of the game are two members of a criminal organization are arrested and imprisoned.Each prison is in a solitary cell with no means of communicating with the other.The prosecutors lack sufficient evidenceto convict the pair on principal charge, but they have enough to convict both on a lesser charge.The prosecutors offer each prisoner a bargain.Each prisoner is given the opportunity either to betray the otherby testifying that the other committed the crimeor to cooperate with the other by remaining silent.The possible outcomes are:If A and B prisoners each betrays the other, each of them serves two years in prison. If A betrays B but B remains silent, then A goes free and B will serve three years in prison.If A remains silent, but B betrays A,A will serve three years in prison and B goes free.If A and B both remain silent, both of them will serve only one year in prisonon the lesser charge. The optimal strategy is to betray the partnerbecause overall it offers a greater reward than cooperating with them. Interestingly, in the reality,humans display a systematic bias\\ towards cooperative behaviorin this and all the similar scenarios.This has been demonstrated by many studies.Another example of a multi-agent environment is a RoboCup,a soccer tournament for robots. RoboCup was introduced by Professor Hiroaki Kitana from Japan,more than 20 years ago. Im privileged to know Kitana-sensei,it is how professors are called in Japan personally.Generally in AI game research,the focus is on abstract games because it is easier to represent the game states.Physical games are rarely in the focusbecause it is difficult to represent all the states, its a more complex description,and the range of possible actions is wider. RoboCup is an example of a dynamic environment with many agents.Some of them cooperate as members of the same team with a shared goal to win the game and other members of different teams have competing goals.The robot world cup initiative is an attempt to force AI and intelligent robotics research by providing a standard problem where a wide range of technologies can be integrated and examined.Design principles of autonomous agents, multiagent collaboration, strategy acquisition, real-time reasoning, robotics, and sensor fusion.I recommend watching a video of earlier Robocup. The robots are really clumsy. However, this initiative has boosted the research in this areaand the target now is for a robot teamto beat the best human team by 2030.In this mini-lecture, we looked at multi-agent competitive environments or games.First, we considered a deterministic two-agents environment,the well-known prisoners dilemma.Then we considered a more complex dynamic situationwith many agents, robots playing a soccer game. There are many more examples of significant advancesin the game series and practice. More than 20 years ago, Deep Blue beat Russian chess master Garry Kasparov. 10 years ago, Watson won in Jeopardy. In my view, the most impressive exampleis for an artificial agent to win in the game of Go. ves.Go has long been viewed as the most challenging of classic gamesbecause of its enormous search space and the difficulty in evaluating both positions and moIn 2016, the program AlphaGo defeated the human European gold championby five to zero."
# print(text)
# print(clean_text(text))

In [3]:
def create_dataset(dir_path: str) -> pd.DataFrame:
    # Define an empty list to store the document data
    documents = []

    # Loop over all JSON files in the directory
    for filename in tqdm(os.listdir(dir_path)):
        if filename.endswith(".json"):
            # Load the JSON file and extract the relevant fields
            with open(os.path.join(dir_path, filename), "r") as f:
                data = json.load(f)
                course = data.get("course")
                topic = data.get("topic")
                title = data.get("title")
                url = data.get("url")
                transcript = clean_text(data.get("transcript"))
                # Append the document data to the list
                documents.append({"course": course, "topic": topic, "title": title, "url": url, "transcript": transcript})

    # Create a pandas DataFrame from the document data
    return pd.DataFrame(documents)

# # create the dataset
# dir_path = "/workspaces/dsm100-2022-oct/question_answering/data/input/raw/"
# create_dataset(dir_path=dir_path)


## Create Dataset

In [9]:
# create the dataset
dir_path = "/workspaces/dsm100-2022-oct/question_answering/data/input/raw/"
df_source = create_dataset(dir_path=dir_path)

print(df_source.shape)
df_source.head()

  0%|          | 0/141 [00:00<?, ?it/s]

(140, 5)


Unnamed: 0,course,topic,title,url,transcript
0,DSM100-2022-OCT,Module information,Module introduction video,https://learn.london.ac.uk/mod/page/view.php?i...,-Welcome to AI Module. Artificial intelligence...
1,DSM100-2022-OCT,Module information,Meet the team,https://learn.london.ac.uk/mod/page/view.php?i...,[music]-Welcome to the AI module. My name is L...
2,DSM100-2022-OCT,Topic 1: Introduction,Lecture: Introduction to Topic 1,https://learn.london.ac.uk/mod/page/view.php?i...,"Welcome to topic one, Introduction to AI. In t..."
3,DSM100-2022-OCT,Topic 1: Introduction,Lecture 1: Defining AI,https://learn.london.ac.uk/mod/page/view.php?i...,"[music]In this mini lecture, we will talk abou..."
4,DSM100-2022-OCT,Topic 1: Introduction,Lecture 2: Milestones in the history of AI,https://learn.london.ac.uk/mod/page/view.php?i...,In this mini-lecture with a go through the his...


In [10]:
# save the dataset
df_source.to_csv("data/output/transcripts.csv", index=False)