In [1]:
! pip install datasets transformers[sentencepiece]


[notice] A new release of pip available: 22.1.2 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip




In [2]:
from datasets import load_dataset

squad = load_dataset("squad", split="train")
squad[0]

Downloading builder script:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.67k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

In [15]:
# Shuffles the dataset with a seed to keep consistent
squad_shuffled = squad.shuffle(seed=666)
squad_shuffled[0]

{'id': '5727cc873acd2414000deca9',
 'title': 'Oklahoma',
 'context': 'Oklahoma is the 20th largest state in the United States, covering an area of 69,898 square miles (181,035 km2), with 68,667 square miles (177847 km2) of land and 1,281 square miles (3,188 km2) of water. It is one of six states on the Frontier Strip and lies partly in the Great Plains near the geographical center of the 48 contiguous states. It is bounded on the east by Arkansas and Missouri, on the north by Kansas, on the northwest by Colorado, on the far west by New Mexico, and on the south and near-west by Texas.',
 'question': 'Where does Oklahoma rank by land area?',
 'answers': {'text': ['20th'], 'answer_start': [16]}}

In [16]:
# Do a test & train data set split with 10-90 distribution
dataset = squad.train_test_split(test_size=0.1)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 78839
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 8760
    })
})

In [17]:
# Get only a particular set of values 
indices = [0, 10, 20, 40, 80]
examples = squad.select(indices)
examples[0]

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

In [18]:
# Shuffles them then selects a range of values
sample = squad.shuffle().select(range(5))
sample[0]

{'id': '572e889acb0c0d14000f1266',
 'title': 'Red',
 'context': 'Red is the color at the end of the spectrum of visible light next to orange and opposite violet. Red color has a predominant light wavelength of roughly 620–740 nanometres. Red is one of the additive primary colors of visible light, along with green and blue, which in Red Green Blue (RGB) color systems are combined to create all the colors on a computer monitor or television screen. Red is also one of the subtractive primary colors, along with yellow and blue, of the RYB color space and traditional color wheel used by painters and artists.',
 'question': 'Which colors are combined to make all other colors? ',
 'answers': {'text': ['Red Green Blue'], 'answer_start': [269]}}

In [20]:
# Get all data that has a title starting with the character "L". Filters rows
squad_filtered = squad.filter(lambda x : x["title"].startswith("L"))
squad_filtered[0]

{'id': '56de0fef4396321400ee2583',
 'title': 'Lighting',
 'context': 'Lighting or illumination is the deliberate use of light to achieve a practical or aesthetic effect. Lighting includes the use of both artificial light sources like lamps and light fixtures, as well as natural illumination by capturing daylight. Daylighting (using windows, skylights, or light shelves) is sometimes used as the main source of light during daytime in buildings. This can save energy in place of using artificial lighting, which represents a major component of energy consumption in buildings. Proper lighting can enhance task performance, improve the appearance of an area, or have positive psychological effects on occupants.',
 'question': 'What is used a main source of light for a building during the day?',
 'answers': {'text': ['Daylighting'], 'answer_start': [245]}}

In [21]:
# Renames a column to something else in the features
squad.rename_column("context", "passages")

Dataset({
    features: ['id', 'title', 'passages', 'question', 'answers'],
    num_rows: 87599
})

In [23]:
# Remove particular columns/features from the data
squad.remove_columns(["id", "title"])

Dataset({
    features: ['context', 'question', 'answers'],
    num_rows: 87599
})

In [24]:
squad

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 87599
})

In [26]:
# Will collapse nested columns
squad.flatten()

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start'],
    num_rows: 87599
})

In [29]:
def lowercase_title(example):
    return {"title": example["title"].lower()}

# Will lowercase all titles with a seperate function
squad_lowercase = squad.map(lowercase_title)
# Peek at random sample
squad_lowercase.shuffle(seed=42)["title"][:5]

['egypt',
 'ann_arbor,_michigan',
 'rule_of_law',
 'samurai',
 'group_(mathematics)']

In [43]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_title(example):
    return tokenizer(example["title"])
# Uses mutithreads to batch titles
squad.map(tokenize_title, batched=True, batch_size=500)

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'attention_mask'],
    num_rows: 87599
})

In [44]:
squad_lowercase = squad.map(lowercase_title)
# Peek at random sample
squad_lowercase.shuffle(seed=42)["title"][:10]

['egypt',
 'ann_arbor,_michigan',
 'rule_of_law',
 'samurai',
 'group_(mathematics)',
 'british_empire',
 'dwight_d._eisenhower',
 'near_east',
 'infection',
 'insect']