In [20]:
!unzip raw_data/wiki.zip -d raw_data

Archive:  raw_data/wiki.zip
  inflating: raw_data/articles.json  
  inflating: raw_data/articles.parquet  
  inflating: raw_data/articles.xlsx  


## Load the dataset from the local machine

In [2]:
from datasets import load_dataset

data_files = "raw_data/articles.parquet"

wiki_raw_dataset = load_dataset("parquet", data_files=data_files)

wiki_raw_dataset.column_names

{'train': ['ID',
  'Title',
  'URL',
  'Introduction/Summary',
  'Body',
  'Sections/Headings',
  'References',
  'Categories',
  'Infobox']}

## Reomve the unnecessary columns

In [3]:
wiki_raw_dataset_after_removing = wiki_raw_dataset.remove_columns(["URL",
                                                                   "Introduction/Summary",
                                                                   "Sections/Headings",
                                                                   "References",
                                                                   "Categories",
                                                                   "Infobox"])

In [4]:
wiki_raw_dataset_after_removing["train"][:3]

{'ID': [10, 1, 6],
 'Title': ['Canadian Society of Soil Science',
  "King Arthur's family",
  'Ralitsa Vassileva'],
 'Body': ['Canadian non-profit organization\nCanadian Society of Soil ScienceSociété Canadienne de science du solAbbreviationCSSSFormation1954[1]TypeNon-governmental organizationWebsitecsss.ca\nThe Canadian Society of Soil Science (CSSS) is a non-governmental, non-profit organization for scientists, engineers, technologists, administrators and students involved in professional soil science.[2][3] Its goal is to nurture the discipline of soil science in Canada.\n\n\nAdministration[edit]\nThe Society is administered by a 9-member Council consisting of a President, President-Elect, Past-President, Secretary, Treasurer, two Councillors (Western and Eastern Councillor), Graduate Student Representative, and the Editor of the Canadian Journal of Soil Science. Council meets once annually, at the Annual Meeting, and conducts ongoing e-mail correspondence and business motions throu

## Shuffle the dataset

In [5]:
shuffled_wiki_raw_dataset = wiki_raw_dataset_after_removing["train"].shuffle(seed=42)

shuffled_wiki_raw_dataset[0]

{'ID': 1536,
 'Title': 'San Francisco Open',
 'Body': 'Golf tournament\nThe San Francisco Open was a golf tournament played in the San Francisco area. It was played at a number of different courses in the December to February period. From 1930 to 1941 it was a match-play event before becoming a 72-hole stroke play event from 1942 to 1946.\nA non-PGA Tour event was played in May 1954 at Lake Merced Golf Club and was won by Shelley Mayfield. It was played over 54 holes and had prize money of $10,000.\n\nWinners[edit]\n\n\nYear\nPlayer\nCountry\nVenue\nScore\nTo par\nMarginof victory\nRunner-up\nWinner\'sshare ($)\nRef\n\n\nSan Francisco Open\n\n\n1946\nByron Nelson\n\xa0United States\nOlympic Club\n283\n−1\n9 strokes\n Herman Barron\n3,000\n[1]\n\n\n1944(Dec)\nByron Nelson\n\xa0United States\nHarding Park\n281\n−7\n1 stroke\n Jim Ferrier\n2,666\n[2]\n\n\nSan Francisco Victory Open\n\n\n1944(Jan)\nByron Nelson\n\xa0United States\nHarding Park\n275\n−13\n6 strokes\n Jug McSpaden\n2,400\n[3

## Compute the `Body` length

In [6]:
def compute_body_length(example):
    return {"Body_length": len(example["Body"].split())}

shuffled_wiki_raw_dataset = shuffled_wiki_raw_dataset.map(lambda dataset: {"Body_length": len(dataset["Body"].split())})

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [7]:
shuffled_wiki_raw_dataset[0:3]

{'ID': [1536, 1186, 3372],
 'Title': ['San Francisco Open', 'Rosenborg tram stop', 'Agnes Mukabaranga'],
 'Body': ['Golf tournament\nThe San Francisco Open was a golf tournament played in the San Francisco area. It was played at a number of different courses in the December to February period. From 1930 to 1941 it was a match-play event before becoming a 72-hole stroke play event from 1942 to 1946.\nA non-PGA Tour event was played in May 1954 at Lake Merced Golf Club and was won by Shelley Mayfield. It was played over 54 holes and had prize money of $10,000.\n\nWinners[edit]\n\n\nYear\nPlayer\nCountry\nVenue\nScore\nTo par\nMarginof victory\nRunner-up\nWinner\'sshare ($)\nRef\n\n\nSan Francisco Open\n\n\n1946\nByron Nelson\n\xa0United States\nOlympic Club\n283\n−1\n9 strokes\n Herman Barron\n3,000\n[1]\n\n\n1944(Dec)\nByron Nelson\n\xa0United States\nHarding Park\n281\n−7\n1 stroke\n Jim Ferrier\n2,666\n[2]\n\n\nSan Francisco Victory Open\n\n\n1944(Jan)\nByron Nelson\n\xa0United States

## Remove examples that contains Empty `Body`

In [8]:
shuffled_wiki_raw_dataset = shuffled_wiki_raw_dataset.filter(lambda dataset: dataset["Body_length"] is not 0)

shuffled_wiki_raw_dataset

  shuffled_wiki_raw_dataset = shuffled_wiki_raw_dataset.filter(lambda dataset: dataset["Body_length"] is not 0)


Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Dataset({
    features: ['ID', 'Title', 'Body', 'Body_length'],
    num_rows: 4955
})

## remove article `Body` that contain fewer than 30 words

In [9]:
shuffled_wiki_raw_dataset = shuffled_wiki_raw_dataset.filter(lambda dataset: dataset["Body_length"] >= 30)

shuffled_wiki_raw_dataset

Filter:   0%|          | 0/4955 [00:00<?, ? examples/s]

Dataset({
    features: ['ID', 'Title', 'Body', 'Body_length'],
    num_rows: 3988
})

##  we need to deal with is the presence of HTML character codes in our `Body` and get clean text

In [10]:
import html

clean_wiki = shuffled_wiki_raw_dataset.map(lambda dataset: {"Body": html.unescape(dataset["Body"])}, batched=True)

clean_wiki["Body"][0]

Map:   0%|          | 0/3988 [00:00<?, ? examples/s]

'Golf tournament\nThe San Francisco Open was a golf tournament played in the San Francisco area. It was played at a number of different courses in the December to February period. From 1930 to 1941 it was a match-play event before becoming a 72-hole stroke play event from 1942 to 1946.\nA non-PGA Tour event was played in May 1954 at Lake Merced Golf Club and was won by Shelley Mayfield. It was played over 54 holes and had prize money of $10,000.\n\nWinners[edit]\n\n\nYear\nPlayer\nCountry\nVenue\nScore\nTo par\nMarginof victory\nRunner-up\nWinner\'sshare ($)\nRef\n\n\nSan Francisco Open\n\n\n1946\nByron Nelson\n\xa0United States\nOlympic Club\n283\n−1\n9 strokes\n Herman Barron\n3,000\n[1]\n\n\n1944(Dec)\nByron Nelson\n\xa0United States\nHarding Park\n281\n−7\n1 stroke\n Jim Ferrier\n2,666\n[2]\n\n\nSan Francisco Victory Open\n\n\n1944(Jan)\nByron Nelson\n\xa0United States\nHarding Park\n275\n−13\n6 strokes\n Jug McSpaden\n2,400\n[3]\n\n\nSan Francisco Open\n\n\n1943\nNo tournament\n\n

## From Datasets to DataFrames and back


In [11]:
clean_wiki.set_format("pandas")

clean_wiki_df = clean_wiki[:]

clean_wiki_df.head()

Unnamed: 0,ID,Title,Body,Body_length
0,1536,San Francisco Open,Golf tournament\nThe San Francisco Open was a ...,1058
1,3372,Agnes Mukabaranga,Rwandan politician\nAgnes Mukabaranga\nAgnes M...,376
2,2840,The Indian Queen (opera),Semi-opera by Henry Purcell\nFor other uses of...,907
3,3869,Roads (novel),Fantasy novel by Seabury Quinn\nRoads Dust-jac...,295
4,2230,David Wootton (historian),British historian of science (born 1952)\n\n\n...,503


In [12]:
clean_wiki_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3988 entries, 0 to 3987
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           3988 non-null   int64 
 1   Title        3988 non-null   object
 2   Body         3988 non-null   object
 3   Body_length  3988 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 124.8+ KB


In [13]:
frequencies = (
    clean_wiki_df["Title"]
    .value_counts()
    .to_frame()
    .reset_index()
    .rename(columns={"frequency": "Title", "Title": "frequency"}))

frequencies.head(5)

Unnamed: 0,frequency,count
0,Daniel L. Simmons,2
1,2013–14 FC Basel season,2
2,2024 Kentucky Republican presidential primary,2
3,Novy Vostok,2
4,Rangsazi Iran,2


In [14]:
clean_wiki.reset_format()

## Split the dataset into train & test

In [15]:
wiki = clean_wiki.train_test_split(train_size=0.7, seed=42)

wiki

DatasetDict({
    train: Dataset({
        features: ['ID', 'Title', 'Body', 'Body_length'],
        num_rows: 2791
    })
    test: Dataset({
        features: ['ID', 'Title', 'Body', 'Body_length'],
        num_rows: 1197
    })
})

## Save the dataset with a various formats

In [16]:
for extension in ["jsonl", "csv", "parquet"]:
  for split, dataset in wiki.items():
    dataset.to_json(f"datasets/{extension}/wiki-{split}.{extension}")

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

## Test loading the dataset that we have prepared

In [18]:
data_files = {"train": "datasets/jsonl/wiki-train.jsonl",
              "test": "datasets/jsonl/wiki-test.jsonl"}

drug_dataset_reloaded = load_dataset("json", data_files=data_files)

drug_dataset_reloaded

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['ID', 'Title', 'Body', 'Body_length'],
        num_rows: 2791
    })
    test: Dataset({
        features: ['ID', 'Title', 'Body', 'Body_length'],
        num_rows: 1197
    })
})

In [19]:
drug_dataset_reloaded["train"][0]

{'ID': 1271,
 'Title': "1984 TAAC men's basketball tournament",
 'Body': 'Basketball tournament\n\n\n1984 TAAC men\'s basketball\xa0tournamentClassificationDivision ISeason1983–84Teams8SiteSpring Branch ColiseumHouston, TexasChampionsHouston Baptist (1st title)Winning coachGene Iba (1st title)MVPCraig Beard (Samford)TAAC men\'s basketball\xa0tournaments←\xa019831985\xa0→\n\n\n1983–84 Trans America Athletic Conference men\'s basketball standings\n\n\nvte\nConf\n\n\nOverall\n\n\nTeam\nW\n\xa0\nL\n\xa0\nPCT\n\n\nW\n\xa0\nL\n\xa0\nPCT\n\n\nHouston Baptist †\n11\n–\n3\n\xa0\n.786\n\n\n24\n–\n7\n\n\xa0\n.774\n\n\nSamford\n10\n–\n4\n\xa0\n.714\n\n\n22\n–\n8\n\n\xa0\n.733\n\n\nGeorgia Southern\n8\n–\n6\n\xa0\n.571\n\n\n16\n–\n12\n\n\xa0\n.571\n\n\nArkansas–Little Rock\n7\n–\n7\n\xa0\n.500\n\n\n14\n–\n15\n\n\xa0\n.483\n\n\nCentenary\n7\n–\n7\n\xa0\n.500\n\n\n12\n–\n16\n\n\xa0\n.429\n\n\nMercer\n6\n–\n8\n\xa0\n.429\n\n\n14\n–\n14\n\n\xa0\n.500\n\n\nHardin–Simmons\n5\n–\n9\n\xa0\n.357\n\n\n9\n–\n