In [None]:
# !pip install datasets wikipedia

In [4]:
from datasets import load_dataset, concatenate_datasets
import wikipedia
from datasets import Dataset, DatasetDict

In [29]:


def get_wikipedia_data(topic):
    """
    Collect Wikipedia articles whose categories mention ``topic``.

    Searches Wikipedia for ``topic``, loads every search hit, and keeps the
    pages that have at least one category containing ``topic`` as a
    case-insensitive substring.

    Args:
        topic: Search term; it is also used as the category filter.

    Returns:
        A list of dicts, one per kept page, with the keys ``"title"``,
        ``"text"`` (the page content) and ``"categories"`` (the page's
        category list). Search hits that raise ``PageError`` or
        ``DisambiguationError`` while loading are skipped.
    """
    # Categories are lower-cased before matching, so the topic has to be
    # lower-cased as well; otherwise a capitalised topic such as "Inuit"
    # could never match any category.
    needle = topic.lower()
    articles = []

    for result in wikipedia.search(topic):
        try:
            page = wikipedia.page(result)
            categories = page.categories
            if any(needle in category.lower() for category in categories):
                articles.append({
                    "title": page.title,
                    "text": page.content,
                    "categories": categories
                })
        except (wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError):
            # Best effort: a hit that cannot be resolved to a single page is dropped.
            continue

    return articles


In [30]:

def create_huggingface_dataset(data):
    """
    Turn a list of row dictionaries into a Hugging Face ``Dataset``.

    Args:
        data: List of dicts, one per row, passed as-is to ``Dataset.from_list``.

    Returns:
        The object produced by ``Dataset.from_list(data)``.
    """
    dataset = Dataset.from_list(data)
    return dataset

In [31]:
# Fetch the Wikipedia articles whose categories mention "inuit", wrap them in
# a Hugging Face dataset, and print the resulting single-split DatasetDict.
inuit_data = get_wikipedia_data("inuit")
inuit_dataset = create_huggingface_dataset(inuit_data)

dataset_dict = DatasetDict(
    {
        "train": inuit_dataset,
    }
)
print(dataset_dict)


DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'categories'],
        num_rows: 7
    })
})


In [28]:
# Fetch the Wikipedia articles whose categories mention "american", wrap them
# in a Hugging Face dataset, and print the resulting single-split DatasetDict.
amer_data = get_wikipedia_data("american")
amer_dataset = create_huggingface_dataset(amer_data)

dataset_dict = DatasetDict(
    {
        "train": amer_dataset,
    }
)
print(dataset_dict)


DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'categories'],
        num_rows: 9
    })
})


In [40]:
# Number of articles collected for each topic
num_inuit_articles, num_amer_articles = inuit_dataset.num_rows, amer_dataset.num_rows

In [41]:
# Show both article counts as (inuit, american)
(num_inuit_articles, num_amer_articles)

(7, 9)

In [42]:
# Total number of characters of article text for each topic
inuit_length = sum(map(len, inuit_dataset["text"]))
amer_length = sum(map(len, amer_dataset["text"]))

In [43]:
# Show both total text lengths as (inuit, american)
(inuit_length, amer_length)

(155665, 613454)

In [None]:
# sort american articles by length and select the same number as inuit articles
def get_article_length(article):
    """
    Compute the character count of one article's text.

    Args:
        article: A row with a ``"text"`` entry.

    Returns:
        A one-entry dict ``{"article_length": <len of article["text"]>}``.
    """
    text_length = len(article["text"])
    return dict(article_length=text_length)

# Add an "article_length" column so the articles can be ordered by size.
amer_dataset = amer_dataset.map(get_article_length, batched=False, desc="Calculating article lengths")

# Longest articles first.
sorted_amer_dataset = amer_dataset.sort("article_length", reverse=True)

# Keep at most as many American articles as there are Inuit articles. min()
# avoids asking select() for indices past the end of the dataset when the
# American search returned fewer articles than the Inuit one.
num_main_amer_articles = min(num_inuit_articles, len(sorted_amer_dataset))

# "article_length" is only a sort key. Drop it from the selected subset so its
# columns match inuit_dataset ('title', 'text', 'categories') and a
# pre-truncation length is not carried into the datasets derived from it.
main_amer_dataset = (
    sorted_amer_dataset
    .select(range(num_main_amer_articles))
    .remove_columns("article_length")
)




Calculating article lengths: 100%|██████████████████████████████████| 9/9 [00:00<00:00, 1794.74 examples/s]


In [46]:
# Total characters of text in: the Inuit articles, all American articles,
# and the American articles kept after matching the Inuit article count.
inuit_length = sum(map(len, inuit_dataset["text"]))
amer_length = sum(map(len, amer_dataset["text"]))
main_amer_length = sum(map(len, main_amer_dataset["text"]))


In [None]:
# Report total text lengths for: the American articles cut down to the Inuit
# article count, the Inuit articles, and the full set of American articles.
print(
    f"Main american dataset total length: {main_amer_length} characters",
    f"Inuit dataset total length: {inuit_length} characters",
    f"American dataset total length: {amer_length} characters",
    sep="\n",
)

Main american dataset total length: 580995 characters
Inuit dataset total length: 155665 characters
American dataset total length: 613454 characters


In [None]:
# Mean number of characters per Inuit article
inuit_lengths = list(map(len, inuit_dataset["text"]))
avg_inuit_length = sum(inuit_lengths) / len(inuit_lengths)

In [50]:
# truncate american articles
def truncate_american(article):
    """
    Cap an article's text at the average Inuit article length.

    Reads the module-level ``avg_inuit_length``. When the text is longer than
    that value, ``article["text"]`` is overwritten in place with its first
    ``int(avg_inuit_length)`` characters; otherwise the article is untouched.

    Args:
        article: A row with a ``"text"`` entry.

    Returns:
        The same ``article`` object that was passed in.
    """
    body = article["text"]
    if not len(body) > avg_inuit_length:
        # Already within the limit: nothing to cut.
        return article

    article["text"] = body[:int(avg_inuit_length)]
    return article

In [51]:
# Apply the per-article truncation to every selected American article
truncated_american_dataset = main_amer_dataset.map(function=truncate_american)

Map: 100%|██████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 1647.59 examples/s]


In [52]:
# check truncated total text length of american texts
truncated_american_length = sum(map(len, truncated_american_dataset["text"]))

In [53]:
# check lengths match
print(
    f"Inuit dataset total length: {inuit_length} characters",
    f"Truncated American dataset total length: {truncated_american_length} characters",
    sep="\n",
)

Inuit dataset total length: 155665 characters
Truncated American dataset total length: 155659 characters


In [None]:
# combine datasets
combined_dataset = concatenate_datasets(
    [
        inuit_dataset,
        truncated_american_dataset,
    ]
)

# Upload the text datasets to the Hugging Face Hub, starting with the combined one.
# This may not be needed, but is a default for sae_lens (which we ended up not using).
combined_dataset.push_to_hub(repo_id="tcltcl/inuit-and-truncated-american-wikipedia")

In [None]:
# Upload the truncated American articles on their own
truncated_american_dataset.push_to_hub(repo_id="tcltcl/truncated-american-wikipedia")

In [None]:
# Upload the Inuit articles on their own
inuit_dataset.push_to_hub(repo_id="tcltcl/inuit-wikipedia")