In [1]:
# Suppress all warning output for the rest of the notebook
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    return None


# Swap out the emitter itself so every warnings.warn(...) call becomes a no-op ...
setattr(warnings, "warn", warn)
# ... and additionally install a blanket "ignore" filter.
warnings.filterwarnings(action='ignore')

In [2]:
# torchtext is deprecated, but you can use it for a basic exercise
import os
import site

# Export the first site-packages directory as the SP_DIR environment variable
os.environ.update({'SP_DIR': site.getsitepackages()[0]})

# Read the value back from the environment to confirm the assignment
print(f"SP_DIR is set to: {os.environ['SP_DIR']}")

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import AG_NEWS
from torchtext.data.functional import to_map_style_dataset



SP_DIR is set to: /fs/ddn/sdf/group/atlas/d/lapereir/miniconda3/envs/LLMs/lib/python3.9/site-packages


In [6]:
# test classifier
from tqdm import tqdm
import numpy as np
import pandas as pd
from itertools import accumulate
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

from torch.utils.data import DataLoader
import numpy as np
from IPython.display import Markdown as md
from tqdm import tqdm

from torch.utils.data.dataset import random_split
from sklearn.manifold import TSNE
import plotly.graph_objs as go
from sklearn.model_selection import train_test_split


[nltk_data] Downloading package punkt to
[nltk_data]     /sdf/home/l/lapereir/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /sdf/home/l/lapereir/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
# Word-based tokenizers from the Natural Language Toolkit (NLTK) library
import nltk
# Fetch the "punkt" and "punkt_tab" NLTK packages
# (the recorded cell output reports both as already up to date).
nltk.download("punkt")
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.util import ngrams


# Sample sentences containing several contractions
text = "I couldn't help the dog. Can't you do it? Don't be afraid if you are."

# Tokenize with NLTK's word_tokenize, then show the raw text followed by its tokens
tokens = word_tokenize(text)
print(text, tokens, sep="\n")


I couldn't help the dog. Can't you do it? Don't be afraid if you are.
['I', 'could', "n't", 'help', 'the', 'dog', '.', 'Ca', "n't", 'you', 'do', 'it', '?', 'Do', "n't", 'be', 'afraid', 'if', 'you', 'are', '.']


[nltk_data] Downloading package punkt to
[nltk_data]     /sdf/home/l/lapereir/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /sdf/home/l/lapereir/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [30]:
# Word-based tokenizer from spaCy (available in multiple languages)
import spacy
from transformers import BertTokenizer
from transformers import XLNetTokenizer

from spacy.lang.en.examples import sentences 

# Keep the spaCy sample under its own name. An earlier cell stores its sentence in the
# notebook-global `text`, and the following sub-word tokenizer cell prints and tokenizes
# `text` again; rebinding `text` here would change what that cell shows when the notebook
# is run top to bottom (its recorded output is the earlier sentence, not the spaCy one).
spacy_text = sentences[0]  # first example sentence shipped with spaCy's English examples

nlp = spacy.load("en_core_web_sm")  # pre-trained model package provided by spaCy for processing English text
doc = nlp(spacy_text)

# Make a list of the token strings and print it after the original text
token_list = [token.text for token in doc]
print(doc.text)
print(token_list)

print("Since en_core_web_sm is a pre-trained model, it provides additional info:")
print("-------------------------")
print("Token (text , pos_, dep_)")
print("-------------------------")
for token in doc:
    # One line per token: its text plus the pos_ and dep_ attributes set by the pipeline
    print(token.text, token.pos_, token.dep_)


Apple is looking at buying U.K. startup for $1 billion
['Apple', 'is', 'looking', 'at', 'buying', 'U.K.', 'startup', 'for', '$', '1', 'billion']
Since en_core_web_sm is a pre-trained model, it provides additional info:
-------------------------
Token (text , pos_, dep_)
-------------------------
Apple PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN nsubj
startup VERB ccomp
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj


In [4]:
# Sub-word based Tokenizers from HuggingFace

from transformers import BertTokenizer
from transformers import XLNetTokenizer

# Show the input sentence. `text` is not defined in this cell: it is whatever an earlier
# cell last bound to that name (in the recorded run, the sentence with contractions).
print(text)

# BERT uses a WordPiece tokenizer: it initializes its vocabulary to include every character
# present in the training data and progressively learns a specified number of merge rules.
# WordPiece doesn't select the most frequent symbol pair but rather the one that maximizes
# the likelihood of the training data when added to the vocabulary. In essence, WordPiece
# evaluates what it sacrifices by merging two symbols to ensure the merge is worthwhile.
#
# The WordPiece tokenizer is implemented in BertTokenizer.
# Note that BertTokenizer treats composite words as separate tokens.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
print( tokenizer.tokenize(text))

# XLNetTokenizer uses Unigram and SentencePiece.
# Unigram is a method for breaking words or text into smaller pieces. It starts with a
# large list of possibilities and gradually narrows it down based on how frequently those
# pieces appear in the text. This approach aids in efficient text tokenization.
# SentencePiece is a tool that takes text, divides it into smaller, more manageable parts,
# assigns IDs to these segments, and does so consistently: running SentencePiece on the
# same text repeatedly always yields the same subwords and IDs.
# The two work together by implementing Unigram's subword tokenization method within the
# SentencePiece framework. SentencePiece handles subword segmentation and ID assignment,
# while Unigram's principles guide the vocabulary reduction process to create a more
# efficient representation of the text data. This combination is particularly valuable for
# various NLP tasks in which subword tokenization can enhance the performance of language
# models.
#
# Note: this rebinds the notebook-global `tokenizer`, replacing the BERT tokenizer above.
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
print(tokenizer.tokenize(text))

I couldn't help the dog. Can't you do it? Don't be afraid if you are.
['i', 'couldn', "'", 't', 'help', 'the', 'dog', '.', 'can', "'", 't', 'you', 'do', 'it', '?', 'don', "'", 't', 'be', 'afraid', 'if', 'you', 'are', '.']
['▁I', '▁couldn', "'", 't', '▁help', '▁the', '▁dog', '.', '▁Can', "'", 't', '▁you', '▁do', '▁it', '?', '▁Don', "'", 't', '▁be', '▁afraid', '▁if', '▁you', '▁are', '.']


In [17]:
# Explore datasets available in HuggingFace

from huggingface_hub import list_datasets
# Only a short preview is printed, so ask the Hub for just that many entries instead of
# enumerating every dataset it hosts and then discarding all but the first few.
N_DATASETS_PREVIEW = 20
datasets = list_datasets(limit=N_DATASETS_PREVIEW)

# Extract the identifier of each returned dataset
datasets_info = [info.id for info in datasets]

# Print the preview of available datasets
print(datasets_info[:N_DATASETS_PREVIEW])

# see all at https://huggingface.co/datasets?language=language:en&sort=trending


['open-r1/OpenR1-Math-220k', 'saiyan-world/Goku-MovieGenBench', 'open-thoughts/OpenThoughts-114k', 'fka/awesome-chatgpt-prompts', 'Anthropic/EconomicIndex', 'AI-MO/NuminaMath-1.5', 'FreedomIntelligence/medical-o1-reasoning-SFT', 'simplescaling/s1K', 'open-r1/OpenR1-Math-Raw', 'GAIR/LIMO', 'HuggingFaceFW/fineweb', 'agentica-org/DeepScaleR-Preview-Dataset', 'zed-industries/zeta', 'ServiceNow-AI/R1-Distill-SFT', 'cognitivecomputations/dolphin-r1', 'CausalLM/Retrieval-SFT-Chat', 'simplescaling/s1K-1.1', 'bespokelabs/Bespoke-Stratos-17k', 'PleIAs/common_corpus', 'cais/hle']


In [27]:
# Use HuggingFace libraries

from datasets import load_dataset

# Load a Hugging Face dataset by name and bind it to the notebook-global `dataset`,
# which later cells index by split name.
dataset = load_dataset('ag_news') # test dataset
# Print the container summary; in the recorded run it lists a 'train' and a 'test' split,
# each with 'text' and 'label' features.
print(dataset)

###########################
## Load your own data
###########################
# Templates (left commented out) for loading local files instead of a named dataset.
# Directory containing all text files
#path_to_text_files = 'path/to/text/files/'
# Load the dataset
#dataset = load_dataset('text', data_files={'train': f'{path_to_text_files}*.txt'})

# Path to the CSV file
#path_to_data = 'path/to/your/dataset.csv'
# Load the dataset
#dataset = load_dataset('csv', data_files=path_to_data)

# Explore the dataset: print the first example of the 'train' split
print(dataset['train'][0])  # Assuming it automatically splits or you have defined the split.


README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})
{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", 'label': 2}


In [40]:
# Use HuggingFace libraries

# now tokenize the data
from transformers import AutoTokenizer

# Load a pre-trained WordPiece tokenizer
# (Unigram + SentencePiece alternative: AutoTokenizer.from_pretrained('xlnet-base-cased'))
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Take the first training example and unpack its two fields
data = dataset['train'][0]
text, label = data['text'], data['label']

# Show the raw text, its tokenization, and the label
print(text)
print("\n Tokenized text: ", tokenizer.tokenize(text))
print("\n Label: ", label)

Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.

 Tokenized text:  ['wall', 'st', '.', 'bears', 'claw', 'back', 'into', 'the', 'black', '(', 'reuters', ')', 'reuters', '-', 'short', '-', 'sellers', ',', 'wall', 'street', "'", 's', 'd', '##wind', '##ling', '\\', 'band', 'of', 'ultra', '-', 'cy', '##nic', '##s', ',', 'are', 'seeing', 'green', 'again', '.']

 Label:  2


In [41]:
# Load a pre-trained tokenizer
#tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') #wordpiece
#tokenizer = AutoTokenizer.from_pretrained('xlnet-base-cased') #unigram+senetencepiece

# Function to tokenize the text
def tokenize_function(examples):
    """Run the notebook-level ``tokenizer`` on the ``'text'`` entry of ``examples``.

    Written to be passed to ``dataset.map(..., batched=True)``.

    Args:
        examples: Mapping that has a ``'text'`` key.

    Returns:
        Whatever ``tokenizer`` returns for ``examples['text']``.
    """
    texts = examples['text']
    # Padding and truncation are left off here; to enable them, pass
    # padding="max_length", truncation=True to the tokenizer call.
    encoded = tokenizer(texts)
    return encoded

# Apply the tokenization across the dataset; batched=True makes map() call
# tokenize_function on groups of examples rather than one example at a time.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Look at the first training example. In the recorded run it carries 'input_ids',
# 'token_type_ids' and 'attention_mask' alongside the original 'text' and 'label'.
print(tokenized_datasets['train'][0])


Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", 'label': 2, 'input_ids': [101, 2813, 2358, 1012, 6468, 15020, 2067, 2046, 1996, 2304, 1006, 26665, 1007, 26665, 1011, 2460, 1011, 19041, 1010, 2813, 2395, 1005, 1055, 1040, 11101, 2989, 1032, 2316, 1997, 11087, 1011, 22330, 8713, 2015, 1010, 2024, 3773, 2665, 2153, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [45]:
# Create an iterator over the training split of `dataset`;
# call next(train_iter) to pull examples one at a time.
train_iter = iter(dataset['train'])

In [74]:
# if you need a dataloader
from torch.utils.data import DataLoader

def convert_examples_to_features(example):
    """Return the example's text unchanged and its label re-keyed as ``'labels'``.

    No tokenization is performed here; the text is passed through as is.

    Args:
        example: Mapping with ``'text'`` and ``'label'`` keys.

    Returns:
        A new dict with keys ``'text'`` and ``'labels'``.
    """
    features = {}
    features['text'] = example['text']
    features['labels'] = example['label']
    return features
# `train_dataset` was not defined anywhere in the notebook, so on a fresh kernel the
# DataLoader line raised NameError. Build it from the training split by applying
# convert_examples_to_features, which supplies the 'labels' field read in the loop below.
train_dataset = dataset['train'].map(convert_examples_to_features)

# Create a DataLoader (note: despite the `test_` prefix, it serves the training split)
test_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)


# Explore the dataloader in a loop: print the first batch only, then stop
for batch in test_dataloader:
    texts = batch['text']
    labels = batch['labels']
    print(f'Texts: {texts}\nLabels: {labels}\n')
    break

# or build an iterator to pull batches on demand with next(test_iter)
test_iter = iter(test_dataloader)


Texts: ['Spam Slayer: New Tools Fight Phishing Scams Swindlers combine spam with hoax sites to try to rip off your personal data.']
Labels: tensor([3])



In [76]:
# Pull the next batch from `test_iter` (the iterator over `test_dataloader`) and print its
# text and label fields.
# NOTE(review): this reads 'label', whereas the DataLoader loop reads 'labels' -- confirm
# that the batches really carry both keys.
example = next(test_iter)
print(example['text'], example['label'])

['Briefly: Verizon opens global phone to consumers roundup Plus: IBM, Honda team on voice-driven car navigation...Linux seller completes name change...SAP names new VP...Amazon opens floor to political pundits.'] tensor([3])
