In [None]:
import os, sys, random, re, collections, string
import numpy as np
import torch
import math
import csv
import sklearn.model_selection
import sklearn.metrics
import heapq
import matplotlib
import tqdm
import transformers

In [None]:
!pip install datasets transformers -U

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting transformers
  Downloading transformers-4.39.3-py3-none-any.whl (8.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m47.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━

In [None]:
from datasets import load_dataset
from transformers import PreTrainedTokenizerFast
# Load datasets
boolq_dataset = load_dataset('google/boolq')
# SOCKET is served through a dataset loading script. `datasets` warns that running
# such scripts must be opted into explicitly, so pass the flag here.
# NOTE(review): this executes code fetched from the Hub repository -- only keep it
# for sources you trust.
emo_dataset = load_dataset('Blablablab/SOCKET', 'emobank#valence', trust_remote_code=True)

# Initialize the tokenizer.
# AutoTokenizer resolves the tokenizer class recorded in the checkpoint. Loading
# through the PreTrainedTokenizerFast base class triggers a class-mismatch warning
# ("may result in unexpected tokenization").
tokenizer = transformers.AutoTokenizer.from_pretrained('distilbert/distilgpt2')

# Register the sequence boundary markers so each is kept as a single token
tokenizer.add_special_tokens({'additional_special_tokens': ['<s>', '</s>']})
# NOTE: any model sharing this tokenizer must be told about the enlarged vocabulary.

def prepare_emo_dataset(dataset):
    """Return ``dataset`` untouched.

    Pass-through hook: no column is renamed, filtered or copied, and the very
    same object that was passed in is handed back. It presumes the caller's
    data already exposes its text under the expected 'text' field.

    Args:
        dataset: any object; it is not inspected.

    Returns:
        The ``dataset`` argument itself.
    """
    return dataset

def tokenize_and_add_special_tokens(dataset, text_field_name, is_boolq=False):
    """Replace all columns of ``dataset`` with a single 'tokens' column.

    Every example is turned into one string of the form ``"<s> ... </s>"`` and
    split with the ``tokenize`` method of the module-level ``tokenizer``.

    Args:
        dataset: object exposing ``map`` and ``column_names`` (the notebook
            passes dataset splits returned by ``load_dataset``).
        text_field_name: name of the column holding the raw text. It is not
            read when ``is_boolq`` is true.
        is_boolq: when true, the text is assembled from the 'passage',
            'question' and 'answer' columns as
            ``passage + " " + question + "? " + ("yes" | "no")``.

    Returns:
        The result of ``dataset.map`` run in batched mode with every original
        column removed, leaving only 'tokens'.
    """
    def _batch_to_tokens(batch):
        # `batch` maps each column name to the list of values for this batch.
        if is_boolq:
            passages = batch['passage']
            wrapped = []
            for row in range(len(passages)):
                wrapped.append(
                    "<s> " + passages[row] + " " + batch['question'][row] + "? "
                    + ("yes" if batch['answer'][row] else "no") + " </s>"
                )
        else:
            wrapped = ["<s> " + raw + " </s>" for raw in batch[text_field_name]]

        # One token list per composed string.
        return {'tokens': list(map(tokenizer.tokenize, wrapped))}

    columns_to_drop = dataset.column_names
    return dataset.map(_batch_to_tokens, batched=True, remove_columns=columns_to_drop)

# Apply the tokenization and special token addition.
# The 'text' argument is ignored for BoolQ: with is_boolq=True the text is built
# from the passage/question/answer columns instead.
tokenized_boolq = tokenize_and_add_special_tokens(boolq_dataset['train'], 'text', is_boolq=True)
tokenized_emo = tokenize_and_add_special_tokens(emo_dataset['train'], 'text')

# Print tokens for the first and last examples in each dataset.
# Index the row first (`ds[i]['tokens']`): that reads a single example, whereas
# `ds['tokens'][i]` loads the entire column just to pick one element.
for dataset_label, tokenized_split in (("BoolQ", tokenized_boolq), ("EmoBank", tokenized_emo)):
    for row_label, row_index in (("First", 0), ("Last", -1)):
        print(f"{dataset_label} Dataset - {row_label} Row Tokens:")
        print(tokenized_split[row_index]['tokens'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/6.57k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.69M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9427 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3270 [00:00<?, ? examples/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/16.6k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/1.94k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/804k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/38.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/48.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.18k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/48.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

9002
9002


Generating test split: 0 examples [00:00, ? examples/s]

550
550


Generating validation split: 0 examples [00:00, ? examples/s]

510
510


Generating sockette split: 0 examples [00:00, ? examples/s]

550
550


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


Map:   0%|          | 0/9427 [00:00<?, ? examples/s]

Map:   0%|          | 0/9002 [00:00<?, ? examples/s]

BoolQ Dataset - First Row Tokens:
['<s>', 'ĠPersian', 'Ġ(/', 'ËĪ', 'p', 'É', 'ľ', 'Ë', 'Ĳ', 'r', 'Ê', 'Ĵ', 'É', 'Ļ', 'n', ',', 'Ġ-', 'Ê', 'ĥ', 'É', 'Ļ', 'n', '/', '),', 'Ġalso', 'Ġknown', 'Ġby', 'Ġits', 'Ġend', 'onym', 'ĠF', 'ars', 'i', 'Ġ(', 'Ù', 'ģ', 'Ø§', 'Ø±', 'Ø³', 'Û', 'Į', 'Ġf', 'Äģ', 'rs', 'i', 'Ġ(', 'f', 'É', 'Ĵ', 'Ë', 'Ĳ', 'É', '¾', 'ËĪ', 'si', 'Ë', 'Ĳ', ')', 'Ġ(', 'Ġlisten', ')),', 'Ġis', 'Ġone', 'Ġof', 'Ġthe', 'ĠWestern', 'ĠIranian', 'Ġlanguages', 'Ġwithin', 'Ġthe', 'ĠIndo', '-', 'Iran', 'ian', 'Ġbranch', 'Ġof', 'Ġthe', 'ĠIndo', '-', 'European', 'Ġlanguage', 'Ġfamily', '.', 'ĠIt', 'Ġis', 'Ġprimarily', 'Ġspoken', 'Ġin', 'ĠIran', ',', 'ĠAfghanistan', 'Ġ(', 'offic', 'ially', 'Ġknown', 'Ġas', 'ĠD', 'ari', 'Ġsince', 'Ġ1958', '),', 'Ġand', 'ĠTaj', 'ik', 'istan', 'Ġ(', 'offic', 'ially', 'Ġknown', 'Ġas', 'ĠTaj', 'iki', 'Ġsince', 'Ġthe', 'ĠSoviet', 'Ġera', '),', 'Ġand', 'Ġsome', 'Ġother', 'Ġregions', 'Ġwhich', 'Ġhistorically', 'Ġwere', 'ĠPersian', 'ate', 'Ġsocieties', 'Ġand', 'Ġcons

In [None]:
from collections import defaultdict

class TrigramLM:
    """Trigram language model with add-one smoothing, averaged with a unigram model.

    Tokens outside the vocabulary are mapped to the placeholder 'OOV' both when
    counting and when querying. ``train`` wraps every sequence in '<s>' ... '</s>'
    itself, so callers should pass bare token sequences.

    Attributes:
        unigram_counts: token -> count.
        bigram_counts: first token -> second token -> count.
        trigram_counts: first -> second -> third token -> count.
        vocab: set of known tokens, always containing '<s>', '</s>' and 'OOV'.
        total_unigrams: number of tokens counted so far (boundary markers included).
        vocab_size: ``len(vocab)``, used as the add-one smoothing constant.
    """

    def __init__(self, tokenizer_vocab_list):
        """Create an untrained model.

        Args:
            tokenizer_vocab_list: iterable of token strings forming the vocabulary
                (a list as before, or any other iterable).
        """
        self.unigram_counts = defaultdict(int)
        self.bigram_counts = defaultdict(lambda: defaultdict(int))
        self.trigram_counts = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
        # Set union accepts any iterable, not only a list.
        self.vocab = set(tokenizer_vocab_list) | {'<s>', '</s>', 'OOV'}
        self.total_unigrams = 0
        self.vocab_size = len(self.vocab)

    def _normalize(self, token):
        """Return ``token`` if it is in the vocabulary, otherwise 'OOV'."""
        return token if token in self.vocab else 'OOV'

    def _unigram_probability(self, token):
        """Add-one smoothed unigram probability of an already-normalized token."""
        # .get() keeps the lookup read-only (no entry is inserted for unseen tokens).
        count = self.unigram_counts.get(token, 0)
        return (count + 1) / (self.total_unigrams + self.vocab_size)

    def train(self, datasets):
        """Accumulate unigram, bigram and trigram counts.

        Args:
            datasets: iterable of datasets, each an iterable of token sequences.
                Every sequence is wrapped in '<s>' ... '</s>' before counting and
                out-of-vocabulary tokens are counted as 'OOV'.
        """
        for dataset in datasets:
            for sequence in dataset:
                tokens = ['<s>'] + [self._normalize(tok) for tok in sequence] + ['</s>']
                self.total_unigrams += len(tokens)  # Update total unigram count

                for i, token in enumerate(tokens):
                    self.unigram_counts[token] += 1
                    if i > 0:
                        self.bigram_counts[tokens[i - 1]][token] += 1
                    if i > 1:
                        self.trigram_counts[tokens[i - 2]][tokens[i - 1]][token] += 1

    def compute_probability(self, prev_prev_token, prev_token, token):
        """Return the smoothed probability of ``token`` after a two-token context.

        The result is the plain average of
        ``(count(a, b, c) + 1) / (count(a, b) + V)`` and
        ``(count(c) + 1) / (N + V)``, where ``V`` is the vocabulary size and
        ``N`` the total number of counted tokens.

        All three tokens are mapped to 'OOV' when unknown, matching what
        ``train`` stored. Lookups never modify the count tables.
        """
        first = self._normalize(prev_prev_token)
        second = self._normalize(prev_token)
        target = self._normalize(token)

        # Chained .get() calls avoid materializing empty defaultdict entries.
        context_count = self.bigram_counts.get(first, {}).get(second, 0)
        trigram_count = self.trigram_counts.get(first, {}).get(second, {}).get(target, 0)

        # Add-one smoothing for the trigram estimate
        trigram_prob = (trigram_count + 1) / (context_count + self.vocab_size)
        unigram_prob = self._unigram_probability(target)

        # Equal-weight interpolation between trigram and unigram probabilities
        return (trigram_prob + unigram_prob) / 2

    def nextProb(self, history_toks, next_toks):
        """Score candidate next tokens given a history.

        Args:
            history_toks: sequence of preceding tokens; only the last two are used.
                With fewer than two tokens the smoothed unigram probability is
                returned instead of the interpolated trigram estimate.
            next_toks: iterable of candidate tokens.

        Returns:
            dict mapping each candidate, under the token string that was passed
            in, to its probability. Unknown candidates are scored as 'OOV' but
            keep their own key, so several of them no longer collapse into one
            entry.
        """
        context = [self._normalize(tok) for tok in history_toks[-2:]]
        has_full_context = len(context) == 2

        probabilities = {}
        for next_tok in next_toks:
            if has_full_context:
                prob = self.compute_probability(context[0], context[1], next_tok)
            else:
                # Not enough history for a trigram context: unigram estimate only
                prob = self._unigram_probability(self._normalize(next_tok))
            probabilities[next_tok] = prob

        return probabilities


In [None]:
from transformers import AutoTokenizer

# Assumes the datasets (boolq_dataset, emo_dataset) were loaded in an earlier cell
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilgpt2')

# Add special tokens and adjust the tokenizer's vocabulary
tokenizer.add_special_tokens({'additional_special_tokens': ['<s>', '</s>', 'OOV']})

# Vocabulary known to the tokenizer. The marker tokens do not need to be appended
# here: TrigramLM always adds '<s>', '</s>' and 'OOV' to its vocabulary itself.
tokenizer_vocab_list = list(tokenizer.get_vocab().keys())

# Initialize TrigramLM with the complete vocabulary
lm = TrigramLM(tokenizer_vocab_list)

# Prepared datasets are lists of bare token lists.
# Do not wrap them in '<s>' / '</s>' here: TrigramLM.train adds the boundary markers
# to every sequence, so adding them here as well would double them and inflate the
# unigram totals.
prepared_boolq = [tokenizer.tokenize(item['question'] + ' ' + item['passage']) for item in boolq_dataset['train']]
prepared_emo = [tokenizer.tokenize(item['text']) for item in emo_dataset['train']]

# Train the model with the prepared datasets
lm.train([prepared_boolq, prepared_emo])

# Histories and candidate next tokens to score
histories_and_next_toks = [
    (['is', 'Ġmargin', 'Ġof', 'Ġerror', 'Ġthe', 'Ġsame', 'Ġas', 'Ġconfidence'], ['Ġinterval', 'Ġthe', 'Ġis']),
    (['Ġby', 'Ġland', 'Ġor', 'Ġby'], ['Ġsea', 'Ġwater', 'Ġcycle'])
]

# Calculate and print the probabilities for the given histories and next tokens
for history_toks, next_toks in histories_and_next_toks:
    probabilities = lm.nextProb(history_toks, next_toks)
    print(f"History: {history_toks}\nNext Tokens: {next_toks}\nProbabilities: {probabilities}\n")


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

History: ['is', 'Ġmargin', 'Ġof', 'Ġerror', 'Ġthe', 'Ġsame', 'Ġas', 'Ġconfidence']
Next Tokens: ['Ġinterval', 'Ġthe', 'Ġis']
Probabilities: {'Ġinterval': 4.600337581634426e-05, 'Ġthe': 0.022370018617436035, 'Ġis': 0.005478409521556805}

History: ['Ġby', 'Ġland', 'Ġor', 'Ġby']
Next Tokens: ['Ġsea', 'Ġwater', 'Ġcycle']
Probabilities: {'Ġsea': 6.323442054600087e-05, 'Ġwater': 0.00019219240190951128, 'Ġcycle': 4.721225895367268e-05}

