In [16]:
import numpy as np
import pandas as pd
import pickle

In [17]:
# Load the two corpora. header=None tells pandas that the files have no header
# row, so the first line is kept as data and the columns are labelled 0, 1, ...
# (column 0 is the one read as text further down in this notebook).
court = pd.read_csv("/kaggle/input/nlp-ass1-datasets/court.csv", header=None)
social = pd.read_csv("/kaggle/input/nlp-ass1-datasets/social.csv", header=None)

In [18]:
# Positional split: the first 80,000 rows of each corpus are the training part,
# every remaining row is held out for validation.
court_train = court.iloc[:80000]
court_val = court.iloc[80000:]
social_train = social.iloc[:80000]
social_val = social.iloc[80000:]

In [19]:
from nltk.tokenize import WordPunctTokenizer

# Single shared pre-tokenizer instance; the functions defined in later cells
# call wpt.tokenize() on raw text before any subword segmentation happens.
wpt = WordPunctTokenizer()

In [20]:
from collections import defaultdict

def build_word_freqs(corpus):
    """Count how often each pre-token occurs across ``corpus``.

    Args:
        corpus: Iterable of texts. Every item is passed through ``str()``
            before being split with the module-level ``wpt`` tokenizer.

    Returns:
        A ``defaultdict(int)`` mapping each pre-token to its occurrence count.
    """
    counts = defaultdict(int)
    for document in corpus:
        pieces = wpt.tokenize(str(document))
        for piece in pieces:
            counts[piece] = counts[piece] + 1
    return counts

In [21]:
def build_initial_token_freqs(word_freqs, max_vocab=64000):
    """Build the initial token vocabulary with frequencies.

    Every single character seen in ``word_freqs`` is always kept. The rest of
    the budget is filled with the most frequent substrings of length >= 2.

    Args:
        word_freqs: Mapping of word -> number of occurrences.
        max_vocab: Target vocabulary size. If there are more distinct
            characters than ``max_vocab``, all characters are still kept and
            no multi-character substring is added.

    Returns:
        A dict mapping token -> frequency, characters first (in order of first
        appearance), followed by the selected substrings in descending
        frequency order.
    """
    char_freqs = defaultdict(int)
    subwords_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        for i in range(len(word)):
            char_freqs[word[i]] += freq
            # Every substring of length at least 2 starting at position i.
            for j in range(i + 2, len(word) + 1):
                subwords_freqs[word[i:j]] += freq

    # Stable sort: substrings with equal frequency keep their discovery order.
    sorted_subwords = sorted(subwords_freqs.items(), key=lambda x: x[1], reverse=True)

    # Clamp at zero: a negative slice bound would drop items from the END of
    # the list instead of selecting none, overshooting the vocabulary cap.
    subword_budget = max(0, max_vocab - len(char_freqs))

    token_freqs = dict(char_freqs)
    token_freqs.update(sorted_subwords[:subword_budget])
    return token_freqs

In [22]:
def encode_word(word, model):
    """Split ``word`` into the lowest-scoring sequence of tokens from ``model``.

    Dynamic programming over prefix lengths: for every prefix ``word[:k]`` the
    lowest total score found so far is remembered together with the start
    index of the last token used to reach it.

    Args:
        word: String to segment.
        model: Mapping of token -> score; lower totals are preferred.

    Returns:
        ``(tokens, score)``. ``score`` is the sum of the chosen token scores
        plus the constant 1 assigned to the empty prefix. If no segmentation
        exists, ``(["<unk>"], 0)`` is returned.
    """
    length = len(word)
    # cheapest[k]: best total score for word[:k] (None = not reachable yet).
    # origin[k]:   start index of the last token in that best segmentation.
    cheapest = [1] + [None] * length
    origin = [0] + [None] * length

    for left in range(length):
        prefix_cost = cheapest[left]
        if prefix_cost is None:
            # word[:left] cannot be segmented, so nothing can be extended from it.
            continue
        for right in range(left + 1, length + 1):
            piece = word[left:right]
            if piece not in model:
                continue
            candidate = model[piece] + prefix_cost
            # Strict comparison: on ties the segmentation found first is kept.
            if cheapest[right] is None or candidate < cheapest[right]:
                cheapest[right] = candidate
                origin[right] = left

    if cheapest[length] is None:
        # The full word is unreachable -> unknown.
        return ["<unk>"], 0

    # Follow the back-pointers from the end of the word to position 0,
    # collecting tokens right-to-left, then flip them into reading order.
    pieces = []
    right = length
    left = origin[length]
    while True:
        pieces.append(word[left:right])
        if left == 0:
            break
        right = left
        left = origin[left]
    pieces.reverse()
    return pieces, cheapest[length]

In [23]:
def compute_loss(model, word_freqs):
    """Return the frequency-weighted sum of segmentation scores.

    Args:
        model: Mapping of token -> score, passed on to ``encode_word``.
        word_freqs: Mapping of word -> number of occurrences.

    Returns:
        The sum over all words of ``frequency * score``, where ``score`` is the
        second element returned by ``encode_word`` for that word.
    """
    total = 0
    for term in word_freqs:
        segmentation_cost = encode_word(term, model)[1]
        total += word_freqs[term] * segmentation_cost
    return total

In [24]:
import copy

def compute_scores(model, word_freqs):
    """Measure how much the corpus loss changes when each token is removed.

    Args:
        model: Mapping of token -> score.
        word_freqs: Mapping of word -> number of occurrences.

    Returns:
        A dict mapping every token longer than one character to
        ``loss_without_that_token - loss_with_full_model``. Single-character
        tokens are never scored because they are always kept.

    Side effects:
        Prints a progress line every 100 tokens visited (skipped ones included).
    """
    baseline_loss = compute_loss(model, word_freqs)
    deltas = {}

    for position, candidate in enumerate(model):
        if position % 100 == 0:
            print(position, "/", len(model))

        # Tokens of length 1 are always kept, so they need no score.
        if len(candidate) == 1:
            continue

        reduced_model = {tok: val for tok, val in model.items() if tok != candidate}
        deltas[candidate] = compute_loss(reduced_model, word_freqs) - baseline_loss

    return deltas

In [25]:
# Load the two pickled models from the Kaggle input directories.
# NOTE(review): pickle.load can execute arbitrary code while deserializing, so
# these files must come from a trusted source.
# Presumably each model is a dict mapping token -> score (encode_word uses
# `token in model` and `model[token]`) — confirm against the training notebook.
with open("/kaggle/input/court-full-final/tensorflow2/default/1/model_court.pkl", "rb") as f:
    model_court = pickle.load(f)

# NOTE(review): the file name contains a space ("model_ social.pkl"); it has to
# match the name on disk exactly, so it is left untouched here.
with open("/kaggle/input/social-full-final/tensorflow2/default/1/model_ social.pkl", "rb") as f:
    model_social = pickle.load(f)

In [26]:
def tokenize(text, model):
    """Tokenize ``text`` into subword tokens using ``model``.

    The text is converted with ``str()``, split into pre-tokens by the
    module-level ``wpt`` tokenizer, and every pre-token is segmented with
    ``encode_word``.

    Args:
        text: Input text (any object; ``str()`` is applied first).
        model: Mapping of token -> score, passed on to ``encode_word``.

    Returns:
        A flat list with the subword tokens of all pre-tokens, in order.
        Pre-tokens that cannot be segmented contribute ``"<unk>"``.
    """
    # A single flat comprehension avoids sum(list_of_lists, []), which copies
    # the growing result on every addition (quadratic in the token count).
    return [
        piece
        for pre_token in wpt.tokenize(str(text))
        for piece in encode_word(pre_token, model)[0]
    ]

In [31]:
def eval_model(model, df, n=20000):
    """Compute fertility and coverage of ``model`` on the first ``n`` texts.

    Args:
        model: Mapping of token -> score, passed on to ``tokenize``.
        df: Integer-indexable sequence of texts supporting ``len()``
            (a list in this notebook).
        n: Maximum number of texts to evaluate. Capped at ``len(df)`` so that
            shorter inputs do not raise; ``None`` evaluates every text.

    Returns:
        A dict with two formatted strings:
        ``"fertility_subwords_per_word"`` — subword tokens per pre-token, where
        a text without pre-tokens counts as one word; and ``"coverage"`` — the
        percentage of produced subword tokens that are not ``"<unk>"``.
        Both degrade to 0.00 / 100.00 when nothing was evaluated.
    """
    limit = len(df) if n is None else min(n, len(df))

    total_words = 0
    total_subwords = 0
    total_unk = 0

    for i in range(limit):
        text = df[i]
        words = wpt.tokenize(str(text))
        subwords = tokenize(text, model)

        # An empty text still counts as one word so it cannot inflate fertility.
        total_words += max(1, len(words))
        total_subwords += len(subwords)
        total_unk += subwords.count("<unk>")

    # Ratios are computed once, after the loop, and guarded against empty input.
    fertility = (total_subwords / total_words) if total_words else 0.0
    unk = (total_unk / total_subwords) if total_subwords else 0.0

    return {
        "fertility_subwords_per_word": f"{fertility:.2f}",
        "coverage": f"{(1 - unk)*100:.2f}",
    }

In [32]:
# Column 0 of each validation frame holds the text; convert it to plain lists.
court_val_texts = court_val[0].astype(str).tolist()
social_val_texts = social_val[0].astype(str).tolist()

# Evaluate both models on both domains (in-domain and cross-domain).
court_on_court = eval_model(model_court, court_val_texts)
court_on_social = eval_model(model_court, social_val_texts)
social_on_court = eval_model(model_social, court_val_texts)
social_on_social = eval_model(model_social, social_val_texts)

print(f"model_court on court_val: {court_on_court},"
      f"\n\n\nmodel_court on social_val: {court_on_social},"
      f"\n\n\nmodel_social on court_val: {social_on_court},"
      f"\n\n\nmodel_social on social_val: {social_on_social}.")

model_court on court_val: {'fertility_subwords_per_word': '1.14', 'coverage': '100.00'},


model_court on social_val: {'fertility_subwords_per_word': '2.03', 'coverage': '96.97'},


model_social on court_val: {'fertility_subwords_per_word': '1.74', 'coverage': '100.00'},


model_social on social_val: {'fertility_subwords_per_word': '1.64', 'coverage': '98.13'}.


In [None]:
# One sample sentence mixing Cyrillic text, Latin text, punctuation and an
# emoji, tokenized with both models so the segmentations can be compared.
sample_text = "Це тест для судового та соціального датасуту, (Hello new model) 🆗"

print("Origin:", sample_text)
print()
print("model_court:", tokenize(sample_text, model_court))
print()
print("model_social:", tokenize(sample_text, model_social))

---