In [None]:
import numpy as np
import pandas as pd
import pickle

In [None]:
# Load the raw social-media dataset. header=None makes pandas label the columns
# 0, 1, ... instead of consuming the first row as column names.
social = pd.read_csv("/kaggle/input/nlp-ass1-datasets/social.csv", header=None)

In [None]:
# Positional hold-out split: the first 80,000 rows are used for training and
# whatever follows is kept aside for validation.
social_train = social[:80000]
social_val = social[80000:]

In [None]:
import re
def deEmojify(text):
    """Remove emoji and pictographic symbols from ``text``.

    Args:
        text: Value to clean. Anything that is not a ``str`` is handed back
            untouched, so the function can be mapped over mixed-type columns.

    Returns:
        The input string with every run of characters from the listed Unicode
        ranges deleted, and with the variation selector U+FE0F and the
        zero-width joiner U+200D removed as well; or the original object when
        it is not a string.
    """
    if not isinstance(text, str):
        return text

    emoji_runs = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U0001F900-\U0001F9FF"  # supplemental symbols
        u"\U0001FA00-\U0001FAFF"  # symbols & pictographs ext
        u"\u2600-\u26FF"          # misc symbols
        u"\u2700-\u27BF"          # dingbats
        "]+",
        flags=re.UNICODE,
    )
    cleaned = emoji_runs.sub("", text)

    # These two code points only glue or modify emoji sequences; once the
    # emoji themselves are gone they would be left behind as invisible noise.
    for leftover in ("\uFE0F", "\u200D"):
        cleaned = cleaned.replace(leftover, "")
    return cleaned

In [None]:
# Strip emoji from every object-dtype (text) column of the training split.
text_cols = social_train.select_dtypes(include=["object"]).columns

# social_train was obtained by slicing `social`; take an explicit copy so the
# column assignment below writes to a frame we own instead of triggering
# SettingWithCopyWarning on a slice of the original data.
social_train = social_train.copy()

# Column-wise Series.map applies deEmojify to each cell and works across pandas
# versions, unlike DataFrame.applymap which is deprecated since pandas 2.1.
social_train[text_cols] = social_train[text_cols].apply(
    lambda column: column.map(deEmojify)
)

# Preview only the first rows rather than rendering the whole frame.
social_train.head()

In [None]:
from nltk.tokenize import WordPunctTokenizer

# Single tokenizer instance created once at module level; build_word_freqs()
# calls wpt.tokenize() on every text of the corpus.
wpt = WordPunctTokenizer()

In [None]:
from collections import defaultdict

def build_word_freqs(corpus):
    """Count token occurrences over a whole corpus.

    Args:
        corpus: Iterable of texts. Each item is passed through ``str()`` before
            being tokenized with the module-level ``wpt`` tokenizer.

    Returns:
        A ``defaultdict(int)`` mapping each token to the number of times it
        appears across all texts.
    """
    counts = defaultdict(int)
    for document in corpus:
        pieces = wpt.tokenize(str(document))
        for piece in pieces:
            counts[piece] += 1
    return counts

In [None]:
def build_initial_token_freqs(word_freqs, max_vocab=64000):
    """Build the seed vocabulary (token -> frequency) for Unigram training.

    Every single character seen in the corpus is always kept. The remaining
    slots, up to ``max_vocab`` entries in total, are filled with the most
    frequent substrings of length >= 2.

    Args:
        word_freqs: Mapping of word -> number of occurrences in the corpus.
        max_vocab: Upper bound on the vocabulary size. If the corpus contains
            more distinct characters than this, only the characters are
            returned (the bound cannot be honoured without dropping characters).

    Returns:
        A plain ``dict`` of token -> frequency, characters first, followed by
        the selected substrings in descending frequency order.
    """
    char_freqs = defaultdict(int)
    subwords_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        for i in range(len(word)):
            char_freqs[word[i]] += freq
            # Enumerate every substring of length at least 2 starting at i.
            for j in range(i + 2, len(word) + 1):
                subwords_freqs[word[i:j]] += freq

    # Most frequent substrings first; sorted() is stable, so ties keep the
    # order in which the substrings were first encountered.
    sorted_subwords = sorted(subwords_freqs.items(), key=lambda x: x[1], reverse=True)

    # Clamp at zero: a negative slice bound would otherwise keep almost all
    # substrings when there are more distinct characters than max_vocab.
    n_subwords = max(0, max_vocab - len(char_freqs))

    token_freqs = dict(char_freqs)
    token_freqs.update(sorted_subwords[:n_subwords])
    return token_freqs

In [None]:
def encode_word(word, model):
    """Find the lowest-score segmentation of ``word`` by dynamic programming.

    Args:
        word: The string to segment.
        model: Mapping of token -> score, where a lower score is better and the
            score of a segmentation is the sum of its tokens' scores.

    Returns:
        A ``(tokens, score)`` tuple. ``tokens`` is the list of pieces, in
        order, whose concatenation is ``word``; ``score`` is their summed
        score. When no segmentation exists, ``(["<unk>"], 0)`` is returned.
    """
    length = len(word)
    # best_score[k] / prev_cut[k]: best total score of a segmentation of
    # word[:k] and the start index of its last token (None = unreachable).
    best_score = [0] + [None] * length
    prev_cut = [0] + [None] * length

    for begin in range(length):
        score_so_far = best_score[begin]
        if score_so_far is None:
            # No valid segmentation ends here, so nothing can start here.
            continue
        for stop in range(begin + 1, length + 1):
            piece = word[begin:stop]
            if piece not in model:
                continue
            candidate = model[piece] + score_so_far
            # Strict comparison: on ties the first segmentation found wins.
            if best_score[stop] is None or candidate < best_score[stop]:
                best_score[stop] = candidate
                prev_cut[stop] = begin

    if best_score[length] is None:
        # We did not find a tokenization of the word -> unknown
        return ["<unk>"], 0

    # Follow the back-pointers from the end of the word to position 0,
    # collecting pieces right-to-left, then put them in reading order.
    pieces = []
    stop = length
    begin = prev_cut[length]
    while True:
        pieces.append(word[begin:stop])
        if begin == 0:
            break
        stop, begin = begin, prev_cut[begin]
    pieces.reverse()
    return pieces, best_score[length]

In [None]:
def compute_loss(model, word_freqs):
    """Total corpus loss under ``model``.

    Args:
        model: Mapping of token -> score, as consumed by ``encode_word``.
        word_freqs: Mapping of word -> number of occurrences.

    Returns:
        The sum over all words of ``frequency * best segmentation score``.
        Returns ``0`` for an empty ``word_freqs``.
    """
    total = 0
    for word in word_freqs:
        segmentation_score = encode_word(word, model)[1]
        # Plain left-to-right accumulation, one term per word.
        total = total + word_freqs[word] * segmentation_score
    return total

In [None]:
import copy

def compute_scores(model, word_freqs):
    """Score each multi-character token by the loss increase its removal causes.

    The score of a token is ``loss(model without token) - loss(model)``, where
    the loss is the frequency-weighted sum of ``encode_word`` scores over
    ``word_freqs``. Removing a token can only change the loss of words whose
    current best segmentation actually uses it, so every word is encoded once
    with the full model and only the affected words are re-encoded per token,
    instead of re-encoding the entire vocabulary for every candidate.

    Args:
        model: Mapping of token -> score. It is not modified.
        word_freqs: Mapping of word -> number of occurrences.

    Returns:
        ``dict`` of token -> loss delta for every token longer than one
        character, in the iteration order of ``model``. Single-character
        tokens are never scored because they are always kept.
    """
    # One pass with the full model: remember each word's loss and index the
    # words by the tokens their best segmentation relies on.
    base_loss = {}
    words_using = {}
    for word in word_freqs:
        pieces, word_loss = encode_word(word, model)
        base_loss[word] = word_loss
        for piece in set(pieces):  # a token may occur twice in one word
            words_using.setdefault(piece, []).append(word)

    # Private working copy: tokens are removed and restored one at a time, so
    # the caller's dict (and its key order) is left untouched.
    working_model = dict(model)

    scores = {}
    total_tokens = len(model)
    for i, token in enumerate(model):
        if i % 100 == 0:
            print(i, "/", total_tokens)

        # We always keep tokens of length 1
        if len(token) == 1:
            continue

        affected_words = words_using.get(token)
        if not affected_words:
            # No best segmentation uses this token: dropping it costs nothing.
            scores[token] = 0
            continue

        removed_score = working_model.pop(token)
        try:
            delta = 0
            for word in affected_words:
                _, loss_without = encode_word(word, working_model)
                delta += word_freqs[word] * (loss_without - base_loss[word])
        finally:
            working_model[token] = removed_score
        scores[token] = delta
    return scores

In [None]:
from math import log

def unigram_model(df, init_vocab=40000, target_vocab=30000, checkpoint_path="model_social.pkl"):
    """Train a Unigram sub-word vocabulary by iteratively pruning tokens.

    Starting from a seed vocabulary of at most ``init_vocab`` tokens, each
    round scores every multi-character token by how much the corpus loss grows
    when it is removed, drops the least useful ones, and re-estimates the
    token scores, until at most ``target_vocab`` tokens remain.

    Args:
        df: Iterable of texts to train on (despite the name, any iterable of
            strings works; it is forwarded to ``build_word_freqs``).
        init_vocab: Maximum size of the seed vocabulary.
        target_vocab: Desired final vocabulary size. Single-character tokens
            are never removed, so the result can be larger than this when the
            corpus has more distinct characters than ``target_vocab``.
        checkpoint_path: File the model is pickled to after every pruning
            round. Pass ``None`` (or an empty string) to disable checkpoints.

    Returns:
        ``dict`` mapping token -> ``-log(freq / total_freq)``.
    """
    word_freqs = build_word_freqs(df)
    token_freqs = build_initial_token_freqs(word_freqs, max_vocab=init_vocab)

    def scores_from_freqs(freqs):
        # Negative log relative frequency: lower means more probable.
        total = sum(freqs.values())
        return {token: -log(freq / total) for token, freq in freqs.items()}

    model = scores_from_freqs(token_freqs)

    print("Start Unigram model")
    percent_to_remove = 0.2
    while len(model) > target_vocab:
        print(len(model))
        # Scoring dominates the run time of each round.
        scores = compute_scores(model, word_freqs)
        sorted_scores = sorted(scores.items(), key=lambda x: x[1])

        # Remove percent_to_remove of the tokens with the lowest scores, but:
        #  - always at least one, so small vocabularies still make progress;
        #  - never more than needed to land on target_vocab;
        #  - never more than the number of removable (multi-char) tokens.
        n_remove = min(
            max(1, int(len(model) * percent_to_remove)),
            len(model) - target_vocab,
            len(sorted_scores),
        )
        if n_remove <= 0:
            # Only single-character tokens are left; nothing more can go.
            break
        for token, _ in sorted_scores[:n_remove]:
            del token_freqs[token]

        model = scores_from_freqs(token_freqs)

        if checkpoint_path:
            with open(checkpoint_path, "wb") as f:
                pickle.dump(model, f)
            print("model saved vocab size: ", len(model))

    return model

In [None]:
# Column 0 of the training split is used as the text to train on; astype(str)
# coerces every entry to a string before it is handed to the tokenizer.
social_corpus = social_train[0].astype(str).tolist()
# Train with unigram_model()'s default initial and target vocabulary sizes.
model_social = unigram_model(social_corpus)
print("social model size:", len(model_social))

In [None]:
# Persist the final token -> score mapping returned by unigram_model() to
# model_social.pkl in the current working directory.
with open("model_social.pkl", "wb") as f:
    pickle.dump(model_social, f)

In [None]:
# Confirmation message for the pickle written by the preceding cell.
print("model model_social.pkl saved")