# data engineering

## data

In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from collections import Counter
import torch
import spacy
import pandas as pd
from src.evaluation import js_divergence
# Which measurement functions are available?

# 1. unigram js_divergence
# 2. bpe
# 3. bigram js_divergence
# 4. gpt2 perplexity
# 5. depth of parse tree
# 6. js_divergence of POS tags
# 7. js_divergence of POS bigrams
# 8. number of named entities 




def create_gpt2_perplexity():
    """Build a function that scores a text's perplexity under pretrained GPT-2.

    The "gpt2" tokenizer and model are loaded once here and captured by the
    returned closure, so repeated calls do not reload them. The model is
    placed on ``cuda:0`` when CUDA is available, otherwise on the CPU.

    Returns:
        A callable ``gpt2_perplexity(text) -> float``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    gpt2 = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
    # Longest sequence the model has position embeddings for; anything longer
    # cannot be fed through the model in one pass.
    max_tokens = gpt2.config.n_positions

    def gpt2_perplexity(text):
        """Return ``exp(loss)`` where ``loss`` is the model's LM loss on ``text``.

        Texts that encode to more tokens than the model's context window are
        truncated to that window. Returns ``nan`` when the text encodes to
        fewer than two tokens, because there is then no next token to predict.
        """
        inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=max_tokens,
        )
        if inputs["input_ids"].shape[-1] < 2:
            return float("nan")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Passing the input ids as labels makes the model return its
        # language-modelling loss for this text.
        with torch.no_grad():
            outputs = gpt2(**inputs, labels=inputs["input_ids"])

        # Perplexity is the exponential of that loss.
        return torch.exp(outputs.loss).item()
    return gpt2_perplexity


def create_bpe_tokens_per_char():
    """Build a function measuring GPT-2 BPE tokens per character of a text.

    The "gpt2" tokenizer is loaded once here and captured by the returned
    closure.

    Returns:
        A callable ``bpe_tokens_per_char(text) -> float``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    def bpe_tokens_per_char(text):
        """Return ``len(tokenizer.encode(text)) / len(text)``.

        Returns ``nan`` for an empty string, where the ratio is undefined
        (this previously raised ``ZeroDivisionError``).
        """
        if not text:
            return float("nan")
        return len(tokenizer.encode(text)) / len(text)
    return bpe_tokens_per_char


def find_depth(node):
    """Return the number of nodes on the longest path from ``node`` to a leaf.

    ``node`` must expose an iterable ``children`` attribute whose items have
    the same shape. A node without children has depth 1.
    """
    # default=0 covers the leaf case: no children contributes no extra depth.
    return 1 + max((find_depth(kid) for kid in node.children), default=0)


def create_all_nlp_functions_de(data):
    """Build a function computing four spaCy-based features for a German text.

    Loads the ``de_core_news_md`` pipeline once and accumulates reference
    counts of POS tags and adjacent POS-tag pairs over every entry of
    ``data.original_text``. The returned closure compares a single text
    against those reference counts.

    Args:
        data: Object with an ``original_text`` attribute that iterates over
            the reference texts (the notebook passes a pandas DataFrame).

    Returns:
        A callable ``inner(text)`` returning the tuple
        ``(pos_js_divergence, pos_bigram_js_divergence, depth, named_entities)``.
    """
    nlp = spacy.load("de_core_news_md")
    counts_pos = Counter()
    counts_bigram_pos = Counter()
    # nlp.pipe processes the texts in batches, yielding one Doc per text in
    # order, which is faster than calling nlp() once per text.
    for doc in nlp.pipe(data.original_text):
        pos = [token.pos_ for token in doc]
        counts_pos.update(pos)
        counts_bigram_pos.update(zip(pos, pos[1:]))

    def inner(text):
        """Return POS divergences, parse depth and entity count for ``text``."""
        doc = nlp(text)
        pos = [token.pos_ for token in doc]
        pos_js_divergence = js_divergence(counts_pos, Counter(pos))
        pos_bigram_js_divergence = js_divergence(
            counts_bigram_pos, Counter(zip(pos, pos[1:]))
        )

        # A root is a token that is its own head. Only the first root is
        # measured, so for a text with several roots the depth describes the
        # first tree only. A doc without any root gets depth 0 instead of
        # raising IndexError.
        root = next((token for token in doc if token.head == token), None)
        depth = find_depth(root) if root is not None else 0
        named_entities = len(doc.ents)

        return pos_js_divergence, pos_bigram_js_divergence, depth, named_entities
    return inner


def create_char_bigram_divergences(data):
    """Build a function comparing a text's character statistics to a corpus.

    Counts of single characters and of adjacent character pairs are
    accumulated once over every entry of ``data.text``. The returned closure
    passes those reference counts, together with the same counts for one
    text, to ``js_divergence``.

    Returns:
        A callable ``inner(text)`` returning the tuple
        ``(unigram_divergence, bigram_divergence)``.
    """
    corpus_chars = Counter()
    corpus_pairs = Counter()
    for sample in data.text:
        corpus_chars.update(sample)
        corpus_pairs.update(zip(sample, sample[1:]))

    def inner(text):
        """Return the (unigram, bigram) character divergences of ``text``."""
        text_chars = Counter(text)
        text_pairs = Counter(zip(text, text[1:]))
        return (
            js_divergence(corpus_chars, text_chars),
            js_divergence(corpus_pairs, text_pairs),
        )
    return inner


In [3]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

# Load the corpus. Each run of this cell adds one group of feature columns
# and writes the result to its own CSV; the commented-out sections below are
# the alternative runs.
data = pd.read_csv("news.2013.de.trainlen.200.nlp.csv")
# data = pd.read_csv("news.test.de.csv")
available_functions = [
    # ("gpt2_perplexity", create_gpt2_perplexity(), 'text'), # done
    # ("bpe_tokens_per_char", create_bpe_tokens_per_char(), 'text'), # done
    # ("unigram_js_divergence", create_unigram_js_divergence(data), 'text'), #done
    # ("bigram_js_divergence", create_bigram_js_divergence(data), 'text'),#17648
    # ("pos_js_divergence", create_pos_js_divergence(data),'original_text'), #17650
    # ("pos_bigram_js_divergence", create_pos_bigram_js_divergence(data), 'original_text'), #17655
    # ("depth_of_parse_tree", create_depth_of_parse_tree(), 'original_text'),#17652
    # ("named_entities", create_named_entities(),'original_text')#17649
]

# Alternative run: compute all spaCy-based features at once.
# nlpfuncs= create_all_nlp_functions_de(data)
# data["pos_js_divergence"], data["pos_bigram_js_divergence"], data["depth_of_parse_tree"], data["named_entities"] = zip(*data.original_text.apply(lambda text: nlpfuncs(text)))


# Alternative run: compute character unigram/bigram divergences.
# char_divergences = create_char_bigram_divergences(data)
# print("fn created")
# data["unigram_js_divergence"], data["bigram_js_divergence"] = zip(*data.text.apply(lambda text: char_divergences(text)))

# Active run: compute GPT-2 perplexity for every row's `text`.
gpt2_perplexity = create_gpt2_perplexity()
print("fn created")


# The scorer already takes a single text, so it is passed directly instead of
# being wrapped in a lambda. progress_apply is provided by tqdm.pandas().
data["gpt2_perplexity"] = data.text.progress_apply(gpt2_perplexity)

# Alternative run: compute BPE tokens per character.
# bpe_tokens_per_char = create_bpe_tokens_per_char()
# print("fn created")
# data["bpe_tokens_per_char"] = data.text.apply(lambda text: bpe_tokens_per_char(text))

# Older table-driven variant based on `available_functions`.
# processing_now=0
# fn_name, function, src_col = available_functions[processing_now]
# print(f"Processing {fn_name}")

# function = bpe_tokens_per_char
# data[fn_name] = data[src_col].apply(lambda text: function(text))
# data["gpt2_tokens_per_char"] = data["text"].apply(lambda text: function(text))

# Write the augmented table; one output file per alternative run.
# data.to_csv("news.2013.de.trainlen.200.nlp.csv", index=False)
# data.to_csv("news.2013.de.trainlen.200.char.csv", index=False)
data.to_csv("news.2013.de.trainlen.200.gpt2.csv", index=False)
# data.to_csv("news.2013.de.trainlen.200.bpe.csv", index=False)

fn created


100%|██████████| 49/49 [00:12<00:00,  4.03it/s]
