# data engineering

## data

In [2]:
import sys
import os

# Register the candidate source directories on the import path. All entries
# are relative, so they resolve against the kernel's working directory.
sys.path.extend(
    [
        "./enigma-transformed/src",
        "./src",
        "../src",
        "../../src",
    ]
)

In [15]:
import spacy
import spacy_udpipe
# from spacy_udpipe import  
def english_process():
    """Placeholder for the English processing step; it currently does nothing."""
    return None

def german_process():
    """Build a German text processor.

    The spaCy pipeline "de_core_news_md" is loaded once, when this factory is
    called; the returned callable then runs any text through that pipeline.

    Returns:
        A function ``nlpize(text)`` returning whatever the loaded pipeline
        returns for ``text``.
    """
    pipeline = spacy.load("de_core_news_md")

    def nlpize(text):
        # Thin wrapper: hand the text straight to the loaded pipeline.
        analysed = pipeline(text)
        return analysed

    return nlpize


def czech_process():
    """Placeholder for the Czech processing step; it currently does nothing.

    Note: NER has to be done externally.
    """
    return None
# g = german_process()

In [17]:
# spacy_udpipe.download("cs")

Downloaded pre-trained UDPipe model for 'cs' language


In [33]:

# nlp = spacy_udpipe.load("cs")
# nlp.analyze_pipes()

# nlp._components

# o = nlp("Ahoj světe, Karlův most, Brno, Paříž!")

[]

In [31]:
# for token in o:
#     print(token.text, token.lemma_, token.pos_, token.dep_, token.head.text)

Ahoj ahoj PART nmod světe
světe svět NOUN ROOT světe
, , PUNCT punct most
Karlův Karlův ADJ amod most
most most NOUN appos světe
, , PUNCT punct Brno
Brno Brno PROPN conj most
, , PUNCT punct Paříž
Paříž Paříž PROPN conj most
! ! PUNCT punct světe


In [10]:
from scipy.spatial.distance import jensenshannon
from src.evaluation import js_divergence

In [8]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import spacy
import pandas as pd
# Which measurement functions are available?

# 1. unigram js_divergence
# 2. bpe tokens per character
# 3. bigram js_divergence
# 4. gpt2 perplexity
# 5. depth of parse tree
# 6. js_divergence of POS tags
# 7. js_divergence of POS bigrams
# 8. number of named entities




def create_gpt2_perplexity():
    """Build a function that scores a text by its GPT-2 perplexity.

    The "gpt2" tokenizer and language model are loaded once, when this factory
    is called. The model is moved to the first CUDA device when one is
    available and stays on the CPU otherwise.

    Returns:
        A callable ``gpt2_perplexity(text) -> float``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    gpt2 = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

    def gpt2_perplexity(text):
        """Return ``exp(loss)`` of ``text`` under GPT-2.

        A text that tokenizes to fewer than two tokens offers no next-token
        prediction to score, so ``nan`` is returned for it instead of sending
        a degenerate (possibly empty) input through the model.
        """
        inputs = tokenizer(text, return_tensors="pt")
        if inputs["input_ids"].shape[-1] < 2:
            return float("nan")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Passing the input ids as labels makes the model return its
        # language-modelling loss for this text (a mean cross-entropy, i.e. a
        # *negative* log-likelihood per predicted token).
        with torch.no_grad():
            outputs = gpt2(**inputs, labels=inputs["input_ids"])

        # Perplexity is the exponential of that mean loss.
        return torch.exp(outputs.loss).item()

    return gpt2_perplexity


def create_bpe_tokens_per_char():
    """Build a function measuring how many GPT-2 BPE tokens a text needs per character.

    The "gpt2" tokenizer is loaded once, when this factory is called.

    Returns:
        A callable ``bpe_tokens_per_char(text) -> float``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    def bpe_tokens_per_char(text):
        """Return ``len(tokenizer.encode(text)) / len(text)``.

        The ratio is undefined for an empty text, so ``nan`` is returned in
        that case instead of raising ``ZeroDivisionError``.
        """
        n_chars = len(text)
        if n_chars == 0:
            return float("nan")
        n_tokens = len(tokenizer.encode(text))
        return n_tokens / n_chars

    return bpe_tokens_per_char



from collections import Counter
def create_unigram_js_divergence(data):
    """Build a unigram divergence metric against a reference corpus.

    The reference distribution is a ``Counter`` over the elements obtained by
    iterating every entry of ``data.text`` (single characters when the entries
    are strings). It is computed once, when this factory is called.

    Args:
        data: object exposing an iterable ``text`` attribute.

    Returns:
        A callable ``unigram_js_divergence(text)`` that passes the reference
        counts and the counts of ``text`` to ``js_divergence``.
    """
    reference = Counter(symbol for entry in data.text for symbol in entry)

    def unigram_js_divergence(text):
        observed = Counter(text)
        return js_divergence(reference, observed)

    return unigram_js_divergence


def create_bigram_js_divergence(data):
    """Build a bigram divergence metric against a reference corpus.

    The reference distribution is a ``Counter`` over the adjacent pairs found
    in every entry of ``data.text`` (character pairs when the entries are
    strings). It is computed once, when this factory is called.

    Args:
        data: object exposing an iterable ``text`` attribute.

    Returns:
        A callable ``bigram_js_divergence(text)`` that passes the reference
        counts and the pair counts of ``text`` to ``js_divergence``.
    """

    def adjacent_pairs(sequence):
        # Pair every element with its successor: (s0, s1), (s1, s2), ...
        return zip(sequence, sequence[1:])

    reference = Counter(
        pair for entry in data.text for pair in adjacent_pairs(entry)
    )

    def bigram_js_divergence(text):
        observed = Counter(adjacent_pairs(text))
        return js_divergence(reference, observed)

    return bigram_js_divergence


def create_pos_js_divergence(data):
    """Build a part-of-speech divergence metric against a reference corpus.

    Loads the spaCy pipeline "en_core_web_sm" and counts the ``pos_`` tag of
    every token across ``data.original_text`` to form the reference
    distribution. Both happen once, when this factory is called.

    Args:
        data: object exposing an iterable ``original_text`` attribute of texts.

    Returns:
        A callable ``pos_js_divergence(text)`` that passes the reference tag
        counts and the tag counts of ``text`` to ``js_divergence``.
    """
    nlp = spacy.load("en_core_web_sm")

    counts = Counter()
    # Stream the corpus through nlp.pipe so spaCy can batch the texts instead
    # of running the whole pipeline separately for each one.
    for doc in nlp.pipe(data.original_text):
        counts.update(token.pos_ for token in doc)

    def pos_js_divergence(text):
        observed = Counter(token.pos_ for token in nlp(text))
        return js_divergence(counts, observed)

    return pos_js_divergence


def create_pos_bigram_js_divergence(data):
    """Build a part-of-speech bigram divergence metric against a reference corpus.

    Loads the spaCy pipeline "en_core_web_sm" and counts adjacent pairs of
    ``pos_`` tags across ``data.original_text`` to form the reference
    distribution. Both happen once, when this factory is called.

    Args:
        data: object exposing an iterable ``original_text`` attribute of texts.

    Returns:
        A callable ``pos_bigram_js_divergence(text)`` that passes the reference
        pair counts and the tag-pair counts of ``text`` to ``js_divergence``.
    """
    nlp = spacy.load("en_core_web_sm")

    def tag_pairs(doc):
        # Adjacent POS tags within one document: (t0, t1), (t1, t2), ...
        tags = [token.pos_ for token in doc]
        return zip(tags, tags[1:])

    counts = Counter()
    # Stream the corpus through nlp.pipe so spaCy can batch the texts instead
    # of running the whole pipeline separately for each one.
    for doc in nlp.pipe(data.original_text):
        counts.update(tag_pairs(doc))

    def pos_bigram_js_divergence(text):
        observed = Counter(tag_pairs(nlp(text)))
        return js_divergence(counts, observed)

    return pos_bigram_js_divergence

def create_depth_of_parse_tree():
    """Build a function measuring the depth of a text's dependency parse.

    Loads the spaCy pipeline "en_core_web_sm" once, when this factory is called.

    TODO (open question from the original author): how to validate that this
    computes valid things — e.g. take a subset and check whether the results
    differ between different spaCy models.

    Returns:
        A callable ``depth_of_parse_tree(text) -> int``.
    """
    nlp = spacy.load("en_core_web_sm")

    def subtree_depth(root):
        """Count the tokens on the longest path from ``root`` down to a leaf."""
        # Explicit stack instead of recursion, so a very deep parse cannot
        # exhaust Python's recursion limit.
        deepest = 0
        pending = [(root, 1)]
        while pending:
            node, level = pending.pop()
            if level > deepest:
                deepest = level
            pending.extend((child, level + 1) for child in node.children)
        return deepest

    def depth_of_parse_tree(text):
        """Return the depth of the deepest parse tree found in ``text``.

        A token that is its own head is a root. A parse can contain several
        roots (presumably one per sentence); the deepest tree among them is
        reported rather than only the first one. A text that yields no tokens
        has depth 0.
        """
        doc = nlp(text)
        # Compare with ==, not `is`: the root test relies on token equality.
        roots = [token for token in doc if token.head == token]
        return max((subtree_depth(root) for root in roots), default=0)

    return depth_of_parse_tree


def create_named_entities():
    """Build a function counting the named entities spaCy finds in a text.

    Loads the spaCy pipeline "en_core_web_sm" once, when this factory is called.

    Returns:
        A callable ``named_entities(text) -> int`` giving ``len(doc.ents)``
        for the processed text.
    """
    pipeline = spacy.load("en_core_web_sm")

    def named_entities(text):
        entities = pipeline(text).ents
        return len(entities)

    return named_entities

In [None]:
import pandas as pd

# Load the corpus that is going to be annotated with one extra metric column.
data = pd.read_csv("news.2013.en.trainlen.200")
# data = pd.read_csv("news.test.trainlen200")

# Registry of metrics. Each entry is
#     (output column name, metric function, input column).
# The factories run while this list is being built, so only the entries that
# are not commented out are constructed. The trailing comments are the
# author's bookkeeping for each metric.
available_functions = [
    # ("gpt2_perplexity", create_gpt2_perplexity(), 'text'), # done
    # ("gpt2_tokens_per_char", create_bpe_tokens_per_char(), 'text'), # done
    # ("unigram_js_divergence", create_unigram_js_divergence(data), 'text'), #done
    # ("bigram_js_divergence", create_bigram_js_divergence(data), 'text'),#17648
    # ("pos_js_divergence", create_pos_js_divergence(data),'original_text'), #17650
    (
        "pos_bigram_js_divergence",
        create_pos_bigram_js_divergence(data),
        'original_text',
    ),  #17655
    # ("depth_of_parse_tree", create_depth_of_parse_tree(), 'original_text'),#17652
    # ("named_entities", create_named_entities(),'original_text')#17649
]

# Index (into the list above) of the metric computed by this run.
processing_now = 0
fn_name, function, src_col = available_functions[processing_now]
print(f"Processing {fn_name}")

# function = bpe_tokens_per_char
# Apply the metric to every row of the source column and store the result in
# a new column named after the metric.
data[fn_name] = data[src_col].apply(function)
# data["gpt2_tokens_per_char"] = data["text"].apply(lambda text: function(text))

# Write the annotated frame next to the input, suffixed with the metric name.
data.to_csv(f"news.2013.en.trainlen.200.{fn_name}", index=False)