In [2]:
import torch
from transformers import pipeline



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Here we define a variable that will be passed to our classifier. This will check if a GPU is available, and use the CPU if one is not available.
device = "cuda" if torch.cuda.is_available() else "cpu" # if you want to use the GPU on a macbook change 'cuda' to 'mps' and make sure you have the 'accelerate' library installed.
# This line prints the device that will be used. Make sure it prints 'cuda' or 'mps' if you are trying to use a GPU.

print(f"Device: {device}")

Device: cpu


In [4]:
pipe = pipeline("zero-shot-classification", model="mlburnham/Political_DEBATE_base_v1.0", device = device, batch_size = 32) # To use the base model
#pipe = pipeline("zero-shot-classification", model='mlburnham/Political_DEBATE_large_v1.0', device = device, batch_size = 32) # To use the large model

Device set to use cpu


In [23]:
test_doc = [' now it is the next one in line for speaking to the parliament', 'everything is the fault of the immigrants', "I just love sharwarmamesteren He makes the best food"]
hypothesis_template = "Based on this text, the author's attitude towards others is best described as {}."
test_labels = ["blame", "praise", "neutral"]
result = pipe(test_doc, test_labels, hypothesis_template = hypothesis_template, multi_label = True)

print (result)
# Extract blame probability
label_scores = dict(zip(result["labels"], result["scores"]))
blame_prob = label_scores.get("blame", 0.0)

print(f"Blame probability: {blame_prob:.3f}")

[{'sequence': ' now it is the next one in line for speaking to the parliament', 'labels': ['neutral', 'blame', 'praise'], 'scores': [0.9999026656150818, 0.0004477902839425951, 6.562122962350259e-06]}, {'sequence': 'everything is the fault of the immigrants', 'labels': ['blame', 'neutral', 'praise'], 'scores': [0.9999503493309021, 5.331348347681342e-06, 2.302036136825336e-06]}, {'sequence': 'I just love sharwarmamesteren He makes the best food', 'labels': ['praise', 'blame', 'neutral'], 'scores': [0.9994699358940125, 0.0001050720748025924, 6.623339868383482e-05]}]


TypeError: list indices must be integers or slices, not str

In [19]:
# Load RDS
import pandas as pd


df = pd.read_csv("/work/MarkusLundsfrydJensen#1865/Bachelor_project/annotation_data.csv")

df.pop('Unnamed: 0')

df.head()

Unnamed: 0,date,agenda,speechnumber,speaker,party,party.facts.id,chair,terms,text,parliament,iso3country
0,1997-10-07,Dagsorden,1,Gert Petersen,,,True,191,Mødet er åbnet. I henhold til grundloven er Fo...,DK-Folketing,DNK
1,1997-10-07,Dagsorden,2,Formanden,,,True,182,"Jeg vil gerne takke Tinget for den tillid, man...",DK-Folketing,DNK
2,1997-10-07,Statsministerens redegørelse i henhold til gru...,3,Poul Nyrup Rasmussen,S,379.0,False,18662,For 25 år siden sagde et flertal i befolkninge...,DK-Folketing,DNK
3,1997-10-09,1) Indstilling fra Udvalget til Valgs Prøvelse.,2,Formanden,,,True,47,Fra Udvalget til Valgs Prøvelse har jeg modtag...,DK-Folketing,DNK
4,1997-10-09,2) Forhandling om redegørelse nr. R 1.,3,Torben Lund,S,379.0,False,2865,Vi står over for en meget afgørende folketings...,DK-Folketing,DNK


In [3]:
from transformers import MarianMTModel, MarianTokenizer

model_name = 'Helsinki-NLP/opus-mt-da-en'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def translate(text):
    translated = model.generate(**tokenizer(text, return_tensors="pt", padding=True))
    return tokenizer.decode(translated[0], skip_special_tokens=True)

print(translate("Hej, hvordan har du det?"))


  from .autonotebook import tqdm as notebook_tqdm


Hey, how are you?


In [14]:
import spacy 

nlp = spacy.load("da_core_news_sm")

def split_paragraph(paragraph: str):
    doc = nlp(paragraph)
    sentences = [sent.text.strip() for sent in doc.sents]
    return sentences

In [15]:
def translate_paragraph(para):

    translated_sentences = ""

    sentences = split_paragraph(para)

    for sent in sentences:
        #translate sentence
        english_sentence = translate(sent)

        translated_sentences += english_sentence + " "

    return translated_sentences
        


In [27]:
paragraph = "Hej med dig, hvordan går det? Du kan tro, jeg har oplevet meget de seneste par dage. Jeg har super travlt, men ellers går det fint. Jeg var nede og handle den anden dag. Det var virkelig hårdt at bære så mange ting op i lejligheden igen. Hej med dig, hvordan går det? Du kan tro, jeg har oplevet meget de seneste par dage. Jeg har super travlt, men ellers går det fint. Jeg var nede og handle den anden dag. Det var virkelig hårdt at bære så mange ting op i lejligheden igen. Hej med dig, hvordan går det? Du kan tro, jeg har oplevet meget de seneste par dage. Jeg har super travlt, men ellers går det fint. Jeg var nede og handle den anden dag. Det var virkelig hårdt at bære så mange ting op i lejligheden igen. Hej med dig, hvordan går det? Du kan tro, jeg har oplevet meget de seneste par dage. Jeg har super travlt, men ellers går det fint. Jeg var nede og handle den anden dag. Det var virkelig hårdt at bære så mange ting op i lejligheden igen4 Hej med dig, hvordan går det? Du kan tro, jeg har oplevet meget de seneste par dage. Jeg har super travlt, men ellers går det fint. Jeg var nede og handle den anden dag. Det var virkelig hårdt at bære så mange ting op i lejligheden igen. 7."

output = translate_paragraph(paragraph)

output

"Hey, you. How's it going? You bet I've been through a lot these past few days. I'm super busy, but otherwise I'll be fine. I was down shopping the other day. It was really hard to carry so many things up the apartment again. Hey, you. How you doing? You bet I've been through a lot these past few days. I'm super busy, but otherwise I'll be fine. I was down shopping the other day. It was really hard to carry so many things up the apartment again. Hey, you. How you doing? You bet I've been through a lot these past few days. I'm super busy, but otherwise I'll be fine. I was down shopping the other day. It was really hard to carry so many things up the apartment again. Hey, you. How you doing? You bet I've been through a lot these past few days. I'm super busy, but otherwise I'll be fine. I was down shopping the other day. It was really hard to carry so many things up in the apartment again4 Hey, you. How's it going? You bet I've been through a lot these past few days. I'm super busy, but 

In [23]:
#df['translated_text'] = df['text'].apply(translate_paragraph)

import swifter

df['translated_text'] = df['text'].swifter.apply(translate_paragraph)

KeyboardInterrupt: 

In [None]:
import torch
from transformers import MarianMTModel, MarianTokenizer
import spacy
import swifter

# Load models
model_name = 'Helsinki-NLP/opus-mt-da-en'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
model.eval()
if torch.cuda.is_available():
    model.to('cuda')

# SpaCy for sentence splitting (optional)
nlp = spacy.load("da_core_news_sm")

def split_paragraph(paragraph: str):
    doc = nlp(paragraph)
    return [sent.text.strip() for sent in doc.sents]

def translate_batch(sentences, batch_size=16):
    translations = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i+batch_size]
        tokens = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        if torch.cuda.is_available():
            tokens = {k: v.to('cuda') for k, v in tokens.items()}
        translated = model.generate(**tokens)
        decoded = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
        translations.extend(decoded)
    return translations

def translate_paragraph(para):
    sentences = split_paragraph(para)
    translated_sentences = translate_batch(sentences)
    return " ".join(translated_sentences)

# Apply to dataframe in parallel
df['translated_text'] = df['text'].swifter.apply(translate_paragraph)
