In [1]:
from transformers import LlamaForCausalLM, LlamaTokenizer
from transformers import MBartForConditionalGeneration, AutoModelForSeq2SeqLM
from transformers import AlbertTokenizer, AlbertForMaskedLM
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
from transformers import AutoModelForTokenClassification, DataCollatorForSeq2Seq
from torch.utils.data import DataLoader
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# give code to generate wordwise pos tags using finetuned model "likhithasapu/gcm-xlmr-pos" present on huggingface

# Load the pre-trained model
model_name = "likhithasapu/gcm-xlmr-pos"
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Create a pipeline
nlp = pipeline("ner", model=model, tokenizer=tokenizer,aggregation_strategy="simple")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [11]:
sentence = "वह dance practice कर रही थी, now she has to speed अब."
pun_word = "अब"

# Generate wordwise POS tags
result = nlp(sentence)

# Print the word with its POS tag
print(result)
for word in result:
    if pun_word == word['word']:
        print(f"{word['word']}: {word['entity_group']} : {word['score']}")

[{'entity_group': 'PRON', 'score': 0.69416314, 'word': 'वह', 'start': 0, 'end': 2}, {'entity_group': 'NOUN', 'score': 0.8833633, 'word': 'dance practice', 'start': 3, 'end': 17}, {'entity_group': 'VERB', 'score': 0.99882716, 'word': 'कर रही थी', 'start': 18, 'end': 27}, {'entity_group': 'CONJ', 'score': 0.3799433, 'word': ',', 'start': 27, 'end': 28}, {'entity_group': 'ADV', 'score': 0.999087, 'word': 'now', 'start': 29, 'end': 32}, {'entity_group': 'PRON', 'score': 0.99904853, 'word': 'she', 'start': 33, 'end': 36}, {'entity_group': 'VERB', 'score': 0.9996284, 'word': 'has', 'start': 37, 'end': 40}, {'entity_group': 'PART', 'score': 0.99955493, 'word': 'to', 'start': 41, 'end': 43}, {'entity_group': 'VERB', 'score': 0.9059099, 'word': 'speed', 'start': 44, 'end': 49}, {'entity_group': 'ADV', 'score': 0.9967789, 'word': 'अब', 'start': 50, 'end': 52}, {'entity_group': 'X', 'score': 0.99468935, 'word': '.', 'start': 52, 'end': 53}]
अब: ADV : 0.9967789053916931


# Read from the input file

In [None]:
import json
import random
from tqdm import tqdm
file_name = "/home2/likhithasapu/Codemixed-Pun-Generation/pun/automated_gen/pun_4.json"

new_data = []

with open(file_name, "r") as file:
    data = json.load(file)

for row in tqdm(data, total=len(data)):
    sentences = row["Candidates"]
    pun_word = row["Pun_word"]
    pun_word_pos = row["Pun_word_pos"]
    new_sentences = []
    weights = []
    if pun_word_pos == 'X':
        new_data.append(row)
        continue
    for sentence in sentences:
        result = nlp(sentence)
        # iterate in reverse order and store the relative postion in weights
        for index, word in enumerate(reversed(result)):
            if pun_word == word['word']:
                if word['entity_group'] == pun_word_pos:
                    new_sentences.append(sentence)
                    weights.append((len(result) - index)/len(result))
                    break
    if len(new_sentences) > 0:
        row["Candidates"] = new_sentences
        # Sample random sentence based on weights
        row["Sentence_chosen"] = random.choices(new_sentences, weights=weights)[0]
    new_data.append(row)
    
with open("/home2/likhithasapu/Codemixed-Pun-Generation/pun/automated_gen/pun_4_pos.json", "w") as file:
    json_data = json.dumps(new_data, indent=4, ensure_ascii=False)
    file.write(json_data)
    

  0%|          | 0/500 [00:00<?, ?it/s]

100%|██████████| 500/500 [00:40<00:00, 12.46it/s]


In [5]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Load the pre-trained model
model_name = "likhithasapu/gcm-xlmr-pos"
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, is_split_into_words=True)

normal_word = "बडी"

# Test sentence
normal_sentence = "मेरे big project के लिए, मैंने एक loyal बडी की मदद ली।"

# Tokenize the sentence
normal_inputs = tokenizer(normal_sentence, return_tensors="pt", truncation=True, padding=True)

# Get the model's output
normal_outputs = model(**normal_inputs)

# Get the predicted tags
normal_predictions = torch.argmax(normal_outputs.logits, dim=-1)

# Encode the word
encoded_word = tokenizer.encode(normal_word)[1]
            
# Print the word with its POS tag
pred = 0
for token, prediction in zip(normal_inputs['input_ids'][0], normal_predictions[0]):
    if token == encoded_word:
        pred = prediction
        print(f"{tokenizer.decode([token])}: {model.config.id2label[int(prediction)]}")    

ब: NOUN


In [23]:
pun_predictions

tensor([[[8.2411e-02, 8.6734e-02, 1.0558e-01, 6.2124e-02, 1.2014e-01,
          5.2085e-02, 1.0547e-01, 4.4481e-02, 6.1834e-02, 5.5296e-02,
          2.8173e-02, 3.0145e-02, 5.0588e-02, 6.3527e-02, 5.1405e-02],
         [3.9167e-04, 5.3215e-04, 2.5284e-03, 6.4683e-04, 5.0639e-03,
          9.8368e-01, 4.3724e-03, 2.6216e-04, 4.6929e-04, 2.9409e-04,
          4.0613e-04, 2.2633e-04, 2.3247e-04, 5.5494e-04, 3.3960e-04],
         [5.1534e-05, 1.2726e-04, 1.4751e-04, 1.4069e-04, 9.9854e-01,
          6.4347e-05, 2.5797e-04, 9.6940e-05, 8.7590e-05, 5.6560e-05,
          4.2024e-05, 9.1498e-05, 1.1887e-04, 8.5397e-05, 9.3892e-05],
         [1.4340e-05, 9.1021e-05, 3.2202e-05, 9.0232e-05, 2.0917e-05,
          4.4350e-05, 1.6770e-04, 7.7392e-05, 6.8204e-05, 9.9899e-01,
          7.3494e-05, 1.2731e-04, 1.2671e-04, 4.9806e-05, 2.5499e-05],
         [1.3029e-04, 1.8718e-04, 1.5549e-03, 1.2750e-04, 9.9676e-01,
          1.2446e-04, 4.3178e-04, 7.5850e-05, 7.3311e-05, 9.5747e-05,
          3.6089

In [11]:
import torch
from transformers import XLMRobertaForMaskedLM, XLMRobertaTokenizer

# Load the model and tokenizer
model_name = "likhithasapu/gcm-xlmr-v2"
model = XLMRobertaForMaskedLM.from_pretrained(model_name)
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)

Probability of 'target_word' as the masked token: 5.441529538074974e-07


In [12]:

# Set the input sentence with a masked token
sentence = "The goal of life is <mask>."

# Tokenize the sentence
tokens = tokenizer.encode(sentence, add_special_tokens=True, return_tensors="pt")

# Get the index of the masked token
masked_index = torch.where(tokens == tokenizer.mask_token_id)[1]

# Generate predictions for the masked token
outputs = model(tokens)
predictions = outputs.logits[0, masked_index, :]

# Apply softmax to get probabilities
probabilities = torch.softmax(predictions, dim=-1)

# Get the probability of the target word
target_word = "यही"
target_word_id = tokenizer.encode(target_word, add_special_tokens=False)[0]
target_probability = probabilities[0, target_word_id].item()

print(f"Probability of '{target_word}' as the masked token: {target_probability}")


Probability of 'यही' as the masked token: 0.10960280895233154
