In [1]:
import torch
from transformers import BertTokenizer, BertForQuestionAnswering

# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

# Example question and context
question = "Who is Shashi?"
context = "Shashi is software engineer."

# Encode the inputs 
inputs = tokenizer(question, context, return_tensors='pt')

# Get the predicted answer
output = model(**inputs)

# Get the most likely beginning and end of answer with the argmax of the score
answer_start = torch.argmax(output.start_logits)  
answer_end = torch.argmax(output.end_logits) + 1  

# Get the answer
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end]))

# Print the answer
print(answer)


  from .autonotebook import tqdm as notebook_tqdm


software engineer


In [None]:
train_data = [
    {
        "context": "Shashi is software engineer.",
        "qas": [
            {
                "id": "00001",
                "question": "Who is Shashi?",
                "answers": [
                    {
                        "text": "software engineer",
                        "answer_start": 12
                    }
                ]
            }
        ]
    }
]


In [1]:
from datasets import Dataset
from transformers import BertTokenizerFast
from transformers import TrainingArguments, Trainer
from transformers import BertForQuestionAnswering

model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

# Load the BERT tokenizer.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# You have a list of dictionaries, so let's convert it to a dictionary of lists
train_data_lists = {
    "context": ["Shashi is software engineer."],
    "question": ["Who is Shashi?"],
    "answers": [[{"text": "software engineer", "answer_start": 12}]]
}

valid_data_lists = {
    "context": ["Shashi lives in New York"],
    "question": ["Where does he lives"],
    "answers": [[{"text": "New York", "answer_start": 14}]]
}

# Convert train_data to a Dataset object
train_dataset = Dataset.from_dict(train_data_lists)
valid_dataset = Dataset.from_dict(valid_data_lists)

# Define a function to tokenize our examples
def tokenize_examples(example):
    # Encode our concatenated data
    encoded = tokenizer.encode_plus(
        example["question"], 
        example["context"],
        truncation=True, 
        padding='max_length', 
        max_length=512,
    )

    # This is the tricky part, 'answer_start' is a character position. We need to find which token this position belongs to
    # We can do that with the built in char_to_token method in the tokenizer
    start_position = tokenizer(example['context'], return_offsets_mapping=True, truncation=True, max_length=512).char_to_token(example['answers'][0]['answer_start'])
    end_position = start_position + len(example['answers'][0]['text'].split()) - 1

    if start_position is None or end_position is None:
        start_position = 0
        end_position = 0

    encoded.update({'start_positions': start_position, 'end_positions': end_position})

    return encoded


# Apply the function to our train_dataset
train_dataset = train_dataset.map(tokenize_examples)
valid_dataset = valid_dataset.map(tokenize_examples)


model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

# Define the trainer
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset             # evaluation dataset
)

# Train the model
trainer.train()


  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 3/3 [00:35<00:00, 11.81s/it]

{'train_runtime': 35.4116, 'train_samples_per_second': 0.085, 'train_steps_per_second': 0.085, 'train_loss': 12.04428482055664, 'epoch': 3.0}





TrainOutput(global_step=3, training_loss=12.04428482055664, metrics={'train_runtime': 35.4116, 'train_samples_per_second': 0.085, 'train_steps_per_second': 0.085, 'train_loss': 12.04428482055664, 'epoch': 3.0})

In [2]:
input_ids = tokenizer.encode("Who are you?")
output = model.generate(input_ids, max_length=50, num_return_sequences=5)
output

TypeError: The current model class (BertForQuestionAnswering) is not compatible with `.generate()`, as it doesn't have a language model head. Please use one of the following classes instead: {'BertLMHeadModel'}

In [None]:
import requests
from bs4 import BeautifulSoup

# Set up authentication credentials
username = "your_username"
password = "your_password"

# Authenticate to the Confluence page
auth_url = "https://your-confluence-url.com/authenticate"
auth_payload = {
    "username": username,
    "password": password
}
auth_response = requests.post(auth_url, data=auth_payload)

# Check if authentication was successful
if auth_response.status_code == 200:
    # Access the desired page URL
    page_url = "https://your-confluence-url.com/your-page"
    page_response = requests.get(page_url)

    # Parse the HTML content
    soup = BeautifulSoup(page_response.content, "html.parser")
    data = []

    # Find and extract the relevant data from the page
    for element in soup.find_all("p"):
        sentence = element.get_text().strip()
        data.append(sentence)

    # Print the scraped data
    for sentence in data:
        print(sentence)
else:
    print("Authentication failed.")


In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config

# Read the text file and extract training sentences
with open("sample.txt", "r", encoding="utf-8") as file:
    train_data = file.readlines()

# Tokenization
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
encoded_data = [tokenizer.encode(text) for text in train_data]

# Pad or truncate the sequences to the same length
padded_data = torch.nn.utils.rnn.pad_sequence([torch.tensor(ids) for ids in encoded_data], batch_first=True)

print(padded_data)

# Dataset Creation
class MyDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return torch.tensor(self.data[index])

train_dataset = MyDataset(padded_data)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)

# Model Configuration
config = GPT2Config.from_pretrained("gpt2")
model = GPT2LMHeadModel(config)
num_epochs = 10
# Training Loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        input_ids = batch[:, :-1].to(device)
        target_ids = batch[:, 1:].to(device)

        outputs = model(input_ids=input_ids, labels=target_ids)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1} - Average Loss: {average_loss}")

# Saving the trained model
model.save_pretrained("my-trained-model")
tokenizer.save_pretrained("my-trained-model")


tensor([[   32, 47385,   318,  ...,     0,     0,     0],
        [  198,     0,     0,  ...,     0,     0,     0],
        [ 7594, 42465, 17019,  ...,     0,     0,     0],
        ...,
        [  198,     0,     0,  ...,     0,     0,     0],
        [41762,   364,   318,  ...,     0,     0,     0],
        [  198,     0,     0,  ...,     0,     0,     0]])


  return torch.tensor(self.data[index])


In [26]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained("my-trained-model")
tokenizer = GPT2Tokenizer.from_pretrained("my-trained-model")

# Set the model in evaluation mode
model.eval()

# Set the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Generate text
prompt = "Third"
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

# Generate text using the model
output_ids = model.generate(input_ids, max_length=10, num_return_sequences=1)

# Decode the generated output
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Third sentence sentence sentence sentence sentence sentence sentence sentence sentence
