Woorden kunnen niet aan het einde van een zin want dan denk bert dat hij de zin moet afsluiten met een punt of iets anders

In [1]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

def get_substitute_candidates(sentence, complex_word, top_k=5):
    # Load BERT tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    
    # Tokenize the sentence
    tokenized_sentence = tokenizer(sentence, return_tensors='pt', padding=True)
    
    final_candidates= substitute_candidates_generating(tokenized_sentence,tokenizer,model,top_k)
    
    return final_candidates


def substitute_candidates_generating(tokenized_sentence,tokenizer,model,top_k):
    complex_word_indices = [i for i, token in enumerate(tokenizer.convert_ids_to_tokens(tokenized_sentence['input_ids'][0])) if token == complex_word]
    
    substitute_candidates = []
    
    for index in complex_word_indices:
        # Create a copy of the original tokenized sentence to mask different words
        masked_input_ids = tokenized_sentence['input_ids'].detach().clone()
        # Mask the complex word
        masked_input_ids[0][index] = tokenizer.mask_token_id
        
        # Predict the masked word with BERT
        with torch.no_grad():
            outputs = model(**{'input_ids': masked_input_ids})
        
        # Get predictions for the masked word
        predictions = outputs.logits[0, index].topk(top_k)
        
        # Decode the predicted token IDs to words
        for idx in predictions.indices:
            predicted_word = tokenizer.decode([idx])
            if predicted_word != complex_word and predicted_word.lower() != complex_word.lower():
                substitute_candidates.append(predicted_word)
    
    # Filter out duplicates and morphologically similar words
    final_candidates = list(set(substitute_candidates))
    return final_candidates
    
# Example usage
complex_word = 'transformers'
sentence= 'In subsequent chapters we’ll introduce many other aspects of neural models, such as transformers and the recurrent neural networks'
substitute_candidates = get_substitute_candidates(sentence, complex_word)
print("Substitute Candidates:", substitute_candidates)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Substitute Candidates: ['connectivity', 'complexity', 'networks', 'learning', 'memory']


### BERT prediction

In [44]:
import torch
from transformers import BertTokenizer, BertForMaskedLM

# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"  # You can choose a different BERT variant
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)

def bert_prediction_scores(sentence, target_word, candidate_words):
    # Tokenize the input sentence
    tokenized_text = tokenizer.tokenize(sentence)
    
    # Find the position of the target word
    target_word_position = tokenized_text.index(target_word)
    
    # Create a dictionary to store candidate scores
    candidate_scores = {}
    
    for candidate_word in candidate_words:
        # Replace the target word with the candidate word
        tokenized_text[target_word_position] = candidate_word
        
        # Convert tokens to IDs
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        
        # Create input tensor
        tokens_tensor = torch.tensor([indexed_tokens])
        
        # Predict and get the probabilities for the masked token
        with torch.no_grad():
            predictions = model(tokens_tensor)
            predicted_probabilities = predictions[0][0, target_word_position]  # Probability distribution for the target position
        
        # Get the probability assigned to the candidate word
        candidate_word_id = tokenizer.convert_tokens_to_ids(candidate_word)
        candidate_probability = predicted_probabilities[candidate_word_id].item()
        
        # Store the candidate word and its score in the dictionary
        candidate_scores[candidate_word] = candidate_probability
    
    return candidate_scores

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Example usage
sentence= 'In subsequent chapters we’ll introduce many other aspects of neural models, such as transformers and the recurrent neural networks'
target_word = "transformers"
candidate_words = ['connectivity', 'complexity', 'networks', 'learning', 'memory']
scores = bert_prediction_scores(sentence, target_word, candidate_words)
print("Prediction Scores:")
for candidate, score in scores.items():
    print(f"{candidate}: {score:.4f}")

### Language model feature

In [38]:
import torch
from transformers import BertTokenizer, BertForMaskedLM
from torch.nn.functional import cross_entropy

def compute_cross_entropy_loss_for_substitution(sentence, target_word, substitution, tokenizer, model, window_size=5):
    
    # Tokenize input sentence and find the target word's index
    tokenized_sentence = tokenizer.tokenize(sentence)
    target_index = tokenized_sentence.index(target_word)
    
    # Ensure the window size does not exceed sentence bounds
    start_index = max(0, target_index - window_size)
    end_index = min(len(tokenized_sentence), target_index + window_size + 1)
    
    # Replace the target word with the substitution in the tokenized sentence
    tokenized_sentence[target_index] = substitution
    masked_sentence = tokenized_sentence[start_index:end_index]
    
    # Initialize loss accumulator and count of masked tokens
    total_loss = 0
    masked_tokens_count = 0

    # Iterate over each word in the window, mask it, predict with BERT, and calculate loss
    for i in range(len(masked_sentence)):
        # Temporarily mask the current word
        original_token = masked_sentence[i]
        masked_sentence[i] = '[MASK]'
        
        # Convert to model input format
        inputs = tokenizer.encode(' '.join(masked_sentence), return_tensors="pt")
        labels = inputs.clone()
        
        # Predict and calculate loss for the masked token
        with torch.no_grad():
            outputs = model(inputs, labels=labels)
            prediction_scores = outputs.logits
        
        # Calculate cross-entropy loss
        loss = cross_entropy(prediction_scores.squeeze(0)[i].unsqueeze(0), labels.squeeze(0)[i].unsqueeze(0))
        total_loss += loss.item()
        
        # Restore the original token in the sentence
        masked_sentence[i] = original_token
        masked_tokens_count += 1
    
    # Compute average loss
    average_loss = total_loss / masked_tokens_count if masked_tokens_count > 0 else 0
    return average_loss

def compute_scores_for_substitutes(sentence, target_word, substitutes, tokenizer, model, window_size=5):

    scores = []

    for substitution in substitutes:
        loss = compute_cross_entropy_loss_for_substitution(sentence, target_word, substitution, tokenizer, model, window_size)
        scores.append((substitution, loss))

    return scores

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
sentence= 'In subsequent chapters we’ll introduce many other aspects of neural models, such as transformers and the recurrent neural networks'
target_word = "transformers"  # Assuming 'brown' is the complex word to be simplified
substitutes = ['connectivity', 'complexity', 'networks', 'learning', 'memory'] 

scores = compute_scores_for_substitutes(sentence, target_word, substitutes, tokenizer, model)

for substitute, score in scores:
    print(f"{substitute}:{score}")

### Semantic similarity

In [47]:
import fasttext
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def calculate_semantic_similarity(complex_word, candidates, fasttext_model_path):
    # Load the FastText model
    fasttext_model = fasttext.load_model(fasttext_model_path)

    # Calculate the vector representation of the complex word
    complex_word_vector = fasttext_model.get_word_vector(complex_word).reshape(1, -1)

    # Initialize an array to store the scores
    scores = []

    # Calculate similarity for each candidate
    for candidate in candidates:
        candidate_vector = fasttext_model.get_word_vector(candidate).reshape(1, -1)
        similarity = cosine_similarity(complex_word_vector, candidate_vector)[0][0]
        scores.append(similarity)

    return scores


ModuleNotFoundError: No module named 'fasttext'

In [None]:
complex_word = "transformers"
candidates = ['connectivity', 'complexity', 'networks', 'learning', 'memory']
fasttext_model_path = "path/to/your/fasttext/model.bin"

scores = calculate_semantic_similarity(complex_word, candidates, fasttext_model_path)
print(scores)

### ZipF Value

In [35]:
import os
import re
import math
from collections import Counter

# Define the folder path where the .txt files are located
folder_path = "corpora"

# Initialize a dictionary to store word frequencies
word_frequencies = Counter()

# Iterate through all .txt files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            # Tokenize the text into words using regular expressions
            words = re.findall(r'\b\w+\b', text.lower())
            # Update word frequencies
            word_frequencies.update(words)

# Calculate the total number of words in the corpus
total_words = sum(word_frequencies.values())

# Initialize a dictionary to store Zipf values
zipf_values = {}

# Calculate Zipf values for each word
for word, frequency in word_frequencies.items():
    zipf_value = math.log10(frequency / (total_words / 1e9))
    zipf_values[word] = zipf_value

# Sort the words by Zipf value in descending order
sorted_zipf_values = sorted(zipf_values.items(), key=lambda x: x[1], reverse=True)

In [51]:
def calculate_zipf_score(candidates, zipf_values):
    # Calculate Zipf scores based on Zipf values
    scores = {}
    for candidate in candidates:
        # Retrieve the Zipf value for the candidate
        zipf_value = zipf_values.get(candidate, None)
        if zipf_value is not None:
            # You can use a custom scoring formula here if needed
            score = zipf_value
            scores[candidate] = score
    
    return scores

In [37]:
candidates = 'connectivity', 'complexity', 'networks', 'learning', 'memory'  # Replace with your candidate list

# Initialize a dictionary to store candidates and their Zipf values
filtered_candidates = {}

# Compare candidates with Zipf values and filter
for candidate in candidates:
    candidate_zipf_value = zipf_values.get(candidate, None)
    if candidate_zipf_value is not None and candidate_zipf_value >= 4:
        filtered_candidates[candidate] = candidate_zipf_value

# Convert the filtered candidates dictionary to a list of tuples
filtered_candidates_list = list(filtered_candidates.items())

# Print the filtered candidates and their Zipf values
for candidate, zipf_value in filtered_candidates_list:
    print(f"Candidate: {candidate}, Zipf Value: {zipf_value:.2f}")

Candidate: complexity, Zipf Value: 4.93
Candidate: networks, Zipf Value: 5.87
Candidate: learning, Zipf Value: 6.27
Candidate: memory, Zipf Value: 5.78


In [7]:
import spacy

def extract_keyword(sentence):
    # Load the English language model for spaCy
    nlp = spacy.load("en_core_web_sm")
    
    # Process the sentence with spaCy
    doc = nlp(sentence)
    
    # Initialize variables to store the most significant keyword and its label
    keyword = None
    max_label_score = -1
    
    # Iterate over entities recognized by spaCy
    for ent in doc.ents:
        # Consider entities that are not pronouns
        if ent.label_ != "PRON":
            # Calculate a score for the entity based on its label
            label_score = len(ent.text) * (ent.label_ == "ORG")  # Example: prioritize longer organization names
            if label_score > max_label_score:
                max_label_score = label_score
                keyword = ent.text
                
    return keyword

# Test the function with the provided text
sentence = 'One of the core aspects of NLP is sentiment analysis, a technique that involves computationally identifying and categorizing opinions expressed in a piece of text. This process typically involves parsing the relevant linguistic structures and patterns, often leveraging machine learning models like convolutional neural networks (CNNs) or recurrent neural networks (RNNs) for nuanced interpretation.'
keyword = extract_keyword(sentence)
print("Keyword:", keyword)

Keyword: NLP


In [11]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

def get_top_replacements(sentence, words_to_replace, top_k=3):
    # Load pre-trained model tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForMaskedLM.from_pretrained('bert-base-uncased')

    # Process each word to replace
    for word in words_to_replace:
        # Mask the target word
        masked_sentence = sentence.replace(word, tokenizer.mask_token)

        # Tokenize input
        input = tokenizer.encode(masked_sentence, return_tensors="pt")
        mask_token_index = torch.where(input == tokenizer.mask_token_id)[1]

        # Predict token
        token_logits = model(input).logits
        mask_token_logits = token_logits[0, mask_token_index, :]
        all_tokens = torch.topk(mask_token_logits, k=model.config.vocab_size, dim=1).indices[0].tolist()

        # Filter based on ZIPF value
        filtered_tokens = [token for token in all_tokens]

        # Display top replacement suggestions
        print(f"Top replacements for '{word}':")
        for token in filtered_tokens[:top_k]:
            print(tokenizer.decode([token]))
        print("\n")

# Example sentence
sentence1 = 'One of the core aspects of NLP is sentiment analysis, a technique that involves computationally identifying and categorizing opinions expressed in a piece of text. This process typically involves parsing the relevant linguistic structures and patterns, often leveraging machine learning models like convolutional neural networks (CNNs) or recurrent neural networks (RNNs) for nuanced interpretation.'
sentence2=' A probabilistic classifier additionally will tell us the probability of the observation being in the class.'
sentence3 = 'In such cases we can instead derive the positive and negative word features from sentiment lexicons'
sentence4 = 'The architecture we feedforward introduce is called a feedforward network '
sentence5= 'In subsequent chapters we’ll introduce many other aspects of neural models, such as recurrent neural networks and the transformer'
# Words to replace

# Get top replacements
get_top_replacements(sentence1, ['sentiment'])
get_top_replacements(sentence2, ['probabilistic'])
get_top_replacements(sentence3, ['lexicons'])
get_top_replacements(sentence4, ['feedforward'])
get_top_replacements(sentence5, ['transformer'])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Top replacements for 'sentiment':
opinion
discourse
content




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Top replacements for 'probabilistic':
class
different
particular




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Top replacements for 'lexicons':
.
;
!




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Top replacements for 'feedforward':
now
first
will




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Top replacements for 'transformer':
.
;
?




Rewrote the sentence or took the full sentence

Getting ready to further train BERT

In [2]:
import os

def load_and_split_texts(directory):
    all_sentences = []
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            filepath = os.path.join(directory, filename)
            with open(filepath, 'r', encoding='utf-8') as file:
                text = file.read()
                # Splitting text into sentences at every period
                sentences = text.split('.')
                all_sentences.extend(sentences)
    return all_sentences

# Usage
directory = 'corpora'
sentences = load_and_split_texts(directory)
print(f"Total sentences: {len(sentences)}")


Total sentences: 24078


In [3]:
from torch.utils.data import Dataset
from transformers import BertTokenizer

class SentenceDataset(Dataset):
    def __init__(self, sentences, tokenizer, max_length=512):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        # Tokenize the sentence and prepare the data in the format BERT expects
        inputs = self.tokenizer.encode_plus(
            sentence, 
            add_special_tokens=True, 
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten()
        }


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Use your function to load sentences
sentences = load_and_split_texts('corpora')

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create dataset
dataset = SentenceDataset(sentences, tokenizer)


In [5]:
from torch.utils.data import DataLoader
from transformers import BertForMaskedLM, AdamW

# Load pre-trained BERT model
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# DataLoader
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)
num_epochs = 20
# Training loop
model.train()
for epoch in range(num_epochs):
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch} completed.")

Downloading config.json: 100%|██████████| 286/286 [00:00<?, ?B/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading pytorch_model.bin:  93%|█████████▎| 41.9M/45.1M [00:08<00:00, 4.58MB/s]

KeyboardInterrupt: 

Downloading pytorch_model.bin:  93%|█████████▎| 41.9M/45.1M [00:19<00:00, 4.58MB/s]