In [39]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [40]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
text = "fixed some syntax errors in the connector controller"
inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)

In [42]:
outputs = model(**inputs)
logits = outputs.logits
important_sentences = torch.argmax(logits, dim=-1)

# For demonstration, assume the first sentence is the summary
summary = tokenizer.decode(inputs.input_ids[0][important_sentences])
print("Summary:", summary)

Summary: fixed


In [28]:
import torch
from transformers import BertTokenizer, BertModel
from scipy.spatial.distance import cosine
import numpy as np

In [29]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [30]:
# Function to get BERT embeddings
def get_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[0]

# Function to calculate cosine similarity
def cosine_similarity(vec1, vec2):
    return 1 - cosine(vec1, vec2)

# Generate embeddings for a list of words
def get_word_embeddings(word_list):
    embeddings = {}
    for word in word_list:
        embeddings[word] = get_bert_embeddings(word).mean(dim=0).numpy()
    return embeddings

# Generate synonyms and related words
def generate_synonyms_and_related(target_word, word_list, top_n=5):
    word_embeddings = get_word_embeddings(word_list)
    target_embedding = get_bert_embeddings(target_word).mean(dim=0).numpy()
    
    similarities = {word: cosine_similarity(target_embedding, emb) for word, emb in word_embeddings.items()}
    sorted_similarities = sorted(similarities.items(), key=lambda item: item[1], reverse=True)
    
    return sorted_similarities[:top_n]

In [31]:
# Example usage
sentence = "She felt happy about the results."
target_word = "happy"
word_list = ["joyful", "pleased", "content", "delighted", "elated", "sad", "unhappy", "miserable", "dejected"]

In [32]:
synonyms_and_related = generate_synonyms_and_related(target_word, word_list)
print(f"Synonyms and related words for '{target_word}':")
for word, similarity in synonyms_and_related:
    print(f"{word} (similarity: {similarity:.4f})")

Synonyms and related words for 'happy':
pleased (similarity: 0.9016)
sad (similarity: 0.8965)
unhappy (similarity: 0.8582)
miserable (similarity: 0.8540)
delighted (similarity: 0.8236)


In [9]:
import nltk
from nltk.corpus import wordnet as wn

# Download WordNet data if not already downloaded
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\iohkg\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\iohkg\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [10]:
def get_related_words(word):
    synonyms = set()
    for syn in wn.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return list(synonyms)

In [11]:
# Example usage
target_word = "implement"
related_words = get_related_words(target_word)
print(f"Related words for '{target_word}': {related_words}")

Related words for 'implement': ['follow_through', 'carry_out', 'follow_up', 'implement', 'put_through', 'follow_out', 'go_through', 'apply', 'enforce']


In [61]:
import gensim
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [59]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\iohkg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\iohkg\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [38]:
# Load GloVe model
def load_glove_model(glove_file):
    with open(glove_file, 'r', encoding='utf-8') as f:
        model = {}
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array([float(val) for val in split_line[1:]])
            model[word] = embedding
    return model

# Function to calculate cosine similarity
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    return dot_product / (norm1 * norm2)

# Function to get related words using GloVe embeddings
def get_related_words_glove(word, model, top_n=5):
    if word not in model:
        return f"The word '{word}' is not in the vocabulary."
    
    word_embedding = model[word]
    similarities = {}
    
    for other_word, other_embedding in model.items():
        if other_word != word:
            similarity = cosine_similarity(word_embedding, other_embedding)
            similarities[other_word] = similarity
    
    sorted_similarities = sorted(similarities.items(), key=lambda item: item[1], reverse=True)
    
    return sorted_similarities[:top_n]

In [3]:
# Path to the GloVe file
glove_file = '../../../../../glove/glove.6B.300d.txt'

# Load the GloVe model
glove_model = load_glove_model(glove_file)

In [35]:
# Example sentence
sentence = "fixed some syntax errors in the connector controller"

# Example usage
tokens = word_tokenize(sentence.lower())

# Get related words for each token
related_words_dict = {}
for token in tokens:
    related_words = get_related_words_glove(token, glove_model)
    related_words_dict[token] = related_words

In [36]:
tokens

['fixed', 'some', 'syntax', 'errors', 'in', 'the', 'connector', 'controller']

In [37]:
# Print related words for each token
for token, related_words in related_words_dict.items():
    print(f"Related words for '{token}':")
    for word, similarity in related_words:
        print(f"  {word} (similarity: {similarity:.4f})")

Related words for 'fixed':
  rate (similarity: 0.4819)
  adjustable (similarity: 0.4738)
  rates (similarity: 0.4600)
  value (similarity: 0.4506)
  variable (similarity: 0.4453)
  basis (similarity: 0.4304)
  minimum (similarity: 0.4297)
  specified (similarity: 0.4261)
  income (similarity: 0.4254)
  investments (similarity: 0.4169)
Related words for 'some':
  many (similarity: 0.8490)
  few (similarity: 0.8076)
  those (similarity: 0.7384)
  other (similarity: 0.7381)
  have (similarity: 0.7370)
  more (similarity: 0.7359)
  several (similarity: 0.7230)
  others (similarity: 0.7125)
  these (similarity: 0.7073)
  even (similarity: 0.7060)
Related words for 'syntax':
  semantics (similarity: 0.6332)
  phonology (similarity: 0.5579)
  codice_1 (similarity: 0.5574)
  vocabulary (similarity: 0.5521)
  xml (similarity: 0.5345)
  morphology (similarity: 0.5337)
  syntactic (similarity: 0.5309)
  parsing (similarity: 0.5277)
  punctuation (similarity: 0.5196)
  html (similarity: 0.5195)
Re

In [54]:
from transformers import BartForConditionalGeneration, BartTokenizer

# Load the pre-trained BART model and tokenizer
model_name = "facebook/bart-large-cnn"
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)

# Function to summarize text
def summarize_text(text, max_length=150, min_length=40, num_beams=4):
    # Tokenize the input text
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
    
    # Generate summary
    summary_ids = model.generate(inputs, max_length=max_length, min_length=min_length, num_beams=num_beams, length_penalty=2.0, early_stopping=True)
    
    # Decode the summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    
    return summary

In [56]:
# Example usage
text = "Fixed and updated some important features for the new versions"
summary = summarize_text(text, 10, 10)[11:]
print("Original Text:")
print(text)
print("\nSummary:")
print(summary)

Original Text:
Fixed and updated some important features for the new versions

Summary:
Fixed and updated


In [62]:
# Example usage
tokens = word_tokenize(summary.lower())
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token not in stop_words and token.isalnum()]

# Get related words for each token
related_words_dict = {}
for token in filtered_tokens:
    related_words = get_related_words_glove(token, glove_model)
    related_words_dict[token] = related_words

In [63]:
# Print related words for each token
for token, related_words in related_words_dict.items():
    print(f"Related words for '{token}':")
    for word, similarity in related_words:
        print(f"  {word} (similarity: {similarity:.4f})")

Related words for 'fixed':
  rate (similarity: 0.4819)
  adjustable (similarity: 0.4738)
  rates (similarity: 0.4600)
  value (similarity: 0.4506)
  variable (similarity: 0.4453)
Related words for 'updated':
  update (similarity: 0.7165)
  updating (similarity: 0.6303)
  revised (similarity: 0.5968)
  updates (similarity: 0.5768)
  version (similarity: 0.5056)


In [65]:
related_words_only = {}
for token, related_words in related_words_dict.items():
    related_words_only[token] = []
    for word, similarity in related_words:
        related_words_only[token].append(word)
print(related_words_only)

{'fixed': ['rate', 'adjustable', 'rates', 'value', 'variable'], 'updated': ['update', 'updating', 'revised', 'updates', 'version']}
