In [21]:
# This notebook explores the use of SentenceBERT and spacy to generate concept embeddings from text

from transformers import BertTokenizer, BertModel
from torch.nn import CosineSimilarity
import torch

# Load pretrained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Example sentences
sentence1 = "How can I book a flight?"
sentence2 = "How can I purchase a flight? "

# Tokenize and encode the sentences
inputs1 = tokenizer(sentence1, return_tensors='pt')
inputs2 = tokenizer(sentence2, return_tensors='pt')

# Get the embeddings
with torch.no_grad():
    embeddings1 = model(**inputs1).last_hidden_state.mean(dim=1)
    embeddings2 = model(**inputs2).last_hidden_state.mean(dim=1)

# Compute cosine similarity
cos = CosineSimilarity(dim=1)
similarity = cos(embeddings1, embeddings2)

print(f"Similarity score: {similarity.item()}")




Similarity score: 0.9528940320014954


In [None]:
print(model)

In [None]:
import spacy
! python -m spacy download en_core_web_sm

def extract_concepts(sentence):
    # Load spaCy's English language model
    nlp = spacy.load('en_core_web_sm')
    
    # Process the sentence
    doc = nlp(sentence)
    
    # Extract concepts: noun chunks and optionally named entities
    concepts = [chunk.text for chunk in doc.noun_chunks]  # Extract noun chunks

    # Optionally, add named entities as concepts
    entities = [ent.text for ent in doc.ents]
    concepts.extend(entities)
    
    # Remove duplicates and return
    return list(set(concepts))

# Example usage
sentence = "Quantum computing and machine learning are transforming the field of artificial intelligence."
concepts = extract_concepts(sentence)
print(concepts)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load and preprocess data
# Assume load_data and preprocess_data are defined to load your dataset
data = load_data('dataset.csv')
X, y = preprocess_data(data)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Define the model
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128))  # Adjust input_dim as needed
model.add(Bidirectional(LSTM(64)))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

# Evaluate the model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred > 0.5))

In [None]:
from textsplit.tools import SimpleSentenceTokenizer
from textsplit.algorithm import split_optimal

# Load the text
text = "Mars is the fourth planet from the Sun. The surface of Mars is orange-red because it is covered in iron(III) oxide dust, giving it the nickname 'the Red Planet'. Mars is among the brightest objects in Earth's sky, and its high-contrast albedo features have made it a common subject for telescope viewing. It is classified as a terrestrial planet and is the second smallest of the Solar System's planets with a diameter of 6,779 km (4,212 mi). In terms of orbital motion, a Martian solar day (sol) is equal to 24.5 hours, and a Martian solar year is equal to 1.88 Earth years (687 Earth days). Mars has two natural satellites that are small and irregular in shape: Phobos and Deimos."

# Tokenize the text into sentences
sentence_tokenizer = SimpleSentenceTokenizer()
sentences = sentence_tokenizer(text)

# Split the text into segments
splits = split_optimal(sentences, penalty=0.1)

# Print the segments
for segment in splits:
        print(" ".join(sentences[segment[0]:segment[1]]))

In [14]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("TankuVie/bert-finetuned-unpunctual-text-segmentation")
model = AutoModelForTokenClassification.from_pretrained("TankuVie/bert-finetuned-unpunctual-text-segmentation")

In [24]:
import torch

# Load the text
text = "Mars is the fourth planet from the Sun. The surface of Mars is orange-red because it is covered in iron(III) oxide dust, giving it the nickname 'the Red Planet'. Mars is among the brightest objects in Earth's sky, and its high-contrast albedo features have made it a common subject for telescope viewing. It is classified as a terrestrial planet and is the second smallest of the Solar System's planets with a diameter of 6,779 km (4,212 mi). In terms of orbital motion, a Martian solar day (sol) is equal to 24.5 hours, and a Martian solar year is equal to 1.88 Earth years (687 Earth days). Mars has two natural satellites that are small and irregular in shape: Phobos and Deimos."

# Tokenize the text
inputs = tokenizer(text, return_tensors="pt")

inputs


{'input_ids': tensor([[  101,  7733,  2003,  1996,  2959,  4774,  2013,  1996,  3103,  1012,
          1996,  3302,  1997,  7733,  2003,  4589,  1011,  2417,  2138,  2009,
          2003,  3139,  1999,  3707,  1006,  3523,  1007, 15772,  6497,  1010,
          3228,  2009,  1996,  8367,  1005,  1996,  2417,  4774,  1005,  1012,
          7733,  2003,  2426,  1996, 26849,  5200,  1999,  3011,  1005,  1055,
          3712,  1010,  1998,  2049,  2152,  1011,  5688,  2632, 28759,  2838,
          2031,  2081,  2009,  1037,  2691,  3395,  2005, 12772, 10523,  1012,
          2009,  2003,  6219,  2004,  1037, 12350,  4774,  1998,  2003,  1996,
          2117, 10479,  1997,  1996,  5943,  2291,  1005,  1055, 11358,  2007,
          1037,  6705,  1997,  1020,  1010,  6255,  2683,  2463,  1006,  1018,
          1010, 18164,  2771,  1007,  1012,  1999,  3408,  1997, 13943,  4367,
          1010,  1037, 20795,  5943,  2154,  1006, 14017,  1007,  2003,  5020,
          2000,  2484,  1012,  1019,  

In [31]:
# Get the predicted segment labels
with torch.no_grad():
    outputs = model(**inputs)
    print(outputs)
    predicted_labels = outputs.last_hidden_state.argmax(dim=2)
    print(predicted_labels)

tokenizer.decode(predicted_labels[0])

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-1.0204, -0.1012,  0.2194,  ...,  0.2456,  0.6954,  1.1194],
         [ 0.3022,  1.0192,  0.4375,  ..., -0.0246,  0.9316,  0.6587],
         [-1.4419, -0.0301,  0.3008,  ...,  0.1404,  0.4928,  0.7091],
         ...,
         [-0.8128, -0.0673, -0.9113,  ..., -1.0055,  1.0571, -0.5666],
         [-0.8968, -0.8013, -0.2706,  ...,  0.6588,  0.9557, -0.3104],
         [-0.9273, -0.0457,  0.2136,  ...,  0.6011,  0.2943, -0.0431]]]), pooler_output=tensor([[-0.8164, -0.5264, -0.9291,  0.6446,  0.8406, -0.3913, -0.1817,  0.5292,
         -0.8709, -1.0000, -0.7165,  0.9693,  0.9673,  0.2042,  0.3778, -0.5459,
         -0.3514, -0.5353,  0.4212,  0.9009,  0.3931,  1.0000, -0.4791,  0.5602,
          0.4955,  0.9852, -0.7595,  0.8237,  0.9070,  0.7706, -0.1224,  0.4476,
         -0.9929, -0.1334, -0.9056, -0.9813,  0.5890, -0.4938,  0.1892, -0.2136,
         -0.7540,  0.5457,  1.0000, -0.2556,  0.6084, -0.2269, -1.0000,  0.

'[unused200] [unused584] [unused347] [unused347] [unused206] [unused501] [unused281] [unused347] [unused599] [unused145] [unused347] [unused47] [unused347] [unused276] [unused347] [unused695] [unused224] [unused347] [unused80] [unused347] [unused347] [unused347] [unused6] [unused347] [unused144] [unused80] [unused167] [unused64] [unused347] [unused751] [unused347] [unused347] [unused347] [unused573] [unused224] [unused347] [unused347] [unused634] [unused167] [unused550] [unused276] [unused347] [unused261] [unused347] [unused347] [unused46] [unused47] [unused333] [unused665] [unused347] [unused6] [unused751] [unused628] [unused347] [unused54] [unused224] [unused347] [unused405] [unused276] [unused405] [unused224] [unused347] [unused347] [unused654] [unused347] [unused747] [unused734] [unused162] [unused360] [unused550] [unused347] [unused347] [SEP] [unused374] [unused276] [unused1] [unused6] [unused628] [unused347] [unused347] [unused115] [unused500] [unused281] [unused224] [unused599] 

In [None]:
# Extract segments based on predicted labels
segments = []
segment = []
for i, label in enumerate(predicted_labels[0]):
    if label == 1:
        segments.append(segment)
        segment = []
    segment.append(tokenizer.convert_ids_to_tokens(inputs.input_ids[0][i].item()))

In [17]:
segments = [" ".join(segment) for segment in segments if segment]
print(segments)

['[CLS]', "Mars is the fourth planet from the Sun . The surface of Mars is orange - red because it is covered in iron ( III ) o ##xide dust , giving it the nickname ' the Red Planet ' . Mars is among the bright ##est objects in Earth ' s sky , and its high - contrast albedo features have made it a common subject for tele ##scope view ##ing . It is classified as a terrestrial planet and is the second smallest of the Solar System ' s planets with a diameter of 6 , 779 km ( 4 , 212 mi ) .", 'In terms of orbital motion , a Mart ##ian solar day ( sol ) is equal to 24 . 5 hours , and a Mart ##ian solar year is equal to 1 . 88 Earth years ( 687 Earth days ) .']


['[CLS]', "Mars is the fourth planet from the Sun . The surface of Mars is orange - red because it is covered in iron ( III ) o ##xide dust , giving it the nickname ' the Red Planet ' . Mars is among the bright ##est objects in Earth ' s sky , and its high - contrast albedo features have made it a common subject for tele ##scope view ##ing . It is classified as a terrestrial planet and is the second smallest of the Solar System ' s planets with a diameter of 6 , 779 km ( 4 , 212 mi ) .", 'In terms of orbital motion , a Mart ##ian solar day ( sol ) is equal to 24 . 5 hours , and a Mart ##ian solar year is equal to 1 . 88 Earth years ( 687 Earth days ) .']
