In [18]:
!pip install datasets



In [19]:
from datasets import load_dataset

ds = load_dataset("dataset-org/dialog_re")

In [5]:
ds

DatasetDict({
    train: Dataset({
        features: ['dialog', 'relation_data'],
        num_rows: 1073
    })
    test: Dataset({
        features: ['dialog', 'relation_data'],
        num_rows: 357
    })
    validation: Dataset({
        features: ['dialog', 'relation_data'],
        num_rows: 358
    })
})

In [6]:
ds["train"][0]

{'dialog': ["Speaker 1: It's been an hour and not one of my classmates has shown up! I tell you, when I actually die some people are gonna get seriously haunted!",
  'Speaker 2: There you go! Someone came!',
  "Speaker 1: Ok, ok! I'm gonna go hide! Oh, this is so exciting, my first mourner!",
  'Speaker 3: Hi, glad you could come.',
  'Speaker 2: Please, come in.',
  "Speaker 4: Hi, you're Chandler Bing, right? I'm Tom Gordon, I was in your class.",
  'Speaker 2: Oh yes, yes... let me... take your coat.',
  "Speaker 4: Thanks... uh... I'm so sorry about Ross, it's...",
  'Speaker 2: At least he died doing what he loved... watching blimps.',
  'Speaker 1: Who is he?',
  'Speaker 2: Some guy, Tom Gordon.',
  "Speaker 1: I don't remember him, but then again I touched so many lives.",
  'Speaker 3: So, did you know Ross well?',
  "Speaker 4: Oh, actually I barely knew him. Yeah, I came because I heard Chandler's news. D'you know if he's seeing anyone?",
  'Speaker 3: Yes, he is. Me.',
  'S

In [7]:
for example in ds['train']:
    # Create the document for each example
    document = ' '.join(example['dialog'])

In [8]:
document

"Speaker 1: Buon Giorno, Bella Phoebe! Speaker 2: Oh, Paolo, hi, what are you doing here? Speaker 1: Uh, Racquela tell me you massage, eh? Speaker 2: Well, Racquela's right, yeah! Speaker 2: Oh, okay, I don't know what you just said, so let's get started. Speaker 3, Speaker 4: Hey Phoebe! Speaker 5: Hi Pheebs! Speaker 6: Pheebs! Speaker 2: Fine! Speaker 7: Phoebe, what's the matter? Speaker 2: Nothing, I'm sorry, I'm just, I'm out of sorts."

In [6]:
!pip install spacy
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [13]:
import numpy as np

def get_sentence_embedding(text):
    doc = nlp(text)
    return doc.vector  # Mean of all token vectors

# Example usage
sentence_vectors = []

for example in ds['train']:
    document = ' '.join(example['dialog'])  # Merge dialogue
    sentence_vector = get_sentence_embedding(document)  # Convert to embedding
    sentence_vectors.append(sentence_vector)

print(np.array(sentence_vectors).shape)  # Check dimensions


(1073, 300)


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Extract relation labels
relation_labels = [example['relation_data']['r'][0][0] for example in ds['train']]

# Convert relation labels to numbers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(relation_labels)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(sentence_vectors, y, test_size=0.2, random_state=42)


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Train Logistic Regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Evaluate model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.20      0.20      0.20         5
           2       0.42      0.78      0.54        80
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         4
           5       0.00      0.00      0.00         4
           6       1.00      0.50      0.67         4
           9       0.00      0.00      0.00         5
          10       0.33      0.20      0.25        15
          11       0.21      0.17      0.19        18
          12       0.00      0.00      0.00         3
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         9
          17       1.00      0.50      0.67         2
          18       0.00      0.00      0.00         3
          19       0.00      0.00      0.00         2
          20       0.10    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
def predict_relation(text, model, label_encoder):
    sentence_vector = get_sentence_embedding(text)  # Convert to vector
    predicted_label = model.predict([sentence_vector])[0]  # Predict label
    return label_encoder.inverse_transform([predicted_label])[0]  # Convert back to text

# Example usage
new_text = "Ross and Chandler were classmates in college."
predicted_relation = predict_relation(new_text, clf, label_encoder)
print(f"Predicted relation: {predicted_relation}")


Predicted relation: unanswerable


In [None]:
import spacy
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from datasets import load_dataset
from collections import Counter
from imblearn.over_sampling import SMOTE

# Load the dataset
ds = load_dataset("dataset-org/dialog_re")

# Load spaCy model with GloVe embeddings
nlp = spacy.load("en_core_web_md")
embedding_dim = nlp("word").vector.shape[0]  # Get vector size

# Function to extract word vector for an entity
def get_entity_embedding(entity):
    doc = nlp(entity)
    vectors = [token.vector for token in doc if token.has_vector]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(embedding_dim)  # Return zero vector if no embedding found

# Function to create feature vectors
def create_feature_vector(dialog, entity1, entity2):
    sentence_vector = np.mean([token.vector for token in nlp(dialog) if token.has_vector], axis=0)

    if sentence_vector is None or np.isnan(sentence_vector).any():
        sentence_vector = np.zeros(embedding_dim)

    entity1_vector = get_entity_embedding(entity1)
    entity2_vector = get_entity_embedding(entity2)

    # Concatenate sentence embedding with entity embeddings
    feature_vector = np.concatenate((sentence_vector, entity1_vector, entity2_vector))
    return feature_vector

# Prepare data for training
X, y = [], []

for example in ds["train"]:
    dialog = " ".join(example["dialog"])  # Combine dialog lines into one text
    for i in range(len(example["relation_data"]["x"])):
        entity1 = example["relation_data"]["x"][i]
        entity2 = example["relation_data"]["y"][i]
        relation = example["relation_data"]["r"][i][0]  # Take first relation (if multiple exist)

        feature_vector = create_feature_vector(dialog, entity1, entity2)
        X.append(feature_vector)
        y.append(relation)

# Convert lists to numpy arrays
X = np.array(X)
y = np.array(y)

# Filter out classes with fewer than 2 samples
class_counts = Counter(y)
valid_classes = {cls for cls, count in class_counts.items() if count > 1}
X_filtered = [X[i] for i in range(len(y)) if y[i] in valid_classes]
y_filtered = [y[i] for i in range(len(y)) if y[i] in valid_classes]
X = np.array(X_filtered)
y = np.array(y_filtered)

# Adjust k_neighbors for SMOTE
min_samples = min(Counter(y).values())
target_k_neighbors = max(1, min(min_samples - 1, 5))
smote = SMOTE(random_state=42, k_neighbors=target_k_neighbors)

# Apply SMOTE only if we have enough samples
if min_samples > 1:
    X, y = smote.fit_resample(X, y)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Train an SVM model with class weighting
clf = SVC(kernel='linear', class_weight='balanced')
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))

# Inference function
def predict_relation(dialog, entity1, entity2):
    vec = create_feature_vector(dialog, entity1, entity2).reshape(1, -1)
    return clf.predict(vec)[0]

# Example usage
test_dialog = "Speaker 1: Hey, do you know Chandler Bing? Speaker 2: Oh yes, he is my classmate."
print("Predicted Relation:", predict_relation(test_dialog, "Chandler Bing", "classmate"))

In [None]:
import spacy
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from datasets import load_dataset
from collections import Counter
from imblearn.over_sampling import SMOTE

# Load the dataset containing dialogues and relation annotations
ds = load_dataset("dataset-org/dialog_re")

# Load the pre-trained spaCy model with GloVe word embeddings
nlp = spacy.load("en_core_web_md")
embedding_dim = nlp("word").vector.shape[0]  # Get the dimensionality of word embeddings

# Function to extract the word vector representation for an entity
def get_entity_embedding(entity):
    doc = nlp(entity)  # Process entity with spaCy
    vectors = [token.vector for token in doc if token.has_vector]  # Extract word vectors if available
    if vectors:
        return np.mean(vectors, axis=0)  # Return the average vector of the entity
    else:
        return np.zeros(embedding_dim)  # Return a zero vector if no embedding is found

# Function to create a feature vector for a given dialogue and entity pair
def create_feature_vector(dialog, entity1, entity2):
    sentence_vector = np.mean([token.vector for token in nlp(dialog) if token.has_vector], axis=0)  # Compute sentence embedding

    # Handle cases where the sentence embedding is empty
    if sentence_vector is None or np.isnan(sentence_vector).any():
        sentence_vector = np.zeros(embedding_dim)

    entity1_vector = get_entity_embedding(entity1)  # Get embedding for entity 1
    entity2_vector = get_entity_embedding(entity2)  # Get embedding for entity 2

    # Concatenate the sentence embedding with both entity embeddings to form the feature vector
    feature_vector = np.concatenate((sentence_vector, entity1_vector, entity2_vector))
    return feature_vector

# Prepare the dataset for training
X, y = [], []

for example in ds["train"]:
    dialog = " ".join(example["dialog"])  # Combine all dialogue lines into a single text
    for i in range(len(example["relation_data"]["x"])):
        entity1 = example["relation_data"]["x"][i]  # Extract the first entity
        entity2 = example["relation_data"]["y"][i]  # Extract the second entity
        relation = example["relation_data"]["r"][i][0]  # Extract the first relation label (if multiple exist)

        feature_vector = create_feature_vector(dialog, entity1, entity2)  # Generate feature vector
        X.append(feature_vector)  # Append feature vector to training data
        y.append(relation)  # Append relation label to target labels

# Convert feature lists to NumPy arrays for efficient processing
X = np.array(X)
y = np.array(y)

# Filter out classes that have fewer than 2 samples to avoid train-test split issues
class_counts = Counter(y)
valid_classes = {cls for cls, count in class_counts.items() if count > 1}
X_filtered = [X[i] for i in range(len(y)) if y[i] in valid_classes]
y_filtered = [y[i] for i in range(len(y)) if y[i] in valid_classes]
X = np.array(X_filtered)
y = np.array(y_filtered)

# Determine the smallest class size to adjust k_neighbors in SMOTE
min_samples = min(Counter(y).values())
target_k_neighbors = max(1, min(min_samples - 1, 5))  # Ensure k_neighbors is valid
smote = SMOTE(random_state=42, k_neighbors=target_k_neighbors)

# Apply SMOTE only if there are enough samples to generate synthetic data
if min_samples > 1:
    X, y = smote.fit_resample(X, y)

# Split data into training and testing sets with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Train a Support Vector Machine (SVM) model with class weighting to handle imbalanced data
clf = SVC(kernel='linear', class_weight='balanced')
clf.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))

# Function to predict the relation between two entities given a dialogue
def predict_relation(dialog, entity1, entity2):
    vec = create_feature_vector(dialog, entity1, entity2).reshape(1, -1)  # Convert input into a feature vector
    return clf.predict(vec)[0]  # Return predicted relation

# Example usage
test_dialog = "Speaker 1: Hey, do you know Chandler Bing? Speaker 2: Oh yes, he is my classmate."
print("Predicted Relation:", predict_relation(test_dialog, "Chandler Bing", "classmate"))

In [5]:
import spacy
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from datasets import load_dataset

# Load the dataset containing dialogues and their relation annotations
ds = load_dataset("dataset-org/dialog_re")

# Load the pre-trained spaCy model with GloVe word embeddings
nlp = spacy.load("en_core_web_md")

# Function to extract the word vector representation for an entity
def get_entity_embedding(entity):
    doc = nlp(entity)  # Process entity with spaCy
    vectors = [token.vector for token in doc if token.has_vector]  # Extract word vectors if available
    if vectors:
        return np.mean(vectors, axis=0)  # Return the average vector of the entity
    else:
        return np.zeros(nlp("word").vector.shape[0])  # Return a zero vector if no embedding is found

# Function to create a feature vector for a given dialogue and entity pair
def create_feature_vector(dialog, entity1, entity2):
    sentence_vector = np.mean([token.vector for token in nlp(dialog) if token.has_vector], axis=0)  # Compute sentence embedding

    # Handle cases where the sentence embedding is empty
    if sentence_vector is None or np.isnan(sentence_vector).any():
        sentence_vector = np.zeros(nlp("word").vector.shape[0])

    entity1_vector = get_entity_embedding(entity1)  # Get embedding for entity 1
    entity2_vector = get_entity_embedding(entity2)  # Get embedding for entity 2

    # Concatenate the sentence embedding with both entity embeddings to form the feature vector
    feature_vector = np.concatenate((sentence_vector, entity1_vector, entity2_vector))
    return feature_vector

# Prepare the dataset for training
X, y = [], []

for example in ds["train"]:
    dialog = " ".join(example["dialog"])  # Combine all dialogue lines into a single text
    for i in range(len(example["relation_data"]["x"])):
        entity1 = example["relation_data"]["x"][i]  # Extract the first entity
        entity2 = example["relation_data"]["y"][i]  # Extract the second entity
        relation = example["relation_data"]["r"][i][0]  # Extract the first relation label (if multiple exist)

        feature_vector = create_feature_vector(dialog, entity1, entity2)  # Generate feature vector
        X.append(feature_vector)  # Append feature vector to training data
        y.append(relation)  # Append relation label to target labels

# Convert feature lists to NumPy arrays for efficient processing
X = np.array(X)
y = np.array(y)

# Split data into training and testing sets with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Train a Support Vector Machine (SVM) model with class weighting to handle imbalanced data
clf = SVC(kernel='linear', class_weight='balanced')
clf.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))

# Function to predict the relation between two entities given a dialogue
def predict_relation(dialog, entity1, entity2):
    vec = create_feature_vector(dialog, entity1, entity2).reshape(1, -1)  # Convert input into a feature vector
    return clf.predict(vec)[0]  # Return predicted relation

# Example usage
test_dialog = "Speaker 1: Hey, do you know Chandler Bing? Speaker 2: Oh yes, he is my classmate."
print("Predicted Relation:", predict_relation(test_dialog, "Chandler Bing", "classmate"))


OSError: [E050] Can't find model 'en_core_web_md'. It doesn't seem to be a Python package or a valid path to a data directory.

In [20]:
!wget http://nlp.stanford.edu/data/glove.6B.zip


--2025-02-26 13:49:13--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-02-26 13:49:13--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-02-26 13:49:14--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’

gl

In [9]:
!unzip glove.6B.zip


Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [12]:
import numpy as np
def load_glove_embeddings(path):
    embeddings_dict = {}
    with open(path, "r") as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:])
            embeddings_dict[word] = vector
    return embeddings_dict

glove_path = "glove.6B.100d.txt"
glove_embeddings = load_glove_embeddings(glove_path)

print("'hello' embedding: ", glove_embeddings.get("hello"))


'hello' embedding:  ['0.26688' '0.39632' '0.6169' '-0.77451' '-0.1039' '0.26697' '0.2788'
 '0.30992' '0.0054685' '-0.085256' '0.73602' '-0.098432' '0.5479'
 '-0.030305' '0.33479' '0.14094' '-0.0070003' '0.32569' '0.22902'
 '0.46557' '-0.19531' '0.37491' '-0.7139' '-0.51775' '0.77039' '1.0881'
 '-0.66011' '-0.16234' '0.9119' '0.21046' '0.047494' '1.0019' '1.1133'
 '0.70094' '-0.08696' '0.47571' '0.1636' '-0.44469' '0.4469' '-0.93817'
 '0.013101' '0.085964' '-0.67456' '0.49662' '-0.037827' '-0.11038'
 '-0.28612' '0.074606' '-0.31527' '-0.093774' '-0.57069' '0.66865'
 '0.45307' '-0.34154' '-0.7166' '-0.75273' '0.075212' '0.57903' '-0.1191'
 '-0.11379' '-0.10026' '0.71341' '-1.1574' '-0.74026' '0.40452' '0.18023'
 '0.21449' '0.37638' '0.11239' '-0.53639' '-0.025092' '0.31886' '-0.25013'
 '-0.63283' '-0.011843' '1.377' '0.86013' '0.20476' '-0.36815' '-0.68874'
 '0.53512' '-0.46556' '0.27389' '0.4118' '-0.854' '-0.046288' '0.11304'
 '-0.27326' '0.15636' '-0.20334' '0.53586' '0.59784' '0.6046

In [7]:
# Load the dataset containing dialogues and their relation annotations
ds = load_dataset("dataset-org/dialog_re")


# Load the pre-trained spaCy model with GloVe word embeddings
nlp = spacy.load("glove_vectors")

OSError: [E050] Can't find model 'glove_vectors'. It doesn't seem to be a Python package or a valid path to a data directory.

In [13]:
import numpy as np
import spacy
from datasets import load_dataset

# Load the dataset
ds = load_dataset("dataset-org/dialog_re")

# Load spaCy tokenizer (without built-in word vectors)
nlp = spacy.blank("en")

# Function to load GloVe embeddings from file
def load_glove_embeddings(path):
    embeddings_dict = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            values = line.split()
            word = values[0]  # Extract word
            vector = np.asarray(values[1:], dtype=np.float32)  # Convert to numpy array
            embeddings_dict[word] = vector
    return embeddings_dict

# Load 100d GloVe embeddings
glove_path = "glove.6B.100d.txt"
glove_embeddings = load_glove_embeddings(glove_path)

# Get embedding size
embedding_dim = len(next(iter(glove_embeddings.values())))

# Function to get GloVe embedding for a token
def get_word_embedding(token):
    return glove_embeddings.get(token.lower(), np.zeros(embedding_dim))  # Return zero vector if not found

# Function to tokenize and apply embeddings
def process_dialog(dialog):
    tokenized_dialog = []  # Store tokenized words
    word_embeddings = []  # Store corresponding word embeddings

    for line in dialog:
        doc = nlp(line)  # Tokenize the line
        for token in doc:
            tokenized_dialog.append(token.text)  # Store tokens
            word_embeddings.append(get_word_embedding(token.text))  # Store embeddings

    return tokenized_dialog, np.array(word_embeddings)

# Select a sample dialog
sample_dialog = ds["train"][0]["dialog"]

# Tokenize and obtain word embeddings
tokens, embeddings = process_dialog(sample_dialog)

# Display results
print("Tokens:", tokens[:10])  # Show first 10 tokens
print("Word Embedding Shape:", embeddings.shape)  # Shape should be (num_tokens, 100)

# Function to create contextualized embeddings (sentence-level)
def get_contextualized_embedding(embeddings, method="average"):
    if method == "average":
        return np.mean(embeddings, axis=0)  # Compute mean across tokens
    elif method == "concat":
        return np.concatenate(embeddings, axis=None)  # Concatenate token embeddings
    else:
        raise ValueError("Unsupported method: choose 'average' or 'concat'.")

# Compute a contextualized representation of the dialog
contextualized_embedding = get_contextualized_embedding(embeddings, method="average")

print("Contextualized Embedding Shape:", contextualized_embedding.shape)


Tokens: ['Speaker', '1', ':', 'It', "'s", 'been', 'an', 'hour', 'and', 'not']
Word Embedding Shape: (309, 100)
Contextualized Embedding Shape: (100,)


In [17]:
import numpy as np
import spacy
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from datasets import load_dataset
from collections import Counter

# Load dataset
ds = load_dataset("dataset-org/dialog_re")

# Load spaCy tokenizer (without built-in word vectors)
nlp = spacy.blank("en")

# Function to load GloVe embeddings from file
def load_glove_embeddings(path):
    embeddings_dict = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            values = line.split()
            word = values[0]  # Extract word
            vector = np.asarray(values[1:], dtype=np.float32)  # Convert to numpy array
            embeddings_dict[word] = vector
    return embeddings_dict

# Load GloVe embeddings
glove_path = "glove.6B.100d.txt"
glove_embeddings = load_glove_embeddings(glove_path)

# Get embedding size
embedding_dim = len(next(iter(glove_embeddings.values())))

# Function to get GloVe embedding for a token
def get_word_embedding(token):
    return glove_embeddings.get(token.lower(), np.zeros(embedding_dim))  # Return zero vector if not found

# Function to process dialogue and get average embeddings
def process_relation_data(dialog, entities, relations):
    relation_embeddings = []  # Store feature vectors
    relation_labels = []  # Store relation labels

    for i in range(len(entities["x"])):
        entity1 = entities["x"][i]
        entity2 = entities["y"][i]
        relation = relations[i][0]  # Take first relation if multiple exist

        # Concatenate dialog into one text
        full_text = " ".join(dialog)
        doc = nlp(full_text)  # Tokenize text

        # Get embeddings for all words
        word_vectors = [get_word_embedding(token.text) for token in doc]

        if word_vectors:  # Ensure we have embeddings
            avg_embedding = np.mean(word_vectors, axis=0)  # Average word embeddings
        else:
            avg_embedding = np.zeros(embedding_dim)  # Default to zero vector if empty

        relation_embeddings.append(avg_embedding)
        relation_labels.append(relation)

    return np.array(relation_embeddings), np.array(relation_labels)

# Prepare dataset
X, y = [], []
for example in ds["train"]:
    dialog = example["dialog"]  # Get conversation text
    entities = example["relation_data"]  # Get entity pairs
    relations = example["relation_data"]["r"]  # Get relations

    X_sample, y_sample = process_relation_data(dialog, entities, relations)

    X.extend(X_sample)
    y.extend(y_sample)

# Convert lists to numpy arrays
X = np.array(X)
y = np.array(y)

# Count occurrences of each relation
relation_counts = Counter(y)

# Remove relations with fewer than 2 samples
valid_relations = {relation for relation, count in relation_counts.items() if count >= 2}
X_filtered = [X[i] for i in range(len(y)) if y[i] in valid_relations]
y_filtered = [y[i] for i in range(len(y)) if y[i] in valid_relations]
X = np.array(X_filtered)
y = np.array(y_filtered)

# Ensure dataset is not empty
if len(set(y)) < 2:
    raise ValueError("Not enough valid classes with at least 2 samples each after filtering.")

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Train a Support Vector Machine (SVM) model
clf = SVC(kernel='linear', class_weight='balanced')
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))

# Inference function
def predict_relation(dialog):
    doc = nlp(dialog)  # Tokenize input text
    word_vectors = [get_word_embedding(token.text) for token in doc]

    if word_vectors:
        avg_embedding = np.mean(word_vectors, axis=0)  # Average word embeddings
    else:
        avg_embedding = np.zeros(embedding_dim)  # Default zero vector if no valid words

    return clf.predict(avg_embedding.reshape(1, -1))[0]  # Predict relation

# Example usage
test_dialog = "Speaker 1: Hey, do you know Chandler Bing? Speaker 2: Oh yes, he is my classmate."
print("Predicted Relation:", predict_relation(test_dialog))


                           precision    recall  f1-score   support

   gpe:residents_of_place       0.00      0.00      0.00        10
    gpe:visitors_of_place       0.00      0.00      0.00        10
 org:employees_or_members       0.01      0.11      0.02         9
             org:students       0.00      0.00      0.00         1
         per:acquaintance       0.05      0.50      0.09         4
                  per:age       0.06      0.09      0.07        11
      per:alternate_names       0.47      0.03      0.05       266
               per:alumni       0.08      0.37      0.13        19
                 per:boss       0.04      0.20      0.06        10
             per:children       0.12      0.06      0.08        34
               per:client       0.05      0.10      0.06        10
                per:dates       0.06      0.50      0.10         6
per:employee_or_member_of       0.00      0.00      0.00         9
              per:friends       0.19      0.06      0.10     