In [None]:
from datasets import load_dataset

# Load the OpenAssistant oasst1 dataset through `datasets.load_dataset`
ds = load_dataset(path="OpenAssistant/oasst1")

In [None]:
def filter_english(example):
    """Return True when the record's "lang" field is exactly "en".

    Args:
        example: Mapping with a "lang" key (one dataset record).

    Returns:
        bool: True for English records, False otherwise.
    """
    language = example["lang"]
    return language == "en"

# Keep only the English records in every split of the dataset
ds_en = {}
for split_name in ds.keys():
    ds_en[split_name] = ds[split_name].filter(filter_english)

# Print a summary of the filtered train and validation splits
for split_name in ("train", "validation"):
    print(ds_en[split_name])


In [None]:
# Columns that survive the clean-up; every other column is dropped
fields_to_keep = ["message_id", "parent_id", "text", "role"]

ds_en_clean = {}
for split_name in ds_en.keys():
    split_data = ds_en[split_name]
    unwanted_columns = [
        column for column in split_data.column_names if column not in fields_to_keep
    ]
    ds_en_clean[split_name] = split_data.remove_columns(unwanted_columns)

# Show the first two training records as a quick check
print(ds_en_clean["train"][0:2])


In [None]:
import pandas as pd

# Pick out the cleaned splits. No DataFrame conversion happens here: despite the
# df_ prefix these are the objects stored in ds_en_clean, exported directly below.
df_train = ds_en_clean["train"]
df_validation = ds_en_clean["validation"]

# Write each split to its own CSV file (without an index column)
for split_data, csv_name in (
    (df_train, "OpenAssistant_English_Train.csv"),
    (df_validation, "OpenAssistant_English_Validation.csv"),
):
    split_data.to_csv(csv_name, index=False)

print("CSV files saved successfully!")


In [None]:
import pandas as pd

# Read the exported splits back in as pandas DataFrames.
# NOTE(review): pandas parses empty / NA-looking CSV fields as NaN by default, so a
# missing parent_id (or text) will be a float NaN here - confirm downstream checks.
df_train, df_validation = (
    pd.read_csv(csv_name)
    for csv_name in (
        "OpenAssistant_English_Train.csv",
        "OpenAssistant_English_Validation.csv",
    )
)

# Print the first rows of each frame
for frame in (df_train, df_validation):
    print(frame.head())

In [None]:
import re

# Extract sentence-level features
def extract_style_features(text):
    """Compute simple surface-level style statistics for one message.

    Args:
        text: The message text. Non-string values (e.g. the float NaN that
            ``pd.read_csv`` produces for empty or NA-looking cells, or None)
            are treated as empty text instead of raising.

    Returns:
        dict: ``sentence_length`` (number of whitespace-separated words),
        ``question_marks`` and ``exclamation_marks`` (character counts), and
        ``vocabulary_richness`` (unique words / total words, 0 for empty text).
    """
    # A CSV round trip can turn a text cell into NaN; calling .split() on it
    # would raise AttributeError, so fall back to empty text.
    if not isinstance(text, str):
        text = ""

    words = text.split()
    word_count = len(words)

    # Guard the ratio against division by zero for empty messages
    vocabulary_richness = len(set(words)) / word_count if word_count > 0 else 0

    return {
        "sentence_length": word_count,
        "question_marks": text.count("?"),
        "exclamation_marks": text.count("!"),
        "vocabulary_richness": vocabulary_richness,
    }

# Compute the style-feature dict for every message and store it as a new column
df_train["style_features"] = df_train["text"].map(extract_style_features)

In [None]:
import networkx as nx

# Build a directed reply graph: one node per message, one edge parent -> reply
dialogue_graph = nx.DiGraph()

for idx, row in df_train.iterrows():
    message_id = row["message_id"]
    parent_id = row["parent_id"]
    text = row["text"]
    role = row["role"]

    # Add node with attributes
    dialogue_graph.add_node(message_id, text=text, role=role)

    # Only link messages that really have a parent. df_train was read from CSV,
    # where a missing parent_id becomes float NaN; NaN is truthy, so a bare
    # `if parent_id:` would add a bogus NaN -> message edge for every root message.
    if pd.notna(parent_id) and parent_id:
        dialogue_graph.add_edge(parent_id, message_id)

In [None]:
import matplotlib.pyplot as plt

# Draw the subgraph induced by the first 10 nodes of the dialogue graph
first_nodes = list(dialogue_graph.nodes)[:10]
subgraph = dialogue_graph.subgraph(first_nodes)
nx.draw(
    subgraph,
    with_labels=True,
    node_color="lightblue",
    edge_color="gray",
)
plt.show()

In [None]:
from transformers import BertTokenizer, BertModel
import torch

# Load the tokenizer and the encoder from the same pre-trained BERT checkpoint
checkpoint_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
model = BertModel.from_pretrained(checkpoint_name)

In [None]:

def get_bert_embeddings(text):
    """Encode text with the module-level BERT ``tokenizer`` and ``model``.

    Args:
        text: Input passed straight to the tokenizer (truncated to at most
            512 tokens).

    Returns:
        torch.Tensor: The hidden state at position 0 (the [CLS] token) of the
        model's last layer. The leading batch dimension is removed when it
        has size 1.
    """
    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(**inputs)

    # Use the [CLS] token embedding as the sentence representation.
    # (The stray debug print that ran on every call has been removed.)
    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze(0)
    return cls_embedding

def _parse_cached_embedding(cell):
    """Rebuild a 1-D float tensor from a cached CSV cell.

    The cache stores each embedding as text (e.g. ``"tensor([0.1, -0.2])"``),
    so the numbers between the first "[" and the last "]" are parsed back.
    Values that are already tensors are returned unchanged.

    Raises:
        ValueError: If the cell contains no bracketed list of numbers.
    """
    if isinstance(cell, torch.Tensor):
        return cell
    serialized = str(cell)
    start, end = serialized.find("["), serialized.rfind("]")
    if start == -1 or end <= start:
        raise ValueError(f"Cannot parse cached BERT embedding: {serialized[:60]!r}")
    numbers = serialized[start + 1 : end].split(",")
    return torch.tensor([float(n) for n in numbers if n.strip()], dtype=torch.float)


# Attach a BERT embedding to every message.
# "combined_features.csv" is only written by the final cell of this notebook, so it
# acts as a cache: reuse it when present, otherwise encode every message (one model
# call per message, which can be slow).
# NOTE(review): assumes the cached rows are in the same order as df_train - confirm.
try:
    bert = pd.read_csv("combined_features.csv")
except FileNotFoundError:
    # Fresh run: no cache yet. Missing text is encoded as an empty string.
    df_train["bert_embedding"] = df_train["text"].fillna("").apply(get_bert_embeddings)
else:
    # CSV cells are strings; turn them back into tensors so later cells can
    # call tensor methods such as .squeeze() on them.
    df_train["bert_embedding"] = bert["bert_embedding"].apply(_parse_cached_embedding)

In [None]:
import networkx as nx

# Rebuild the directed reply graph: one node per message, one edge parent -> reply
dialogue_graph = nx.DiGraph()

for idx, row in df_train.iterrows():
    message_id = row["message_id"]
    parent_id = row["parent_id"]
    text = row["text"]
    role = row["role"]

    # Add node with attributes
    dialogue_graph.add_node(message_id, text=text, role=role)

    # Only link messages that really have a parent. df_train was read from CSV,
    # where a missing parent_id becomes float NaN; NaN is truthy, so a bare
    # `if parent_id:` would add a bogus NaN -> message edge for every root message.
    if pd.notna(parent_id) and parent_id:
        dialogue_graph.add_edge(parent_id, message_id)

In [None]:
import re

# Extract sentence-level features
def extract_style_features(text):
    """Compute simple surface-level style statistics for one message.

    Args:
        text: The message text. Non-string values (e.g. the float NaN that
            ``pd.read_csv`` produces for empty or NA-looking cells, or None)
            are treated as empty text instead of raising.

    Returns:
        dict: ``sentence_length`` (number of whitespace-separated words),
        ``question_marks`` and ``exclamation_marks`` (character counts), and
        ``vocabulary_richness`` (unique words / total words, 0 for empty text).
    """
    # A CSV round trip can turn a text cell into NaN; calling .split() on it
    # would raise AttributeError, so fall back to empty text.
    if not isinstance(text, str):
        text = ""

    words = text.split()
    word_count = len(words)

    # Guard the ratio against division by zero for empty messages
    vocabulary_richness = len(set(words)) / word_count if word_count > 0 else 0

    return {
        "sentence_length": word_count,
        "question_marks": text.count("?"),
        "exclamation_marks": text.count("!"),
        "vocabulary_richness": vocabulary_richness,
    }

# Compute the style-feature dict for every message and store it as a new column
df_train["style_features"] = df_train["text"].map(extract_style_features)

In [None]:
import torch

def extract_style_features(row):
    """Pack a row's style-feature dict into a float tensor of length 4.

    Note: this definition reuses the name of the text-based
    ``extract_style_features`` defined in earlier cells and shadows it from
    here on; it takes a row, not a string.

    Args:
        row: Mapping-like row whose "style_features" entry is a dict with the
            keys "sentence_length", "question_marks", "exclamation_marks" and
            "vocabulary_richness".

    Returns:
        torch.Tensor: The four values, in the order listed above, as floats.
    """
    features = row["style_features"]
    feature_order = (
        "sentence_length",
        "question_marks",
        "exclamation_marks",
        "vocabulary_richness",
    )
    return torch.tensor([features[key] for key in feature_order], dtype=torch.float)

# Apply to all rows: call the row-based extract_style_features (the definition
# directly above, not the earlier text-based one) once per row and store the
# resulting tensors in a new "style_features_tensor" column.
df_train["style_features_tensor"] = df_train.apply(extract_style_features, axis=1)

def combine_features(row):
    """Concatenate one message's BERT, style and graph features into a 1-D tensor.

    Args:
        row: Mapping-like row providing "bert_embedding" and
            "style_features_tensor" (both must be torch tensors), plus
            "message_id" (a node of the module-level ``dialogue_graph``) and
            "parent_id" (may be missing: None, NaN or empty).

    Returns:
        torch.Tensor: ``[bert_embedding..., style_features..., node_degree,
        shortest_path_length]`` as one flat tensor.
    """
    bert_embedding = row["bert_embedding"]
    style_features = row["style_features_tensor"]
    message_id = row["message_id"]
    parent_id = row["parent_id"]

    # Graph-based features: node degree and parent -> message path length
    node_degree = dialogue_graph.degree[message_id]

    # A missing parent_id is float NaN after the CSV round trip, and NaN is
    # truthy, so a plain truthiness test would look up a path from NaN for
    # every root message. Treat None / NaN / empty as "no parent" (length 0).
    has_parent = pd.notna(parent_id) and bool(parent_id)
    shortest_path_length = (
        nx.shortest_path_length(dialogue_graph, parent_id, message_id)
        if has_parent
        else 0
    )
    graph_features = torch.tensor([node_degree, shortest_path_length], dtype=torch.float)

    # Drop size-1 dimensions so every part is 1-dimensional, then concatenate
    return torch.cat(
        [bert_embedding.squeeze(), style_features.squeeze(), graph_features.squeeze()],
        dim=0,
    )

# Build the combined feature vector of every message, in DataFrame row order
combined_features_list = [combine_features(row) for _, row in df_train.iterrows()]

# Print the first vector as a quick check
print(combined_features_list[0])

# Storing the vectors on the DataFrame is currently disabled:
# df_train["combined_features"] = combined_features_list

In [None]:
# Disabled alternative: compute the combined features through DataFrame.apply and
# flatten each one into a comma-separated string before saving.
# df_train["combined_features"] = df_train.apply(combine_features, axis=1)
# df_train["combined_features"] = df_train["combined_features"].apply(lambda x: ",".join(map(str, x)))

# Write the current df_train (all columns, no index column) to disk. This is the
# same "combined_features.csv" that an earlier cell reads bert_embedding from.
df_train.to_csv(path_or_buf="combined_features.csv", index=False)