In [None]:
import pandas as pd
import re
import json



# Load the JSON files.
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# instead of depending on the platform's default text encoding.
with open('train.json', encoding='utf-8') as f:
    train_data = json.load(f)

with open('test.json', encoding='utf-8') as f:
    test_data = json.load(f)

with open('valid.json', encoding='utf-8') as f:
    val_data = json.load(f)

# Combine all data.
# The merge below keeps only one entry per key, so a conversation id that
# occurs in more than one split would be overwritten by the later split
# without any notice. Report such collisions before merging.
duplicate_ids = (
    (train_data.keys() & test_data.keys())
    | (train_data.keys() & val_data.keys())
    | (test_data.keys() & val_data.keys())
)
if duplicate_ids:
    print(f"Warning: {len(duplicate_ids)} conversation id(s) appear in more than one split; the later split wins.")

data = {**train_data, **test_data, **val_data}

# Break an utterance into clause-like fragments at sentence punctuation and conjunctions
def split_into_clauses(text):
    """Return the non-empty, whitespace-trimmed fragments of ``text``.

    The text is cut at every '?', '!' or '.' and at each standalone lowercase
    word "and", "but" or "or"; the delimiters themselves are discarded.
    A falsy ``text`` (``None`` or an empty string) yields an empty list.
    """
    if not text:
        return []
    # Word boundaries keep "and"/"but"/"or" from matching inside longer words
    fragments = re.split(r'[?!.]|\b(?:and|but|or)\b', text)
    trimmed = (fragment.strip() for fragment in fragments)
    return [fragment for fragment in trimmed if fragment]

# Flatten every utterance of every conversation into one tabular record
def _utterance_to_record(conv_id, utterance):
    """Build the flat record for one utterance dict belonging to ``conv_id``.

    Missing annotation fields fall back to an empty list; the other fields
    fall back to ``None``.
    """
    text = utterance.get('utterance')
    clause_list = split_into_clauses(text)
    return {
        'conversation_id': conv_id,
        'turn': utterance.get('turn'),
        'speaker': utterance.get('speaker'),
        'utterance': text,
        'clauses': clause_list,
        'emotion': utterance.get('emotion'),
        'expanded_emotion_cause_evidence': utterance.get('expanded emotion cause evidence', []),
        'expanded_emotion_cause_span': utterance.get('expanded emotion cause span', []),
        'type': utterance.get('type', [])
    }


records = []
for conv_id, conv in data.items():
    # The utterance list is the first element of each conversation entry
    records.extend(_utterance_to_record(conv_id, utterance) for utterance in conv[0])

# One row per utterance
df = pd.DataFrame(records)


# Persist the table so later cells can reload it
df.to_csv('combined_data_with_clauses.csv', index=False)

print("Data successfully combined and saved to combined_data_with_clauses.csv")

Data successfully combined and saved to combined_data_with_clauses.csv


In [None]:
import ast
from transformers import pipeline

# Load the CSV file written by the first cell. The path is relative so the
# file is read from the working directory it was saved to, instead of from a
# hard-coded Colab-only '/content' location.
file_path = 'combined_data_with_clauses.csv'
df = pd.read_csv(file_path)


def parse_clauses(clause_str):
    """Turn the CSV text form of a clause list back into a Python list.

    Args:
        clause_str: Usually a string such as "['a', 'b']". A value that is
            already a list is returned unchanged.

    Returns:
        The parsed list. An empty list is returned when the value is not a
        string (for example the NaN pandas uses for an empty cell), cannot be
        parsed as a Python literal, or parses to something other than a
        list/tuple (a bare quoted string would otherwise be iterated
        character by character by callers).
    """
    if isinstance(clause_str, list):
        return clause_str
    if not isinstance(clause_str, str):
        return []
    try:
        parsed = ast.literal_eval(clause_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # The documented failure modes of literal_eval on malformed input
        return []
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return []

# Apply the parsing function to the 'clauses' column and create a new column 'parsed_clauses'
df['parsed_clauses'] = df['clauses'].apply(parse_clauses)

# Load a pretrained sentiment-analysis model using Hugging Face transformers.
# The checkpoint and revision are pinned to the ones the unpinned call reported
# resolving to in its log message, so results do not change if the library's
# default model changes.
classifier = pipeline(
    'sentiment-analysis',
    model='distilbert/distilbert-base-uncased-finetuned-sst-2-english',
    revision='714eb0f',
)

# Define a function to classify a clause based on the sentiment analysis output
def classify_clause(clause, positive_threshold=0.5, negative_threshold=0.3, model=None):
    """Label a clause as 'emotion', 'cause' or 'neutral' from its sentiment.

    Args:
        clause: Text handed to the sentiment model.
        positive_threshold: A 'POSITIVE' prediction must score strictly above
            this value to be labelled 'emotion'.
        negative_threshold: A 'NEGATIVE' prediction must score strictly above
            this value to be labelled 'cause'.
        model: Optional callable used instead of the module-level
            ``classifier``. It must return a sequence whose first item is a
            mapping with 'label' and 'score' keys.

    Returns:
        'emotion', 'cause' or 'neutral'. 'neutral' is also returned, after a
        RuntimeWarning, when the model call or result lookup fails.

    NOTE(review): if the model reports the probability of its top label from
    a two-class output, that score is always at least 0.5, so with the
    default thresholds 'neutral' is almost never produced. Confirm that this
    is the intended behavior.
    """
    import warnings

    try:
        predict = model if model is not None else classifier
        # [0] extracts the first (and for a single input, only) result
        result = predict(clause)[0]
        label = result['label']
        score = result['score']
    except Exception as exc:
        # Best effort is kept (one bad clause must not abort the whole run),
        # but the failure is reported instead of being silently hidden.
        warnings.warn(
            f"classify_clause: sentiment model failed ({type(exc).__name__}: {exc}); returning 'neutral'",
            RuntimeWarning,
            stacklevel=2,
        )
        return 'neutral'

    if label == 'POSITIVE' and score > positive_threshold:
        return 'emotion'
    if label == 'NEGATIVE' and score > negative_threshold:
        return 'cause'
    return 'neutral'

# Collect one result record per clause, carrying the utterance metadata along
results = []

# Walk the utterances row by row; each parsed clause becomes its own record
for row_idx, row in df.iterrows():
    results.extend(
        {
            'conversation_id': row['conversation_id'],
            'turn': row['turn'],
            'speaker': row['speaker'],
            'clause': clause,
            'classification': classify_clause(clause),
            'emotion_metadata': row['emotion'],
            'utterance': row['utterance']
        }
        for clause in row['parsed_clauses']
    )

# Clause-level table built from the collected records
classified_df = pd.DataFrame(results)

# Write the clause-level classifications to disk
classified_df.to_csv('combined_classified.csv', index=False)

# Show the first few classified clauses
print(classified_df.head())

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


  conversation_id  turn speaker                                       clause  \
0         tr_4466     1       A         Hey , you wanna see a movie tomorrow   
1         tr_4466     2       B                      Sounds like a good plan   
2         tr_4466     2       B                      What do you want to see   
3         tr_4466     3       A                     How about Legally Blonde   
4         tr_4466     4       B  Ah , my girlfriend wanted to see that movie   

  classification emotion_metadata  \
0        emotion        happiness   
1        emotion        happiness   
2        emotion        happiness   
3        emotion          neutral   
4        emotion          neutral   

                                           utterance  
0             Hey , you wanna see a movie tomorrow ?  
1  Sounds like a good plan . What do you want to ...  
2  Sounds like a good plan . What do you want to ...  
3                         How about Legally Blonde .  
4  Ah , my girlfriend

In [None]:
!pip install -U sentence-transformers

!pip install torch-geometric

import pandas as pd
import ast
from sentence_transformers import SentenceTransformer
import torch
from torch_geometric.data import Data

# Read the clause-level CSV ("combined_data_with_clauses.csv") produced by the
# first cell. The path is relative so the file is picked up from the current
# working directory rather than from a hard-coded '/content' location.
df = pd.read_csv('combined_data_with_clauses.csv')

# Helper that turns the text form of a list back into a list, leaving everything else alone
def safe_literal_eval(x):
    """Parse ``x`` as a Python literal when it looks like a stringified list.

    Only strings whose first non-whitespace character is '[' are parsed.
    Any other value is returned unchanged. If parsing fails, the error is
    printed and the original string is returned.
    """
    if not isinstance(x, str):
        return x
    if not x.strip().startswith('['):
        return x
    try:
        return ast.literal_eval(x)
    except Exception as err:
        print("Error converting:", x, err)
        return x

# These columns are stored in the CSV as string lists; convert each one back
# with safe_literal_eval.
for list_column in (
    'clauses',
    'expanded_emotion_cause_evidence',
    'expanded_emotion_cause_span',
    'type',
):
    df[list_column] = df[list_column].apply(safe_literal_eval)

# Numeric class id for each 'type' label, numbered in the order listed.
# When an utterance carries several labels (e.g., ['no-context', 'self-contagion'])
# only the first is used; an empty list is treated as "none".
label_mapping = {
    type_name: class_id
    for class_id, type_name in enumerate(
        ('no-context', 'inter-personal', 'hybrid', 'latent', 'self-contagion', 'none')
    )
}

# Accumulators filled while nodes are created
nodes, node_features, node_labels = [], [], []
node_idx = 0  # id that the next created node will receive
# conversation_id -> node ids of that conversation, in creation order
# (used later to build sequential edges)
node_mapping = dict()

# Order rows by conversation and turn so that node order is preserved.
df.sort_values(by=['conversation_id', 'turn'], inplace=True)

# Pre-trained sentence embedding model used to turn clause text into vectors
model_emb = SentenceTransformer('all-MiniLM-L6-v2')

# Iterate over each row and create one node per clause in 'clauses'.
# Node metadata and labels are collected first; the clause texts are embedded
# afterwards in a single batched encode() call, which avoids one model
# invocation per clause.
# NOTE(review): batched encoding may differ from per-clause encoding by
# floating-point rounding; confirm that this is acceptable.
pending_clauses = []
for index, row in df.iterrows():
    conv_id = row['conversation_id']
    turn = row['turn']
    speaker = row['speaker']

    # Ensure clauses are in list form; if not, wrap them in a list
    clause_list = row['clauses'] if isinstance(row['clauses'], list) else [row['clauses']]

    # Get the type label from the row:
    # if the "type" column is a non-empty list, take the first element; otherwise use "none".
    type_list = row.get('type', [])
    if isinstance(type_list, list) and len(type_list) > 0:
        type_label = type_list[0]
    else:
        type_label = 'none'

    # Map the type to its numeric label; names missing from the mapping fall
    # back to the 'none' class.
    label = label_mapping.get(type_label, label_mapping['none'])

    for clause in clause_list:
        if not isinstance(clause, str):
            # e.g. NaN from an empty CSV cell: there is no text to embed
            continue

        # Save node details (more attributes can be added later if needed)
        nodes.append({
            'node_id': node_idx,
            'conversation_id': conv_id,
            'turn': turn,
            'speaker': speaker,
            'clause': clause
        })
        node_labels.append(label)
        pending_clauses.append(clause)

        # Record the node under its conversation (order per conversation is kept)
        node_mapping.setdefault(conv_id, []).append(node_idx)
        node_idx += 1

if pending_clauses:
    # encode() on a list returns one embedding row per input, in input order,
    # so the rows line up with the nodes appended above.
    node_features.extend(model_emb.encode(pending_clauses))

def extract_svo(clause, nlp):
    """
    Collect lowercase subject, verb and object lemmas of a clause.

    ``nlp`` is called on ``clause`` and the resulting tokens are inspected:
    tokens with dependency 'nsubj'/'nsubjpass' count as subjects, a 'ROOT'
    token tagged 'VERB' counts as the verb, and 'dobj'/'pobj' tokens count as
    objects. Returns the three sets as a ``(subjects, verbs, objects)`` tuple.
    """
    subjects, verbs, objects = set(), set(), set()
    for tok in nlp(clause):
        relation = tok.dep_
        if relation in ('nsubj', 'nsubjpass'):
            bucket = subjects
        elif relation == 'ROOT' and tok.pos_ == 'VERB':
            bucket = verbs
        elif relation in ('dobj', 'pobj'):
            bucket = objects
        else:
            # Token plays none of the tracked roles
            continue
        bucket.add(tok.lemma_.lower())
    return subjects, verbs, objects


# Convert node features and labels to torch tensors.
# The per-node embeddings are first packed into a single NumPy array: building
# a tensor straight from a Python list of arrays is slow and makes PyTorch
# emit a UserWarning.
import numpy as np

x = torch.tensor(np.array(node_features), dtype=torch.float)
y = torch.tensor(node_labels, dtype=torch.long)

# Build edge list: connect consecutive nodes within the same conversation (bidirectional)
edge_pairs = []
for conv_nodes in node_mapping.values():
    for src, dst in zip(conv_nodes, conv_nodes[1:]):
        edge_pairs.append([src, dst])
        edge_pairs.append([dst, src])

# The explicit reshape keeps the 2 x num_edges layout even when no edges exist
# (an empty list would otherwise become a 1-D tensor).
edge_index = (
    torch.tensor(edge_pairs, dtype=torch.long)
    .reshape(len(edge_pairs), 2)
    .t()
    .contiguous()
)

# Create a PyTorch Geometric Data object containing the graph
data = Data(x=x, edge_index=edge_index, y=y)
print("Graph built successfully!")
print(data)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  x = torch.tensor(node_features, dtype=torch.float)


Graph built successfully!
Data(x=[21888, 384], edge_index=[2, 41564], y=[21888])


In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score

# Two-layer GraphSAGE network
class GraphSAGE(torch.nn.Module):
    """Stack of two SAGEConv layers with a ReLU in between.

    ``in_channels`` is the input feature size, ``hidden_channels`` the width
    of the intermediate layer and ``out_channels`` the size of the output
    produced for each node. No activation is applied to the output.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Run both convolutions over ``x`` using the connectivity in ``edge_index``."""
        hidden = F.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)



# Seed PyTorch's random number generator before the model is created so that
# weight initialisation is repeatable from run to run.
torch.manual_seed(42)

# out_channels=6 matches the six classes used for the node labels
model = GraphSAGE(in_channels=data.x.size(1), hidden_channels=64, out_channels=6)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training loop for the GraphSAGE model (full-batch: every node contributes to the loss)
model.train()
for epoch in range(200):  # Adjust the number of epochs as needed
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = F.cross_entropy(out, data.y)
    loss.backward()
    optimizer.step()

    if epoch % 20 == 0:
        print(f'Epoch {epoch:03d}, Loss: {loss.item():.4f}')

# Evaluation of the model performance.
# NOTE(review): the metrics below are computed on the same nodes the model was
# trained on, so they measure training fit rather than generalisation. A
# held-out split is needed for a trustworthy estimate.
model.eval()
with torch.no_grad():
    out = model(data.x, data.edge_index)
    # Index of the highest-scoring class for each node
    predicted = out.max(dim=1).indices
    correct = int((predicted == data.y).sum())
    acc = correct / data.num_nodes

    # Convert tensors to numpy arrays for sklearn metric calculations
    y_true = data.y.cpu().numpy()
    y_pred = predicted.cpu().numpy()
    y_prob = F.softmax(out, dim=1).cpu().numpy()

    # Calculate metrics using scikit-learn functions
    accuracy = accuracy_score(y_true, y_pred)
    try:
        auc = roc_auc_score(y_true, y_prob, multi_class='ovr')
    except Exception as err:
        # AUC stays optional, but the reason it is unavailable is reported
        # instead of being discarded.
        print(f'AUC could not be computed: {err}')
        auc = None
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')

# Print the evaluation metrics
print(f'Accuracy: {accuracy:.4f}')
print(f'AUC: {auc if auc is not None else "Not available"}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

# Summary of the metrics printed above
results = {
    'accuracy': accuracy,
    'auc': auc,
    'precision': precision,
    'recall': recall,
    'f1_score': f1
}



Epoch 000, Loss: 1.7990
Epoch 020, Loss: 1.1981
Epoch 040, Loss: 1.1124
Epoch 060, Loss: 1.0327
Epoch 080, Loss: 0.9391
Epoch 100, Loss: 0.8411
Epoch 120, Loss: 0.7477
Epoch 140, Loss: 0.6672
Epoch 160, Loss: 0.5946
Epoch 180, Loss: 0.5322
Accuracy: 0.8409
AUC: 0.9652833306554763
Precision: 0.8082
Recall: 0.6135
F1 Score: 0.6719
