In [4]:
import pandas as pd

# Read the dataset
data = pd.read_csv('data.csv')


def _selected_labels(frame, prefix):
    """Return one comma-joined string per row naming the ticked `prefix` columns.

    A column takes part when its name starts with `prefix` and has at least
    three dot-separated parts; the third part is used as the label name.
    A cell counts as ticked when it is present (not NaN) and truthy.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table whose columns are scanned.
    prefix : str
        Column-name prefix selecting the group of columns to read.

    Returns
    -------
    list of str
        One entry per row of `frame`; the empty string when nothing is ticked.
    """
    columns = []
    names = []
    for col in frame.columns:
        parts = col.split('.')
        # Columns that share the prefix but carry no third name component
        # cannot be turned into a label, so they are skipped instead of
        # raising IndexError.
        if col.startswith(prefix) and len(parts) >= 3:
            columns.append(col)
            names.append(parts[2])

    block = frame[columns]
    # NaN is truthy in Python, so a missing cell must be excluded explicitly
    # or it would be reported as a ticked box.
    ticked = (block.notna() & block.astype(bool)).to_numpy()
    return [
        ','.join(name for name, is_on in zip(names, flags) if is_on)
        for flags in ticked
    ]


# Collect the ticked emotions and topics of every row
emotions = _selected_labels(data, 'Answer.f')
topics = _selected_labels(data, 'Answer.t')

# Add emotions and topics as new columns
data['emotions'] = emotions
data['topics'] = topics
df = data[['Answer', 'emotions', 'topics']]
print(df)

# Save the updated dataset
df.to_csv('real.csv', index=False)

                                                 Answer              emotions  \
0     My family was the most salient part of my day,...   anxious,happy,proud   
1     Yoga keeps me focused. I am able to take some ...                  calm   
2     Yesterday, my family and I played a bunch of b...  calm,happy,satisfied   
3     Yesterday, I visited my parents and had dinner...            calm,happy   
4     Yesterday, I really felt the importance of my ...                 happy   
...                                                 ...                   ...   
1468  A workout, dining with friends makes for a hea...       happy,satisfied   
1469  My connection to God, through prayer brings me...            calm,happy   
1470  I got a skull pan for halloween, and my husban...           happy,proud   
1471  I spoke over the phone with my brother that li...                 happy   
1472  I had a lengthy but mostly fun, frivolous conv...        calm,satisfied   

        topics  
0       fa

In [1]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

# Step 1: Data Preprocessing

# Load data from CSV.
# Empty CSV cells are read back as NaN, which has no .split(); a row without
# text or without any emotion cannot become a training example, so drop it.
data = (
    pd.read_csv('journal_entries.csv')
    .dropna(subset=['entry', 'emotions'])
    .reset_index(drop=True)
)

journal_entries = data['entry'].astype(str).tolist()
emotions = [
    [name.strip() for name in cell.split(',') if name.strip()]
    for cell in data['emotions'].astype(str)
]
topics = data['topic'].tolist()

# Tokenize and encode journal entries.
# Every sequence is padded to max_length so the id lists are rectangular and
# can be stacked into a single tensor.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 128
encoded_by_row = [
    tokenizer.encode(
        entry,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding='max_length',
    )
    for entry in journal_entries
]

# Encode emotions and topics as numerical labels
label_map = {'anxious': 0, 'happy': 1, 'proud': 2, 'calm': 3, 'satisfied': 4, 'angry': 5, 'bored': 6, 'frustrated': 7, 'sad': 8}

# The labels are one class id per example, but an entry may list several
# emotions. Build one example per (entry, emotion) pair so the inputs line up
# one-to-one with the flattened label list.
example_rows = [row for row, emotions_row in enumerate(emotions) for _ in emotions_row]
encoded_emotions = [label_map[emotion] for emotions_row in emotions for emotion in emotions_row]
tokenized_entries = [encoded_by_row[row] for row in example_rows]

# Topics are not emotion names, so they get their own integer codes instead
# of being looked up in label_map (missing topics are coded as -1).
topic_codes, topic_names = pd.factorize(data['topic'])
encoded_topics = topic_codes.tolist()

# Create attention masks: 1 for real tokens, 0 for padding
mask_rows = [
    [int(token_id != tokenizer.pad_token_id) for token_id in ids]
    for ids in tokenized_entries
]

# Convert the data to PyTorch tensors
input_ids = torch.tensor(tokenized_entries, dtype=torch.long)
attention_masks = torch.tensor(mask_rows, dtype=torch.long)
labels = torch.tensor(encoded_emotions, dtype=torch.long)

# Split the data into training and validation sets.
# The split is drawn over journal entries rather than over examples, so all
# copies of one entry land on the same side and the validation texts are
# never seen during training.
train_rows, val_rows = train_test_split(
    list(range(len(journal_entries))), random_state=42, test_size=0.2
)
val_row_set = set(val_rows)
is_val = torch.tensor([row in val_row_set for row in example_rows], dtype=torch.bool)

train_inputs, val_inputs = input_ids[~is_val], input_ids[is_val]
train_masks, val_masks = attention_masks[~is_val], attention_masks[is_val]
train_labels, val_labels = labels[~is_val], labels[is_val]

# Create data loaders
batch_size = 16
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# Step 2: Model Training

# Train on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_map))
# The batches are moved to `device` below, so the weights must live there too
model.to(device)

# Define the optimizer and learning rate
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Training loop
epochs = 3

for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    for batch in train_dataloader:
        # Batch-local names: reusing `input_ids` / `labels` here would
        # overwrite the full dataset tensors defined during preprocessing.
        batch_input_ids, batch_masks, batch_labels = (t.to(device) for t in batch)

        optimizer.zero_grad()
        outputs = model(batch_input_ids, attention_mask=batch_masks, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        # Cap the gradient norm at 1.0 before the optimizer step
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

    # Print average training loss for the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1} - Average training loss: {avg_train_loss}")

# Step 3: Prediction

# Evaluation on validation set
# Run the batches on whichever device the model's weights already live on
device = next(model.parameters()).device
# Explicit id -> name lookup, independent of the key order of label_map
id_to_emotion = {idx: name for name, idx in label_map.items()}

model.eval()
predictions = []
true_labels = []
val_texts = []

for batch in val_dataloader:
    # Batch-local names so the dataset-level tensors are not overwritten
    batch_input_ids, batch_masks, batch_labels = (t.to(device) for t in batch)

    with torch.no_grad():
        outputs = model(batch_input_ids, attention_mask=batch_masks)

    logits = outputs.logits.detach().cpu().numpy()
    label_ids = batch_labels.to('cpu').numpy()

    # Store predictions and true labels
    predictions.extend(logits)
    true_labels.extend(label_ids)
    # The loader only yields the validation subset, in an order unrelated to
    # `journal_entries`, so recover each text from the token ids themselves.
    # This is the tokenizer's reconstruction of what the model was given,
    # not the raw CSV string.
    val_texts.extend(tokenizer.batch_decode(batch_input_ids, skip_special_tokens=True))

# Convert predictions and true labels to emotions
predicted_emotions = [id_to_emotion[int(pred.argmax())] for pred in predictions]
true_emotions = [id_to_emotion[int(label)] for label in true_labels]

# Print predicted emotions for each validation journal entry
for entry, emotion in zip(val_texts, predicted_emotions):
    print(f"Journal Entry: {entry}")
    print(f"Predicted Emotion: {emotion}")
    print("--------------")

  from .autonotebook import tqdm as notebook_tqdm

KeyboardInterrupt

