In [16]:
import pandas as pd
import transformers
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader


In [51]:
df = pd.read_csv("CSN.csv")

df_seek = df.copy()
df_seek.loc[df_seek['label'] == 2, 'label'] = 0
df_seek['label'].value_counts()
df_seek

Unnamed: 0,text,label
0,I don't know who left other than him.,0
1,I also was very depressed and cryed very very ...,1
2,I am a 6 yr survivor and have been out of trea...,0
3,I have enough energy for 16 hour periods.,0
4,I was diagnosed in October 2009 at age 53.,0
...,...,...
5002,"You are a creative person, and I can just imag...",0
5003,I thought they were supposed to eliminate the ...,0
5004,"I could use a good hug too, from someone who u...",1
5005,"You take 1 pill one hour prior to treatment, 1...",0


Here is an example code for training a text classification dataset using GPT-3 with the Hugging Face Transformers library:

In this example, a custom TextClassificationDataset is defined to hold the training data and labels. The dataset is then passed to a PyTorch DataLoader for batch processing during training. The GPT2ForSequenceClassification model is loaded from a pre-trained checkpoint, and set to train mode using the train method. The cross entropy loss function and Adam optimizer are defined for training, and the model is trained using a loop over the training data. The loop updates the model parameters using the step method of the optimizer, and the loss is printed after each epoch.

You can modify this code to use your own dataset by replacing the data and labels arrays with your own data and labels, and adjusting the training loop and loss function as needed for your specific task.

In [53]:
# Load the training data
# data = [["This is a positive text"], ["This is a negative text"]]
# labels = [1, 0]

data = []
for i in df_seek['text']:
    data.append([i])
labels = df_seek['label'].to_list()
data

[["I don't know who left other than him."],
 ['I also was very depressed and cryed very very much even tho I had a good prognosis considering.'],
 ['I am a 6 yr survivor and have been out of treatment for 5 yrs.'],
 ['I have enough energy for 16 hour periods.'],
 ['I was diagnosed in October 2009 at age 53.'],
 ['But at 7 pm tonight his left ankle started swelling.'],
 ['I just wanted to let you know I was here if you wanted to talk.'],
 ["I see him week after next so we'll see what he days, actually I don't think one can have the shot while on chemo, thoughts anyone?"],
 ['I had non-small cell lung cancer, adenocarcinoma, which was a slow moving cancer.'],
 ["I was lucky and didn't feel much pain, but those darn drains are a pain in the butt."],
 ['I thought I was dying  I could not think straight, I was weak and nauseous.'],
 ['I believe, correct me if I am wrong, that doing it yourself gives you at least a little control.'],
 ['Your attitude is amazing and, for me, very contagious, 

In [48]:

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Define a custom dataset
class TextClassificationDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]
        input_ids = torch.tensor(tokenizer.encode(text, add_special_tokens=True)).unsqueeze(0)
        label = torch.tensor(self.labels[idx])
        return input_ids, label

# Create the dataset
dataset = TextClassificationDataset(data, labels)

# Create the data loader
data_loader = DataLoader(dataset, batch_size=1, shuffle=True)

# Load the model
model = GPT2ForSequenceClassification.from_pretrained("gpt2")

# Set the model to train mode
model.train()

# Define the loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Train the model
for epoch in range(10):
    for input_ids, label in data_loader:
        optimizer.zero_grad()
        outputs = model(input_ids, labels=label)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
    print("Epoch:", epoch, "Loss:", loss.item())


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 0 Loss: 1.4203330278396606
Epoch: 1 Loss: 0.4850365221500397
Epoch: 2 Loss: 1.6931101083755493
Epoch: 3 Loss: 0.3400486409664154
Epoch: 4 Loss: 0.6747191548347473
Epoch: 5 Loss: 0.4793832302093506
Epoch: 6 Loss: 0.5860254168510437
Epoch: 7 Loss: 0.6583848595619202
Epoch: 8 Loss: 0.4160294532775879
Epoch: 9 Loss: 0.44383230805397034


In [49]:
# Save the fine-tuned model
model.save_pretrained('./gpt-3_model')

In [None]:
# Prepare the training data
test_seek = pd.read_csv("self_eval_seek_2.csv")
test_seek = test_seek[test_seek["seeking?"].notna()]
test_seek = test_seek.loc[:, ~test_seek.columns.str.contains('^Unnamed')]
test_seek = test_seek.drop_duplicates(subset='seeker_post', keep="last")
test_seek.loc[test_seek["seeking?"]=="Seeking(subtly)", "seeking?"] = int(0)
test_seek.loc[test_seek["seeking?"]=="Not Seeking", "seeking?"] = int(0)
test_seek.loc[test_seek["seeking?"]=="Not Seeking/Maybe", "seeking?"] = int(0)
test_seek.loc[test_seek["seeking?"]=="Seeking(truly)", "seeking?"] = int(1)
test_seek["seeking?"].value_counts()

test_data = []
for i in test_seek["seeker_post"]:
    test_data.append([i])
data
test_labels = test_seek["seeking?"].to_list()

In [None]:
# Load the model from disk
model = torch.load("model.pt")
model.eval()

# Create the dataset
dataset = TextClassificationDataset(test_data, test_labels)

# Create the data loader
data_loader = DataLoader(dataset, batch_size=1, shuffle=True)

# Evaluate the model on a test dataset
test_loss = 0
test_acc = 0
for input_ids, label in test_loader:
    with torch.no_grad():
        outputs = model(input_ids, labels=label)
        loss = outputs[0]
        logits = outputs[1]
        test_loss += loss.item()
        probs = F.softmax(logits, dim=1)
        pred = probs.argmax(dim=1)
        test_acc += (pred == label).sum().item()

# Calculate the average loss and accuracy
test_loss /= len(test_loader)
test_acc /= len(test_loader) * input_ids.shape[0]

print("Test Loss:", test_loss)
print("Test Accuracy:", test_acc)