In [1]:
import os
import re
from typing import Dict, List, Tuple, Union

import openai
import torch
import torch.nn.functional as F
from dotenv import load_dotenv
from transformers import BertTokenizer, BertForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load variables from a local .env file so that os.getenv('OPENAI_API_KEY') below can find the key
# without it being hard-coded in the notebook.
load_dotenv()

True

**GPT 3.5-turbo labeller**

In [3]:
def gpt_classifier(text:str, classify_by:str, labels:List[str], api_key:str, temperature:float = 0.0) -> Union[Dict[str, str], str]:
    """
    Uses OpenAI's GPT-3.5-turbo model to classify texts. Visit https://platform.openai.com/docs/models/gpt-3-5 for full documentation

    @param text: text you want to classify
    @param classify_by: the aspect to classify by (e.g. "medical condition"); inserted into the system prompt
    @param labels: labels to choose from; joined with ", " into the system prompt
    @param api_key: OpenAI API key (assigned to the module-level openai.api_key)
    @param temperature: OpenAI parameter: Higher values means the model will take more risks. E.g. 0.9 for more creative applications, and 0 for ones with a well-defined answer.
    @return: on success a dict {"result": <model reply>}; if anything raises, a human-readable
             error string instead (the exception is swallowed, not re-raised)
    """
    openai.api_key = api_key
    response = None  # stays None in the error message when the request itself fails
    try:
        # Built from adjacent literals rather than a backslash continuation inside one literal:
        # the continuation leaked the next line's indentation (a long run of spaces) into the prompt.
        # The prompt asks for "the label" rather than "a single word" because labels may be multi-word.
        system_prompt = (
            f"You are a classification assistant and need to classify texts based on their {classify_by}. "
            f"You may only return one of these labels: {', '.join(labels)}. "
            "Return nothing except one of the mentioned labels. The output should contain only that label."
        )
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {
                    "role": "system",
                    "content": system_prompt,
                },
                {
                    "role": "user",
                    "content": f"Text to classify: {text}",
                },
            ],
            # Forward the caller's value; it was previously hard-coded to 0, silently ignoring the argument.
            temperature=temperature,
        )
        answer = response["choices"][0]["message"]["content"]
        return {"result": answer}
    except Exception as e:
        # Best-effort by design: report the failure as text so a batch loop keeps going.
        return f"That didn't work. Got error: {e} and message: {response}"


In [4]:

# ↑ necessary bricks function 
# -----------------------------------------------------------------------------------------
# ↓ example implementation 

def example_integration():
    """Classify three sample sentences with gpt_classifier and print one line per sentence."""
    # The key is read from the environment rather than written into the notebook.
    api_key = os.getenv('OPENAI_API_KEY')
    classify_by = "medical condition"
    labels = ["anorexic", "not anorexic"]
    texts = [
        "Can I help or reach out in any way? I worry she may be all alone.",
        "I just feel like I'm falling apart.",
        "My body is gross now with tubing sticking out, I feel awful all the time.",
    ]

    for sample in texts:
        verdict = gpt_classifier(sample, classify_by, labels, api_key)
        print(f"the {classify_by} of \"{sample}\" is {verdict}")



In [5]:

# Run the demo: one gpt_classifier request per sample text, printing one line per text.
example_integration()

the medical condition of "Can I help or reach out in any way? I worry she may be all alone." is That didn't work. Did you provide a valid API key? Got error: You exceeded your current quota, please check your plan and billing details. and message: None
the medical condition of "I just feel like I'm falling apart." is That didn't work. Did you provide a valid API key? Got error: You exceeded your current quota, please check your plan and billing details. and message: None
the medical condition of "My body is gross now with tubing sticking out, I feel awful all the time." is That didn't work. Did you provide a valid API key? Got error: You exceeded your current quota, please check your plan and billing details. and message: None


In [6]:
# Confirm the key was loaded WITHOUT echoing the secret: cell outputs are saved with the
# notebook, so printing the raw value leaks it to anyone who can read the file.
print("OPENAI_API_KEY is set" if os.getenv('OPENAI_API_KEY') else "OPENAI_API_KEY is not set")

[REDACTED: an API key was printed here and saved with the notebook; treat it as compromised and rotate it]


**LABELLING USING BERT-uncased**

I'm going to use `bert-base-uncased` to label my data. ;)

In [7]:
# Load the pre-trained bert-base-uncased weights with a 2-label sequence-classification head,
# plus the tokenizer for the same checkpoint.
# NOTE: the warning printed on load reports that classifier.weight / classifier.bias are newly
# initialized, so this model's predictions are not meaningful until it is fine-tuned.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Sample sentences to run through BERT (the same three used in the GPT example above).
texts = [
    "Can I help or reach out in any way? I worry she may be all alone.",
    "I just feel like I'm falling apart.",
    "My body is gross now with tubing sticking out, I feel awful all the time.",
]

# Encode the whole batch at once as PyTorch tensors, padded to a common length and
# truncated at 128 tokens.
inputs = tokenizer(
    texts,
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors='pt',
)


In [9]:
# Illustrative targets, one per sentence: 0 = "not anorexic", 1 = "anorexic".
labels = torch.tensor([0, 1, 0])

# Passing labels along with the encoded batch makes the output carry a loss next to the logits.
outputs = model(labels=labels, **inputs)
loss = outputs.loss

print(outputs)

SequenceClassifierOutput(loss=tensor(0.6299, grad_fn=<NllLossBackward0>), logits=tensor([[ 0.3173, -0.2980],
        [ 0.2364, -0.2202],
        [ 0.2164, -0.1904]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [10]:
# Turn the raw logits into per-class probabilities (one row per sentence).
probs = outputs.logits.softmax(dim=1)

# The most probable class index for each sentence.
predicted_labels = probs.argmax(dim=1)

# Index -> human-readable name; the order matches the 0/1 encoding used for the labels.
class_labels = ['not anorexic', 'anorexic']

# Translate each predicted index into its class name.
predicted_classes = [class_labels[idx] for idx in predicted_labels.tolist()]

print(predicted_classes)

# This definitely doesn't do what I want it to do: the classification head was newly
# initialized when the model was loaded (see the warning on load), so without fine-tuning
# these predictions carry no real signal.

['not anorexic', 'not anorexic', 'not anorexic']
