## Importing Libraries and Text Preprocessing

In [1]:
!pip install num2words



In [2]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from num2words import num2words
import string


In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
data = pd.read_csv("/content/Scrapped_data (1).csv")

In [5]:
data.head()

Unnamed: 0.1,Unnamed: 0,Title,Text,URL,ID,Time,Comment Count,Score,Comments
0,0,"In a decade of drug overdoses, more than 320,0...",,https://www.npr.org/sections/health-shots/2024...,1cnt5o8,1715247000.0,6,35,Lost a parent recently to an overdose. And mot...
1,1,‘Ozempic babies’: Reports of surprise pregnanc...,,https://www.cnn.com/2024/05/08/health/ozempic-...,1cnf2q3,1715202000.0,89,754,PCOS is a common cause of fertility troubles a...
2,2,Generative AI will be designing new drugs all ...,,https://www.cnbc.com/2024/05/05/within-a-few-y...,1clhxj9,1714999000.0,26,151,"Cool! So meds will be cheaper in the future, r..."
3,3,Diabetes drugs test a new side effect: slowing...,,https://english.elpais.com/science-tech/2024-0...,1ci7you,1714628000.0,0,30,
4,4,US drug control agency will move to reclassify...,,https://apnews.com/article/marijuana-biden-dea...,1ch317h,1714509000.0,0,20,


In [6]:
data['Text'] = data['Text'].fillna('')
data['Comments'] = data['Comments'].fillna('')

data['combined_text'] = data['Title'] + " " + data['Text'] + " " + data['Comments']
data

Unnamed: 0.1,Unnamed: 0,Title,Text,URL,ID,Time,Comment Count,Score,Comments,combined_text
0,0,"In a decade of drug overdoses, more than 320,0...",,https://www.npr.org/sections/health-shots/2024...,1cnt5o8,1.715247e+09,6,35,Lost a parent recently to an overdose. And mot...,"In a decade of drug overdoses, more than 320,0..."
1,1,‘Ozempic babies’: Reports of surprise pregnanc...,,https://www.cnn.com/2024/05/08/health/ozempic-...,1cnf2q3,1.715202e+09,89,754,PCOS is a common cause of fertility troubles a...,‘Ozempic babies’: Reports of surprise pregnanc...
2,2,Generative AI will be designing new drugs all ...,,https://www.cnbc.com/2024/05/05/within-a-few-y...,1clhxj9,1.714999e+09,26,151,"Cool! So meds will be cheaper in the future, r...",Generative AI will be designing new drugs all ...
3,3,Diabetes drugs test a new side effect: slowing...,,https://english.elpais.com/science-tech/2024-0...,1ci7you,1.714628e+09,0,30,,Diabetes drugs test a new side effect: slowing...
4,4,US drug control agency will move to reclassify...,,https://apnews.com/article/marijuana-biden-dea...,1ch317h,1.714509e+09,0,20,,US drug control agency will move to reclassify...
...,...,...,...,...,...,...,...,...,...,...
118,118,A New Treatment Rejuvenates Aging Immune Syste...,,https://singularityhub.com/2024/03/29/a-new-tr...,1bteg2a,1.712003e+09,10,200,Welcome to r/science! This is a heavily modera...,A New Treatment Rejuvenates Aging Immune Syste...
119,119,U.S. rates of suicide by all methods rose stea...,,https://www.upi.com/Health_News/2024/03/29/sui...,1bsxavp,1.711956e+09,137,1405,Welcome to r/science! This is a heavily modera...,U.S. rates of suicide by all methods rose stea...
120,120,Generative AI develops potential new drugs for...,,https://med.stanford.edu/news/all-news/2024/03...,1bs7a8p,1.711881e+09,41,456,Welcome to r/science! This is a heavily modera...,Generative AI develops potential new drugs for...
121,121,Researchers at Weill Cornell Medicine identify...,,https://www.cell.com/cancer-cell/abstract/S153...,1bry53r,1.711848e+09,3,73,Welcome to r/science! This is a heavily modera...,Researchers at Weill Cornell Medicine identify...


Text Preprocessing

In [7]:
def to_lower(text):
    return text.lower()

def remove_urls(text):
    return re.sub(r'http\S+', '', text)

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_non_ascii(words):
    return re.sub(r'[^\x00-\x7f]', r'', words)

def remove_emojis(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"
                           u"\U0001F300-\U0001F5FF"
                           u"\U0001F680-\U0001F6FF"
                           u"\U0001F700-\U0001F77F"
                           u"\U0001F780-\U0001F7FF"
                           u"\U0001F800-\U0001F8FF"
                           u"\U0001F900-\U0001F9FF"
                           u"\U0001FA00-\U0001FA6F"
                           u"\U0001FA70-\U0001FAFF"
                           u"\U00002702-\U000027B0"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def replace_numbers(words):
    return [num2words(word) if word.isdigit() else word for word in words.split()]

def remove_stopwords(words):
    stop_words = set(stopwords.words('english'))
    if isinstance(words, str):
        words = words.split()
    return ' '.join([word for word in words if word not in stop_words])


def lemmatize(words):
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word) for word in words.split()])

def preprocess_text(text):
    text = to_lower(text)
    text = remove_urls(text)
    text = remove_emojis(text)
    text = remove_punctuation(text)
    text = remove_non_ascii(text)
    text = replace_numbers(text)
    text = remove_stopwords(text)
    text = lemmatize(text)
    return text


In [8]:
data['clean_text'] = data['combined_text'].apply(preprocess_text)

In [9]:
data

Unnamed: 0.1,Unnamed: 0,Title,Text,URL,ID,Time,Comment Count,Score,Comments,combined_text,clean_text
0,0,"In a decade of drug overdoses, more than 320,0...",,https://www.npr.org/sections/health-shots/2024...,1cnt5o8,1.715247e+09,6,35,Lost a parent recently to an overdose. And mot...,"In a decade of drug overdoses, more than 320,0...",decade drug overdoses three hundred and twenty...
1,1,‘Ozempic babies’: Reports of surprise pregnanc...,,https://www.cnn.com/2024/05/08/health/ozempic-...,1cnf2q3,1.715202e+09,89,754,PCOS is a common cause of fertility troubles a...,‘Ozempic babies’: Reports of surprise pregnanc...,ozempic baby report surprise pregnancy raise n...
2,2,Generative AI will be designing new drugs all ...,,https://www.cnbc.com/2024/05/05/within-a-few-y...,1clhxj9,1.714999e+09,26,151,"Cool! So meds will be cheaper in the future, r...",Generative AI will be designing new drugs all ...,generative ai designing new drug near future c...
3,3,Diabetes drugs test a new side effect: slowing...,,https://english.elpais.com/science-tech/2024-0...,1ci7you,1.714628e+09,0,30,,Diabetes drugs test a new side effect: slowing...,diabetes drug test new side effect slowing sym...
4,4,US drug control agency will move to reclassify...,,https://apnews.com/article/marijuana-biden-dea...,1ch317h,1.714509e+09,0,20,,US drug control agency will move to reclassify...,u drug control agency move reclassify marijuan...
...,...,...,...,...,...,...,...,...,...,...,...
118,118,A New Treatment Rejuvenates Aging Immune Syste...,,https://singularityhub.com/2024/03/29/a-new-tr...,1bteg2a,1.712003e+09,10,200,Welcome to r/science! This is a heavily modera...,A New Treatment Rejuvenates Aging Immune Syste...,new treatment rejuvenates aging immune system ...
119,119,U.S. rates of suicide by all methods rose stea...,,https://www.upi.com/Health_News/2024/03/29/sui...,1bsxavp,1.711956e+09,137,1405,Welcome to r/science! This is a heavily modera...,U.S. rates of suicide by all methods rose stea...,u rate suicide method rose steadily adolescent...
120,120,Generative AI develops potential new drugs for...,,https://med.stanford.edu/news/all-news/2024/03...,1bs7a8p,1.711881e+09,41,456,Welcome to r/science! This is a heavily modera...,Generative AI develops potential new drugs for...,generative ai develops potential new drug anti...
121,121,Researchers at Weill Cornell Medicine identify...,,https://www.cell.com/cancer-cell/abstract/S153...,1bry53r,1.711848e+09,3,73,Welcome to r/science! This is a heavily modera...,Researchers at Weill Cornell Medicine identify...,researcher weill cornell medicine identify mut...


Assigning Sentiment score

In [10]:
from textblob import TextBlob

def get_sentiment(text):
    analysis = TextBlob(text)
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity == 0:
        return 'neutral'
    else:
        return 'negative'


In [11]:
data['sentiment'] = data['clean_text'].apply(get_sentiment)

In [12]:
data['sentiment'] = data['sentiment'].map({'positive': 2, 'neutral': 1, 'negative': 0})

In [13]:
data

Unnamed: 0.1,Unnamed: 0,Title,Text,URL,ID,Time,Comment Count,Score,Comments,combined_text,clean_text,sentiment
0,0,"In a decade of drug overdoses, more than 320,0...",,https://www.npr.org/sections/health-shots/2024...,1cnt5o8,1.715247e+09,6,35,Lost a parent recently to an overdose. And mot...,"In a decade of drug overdoses, more than 320,0...",decade drug overdoses three hundred and twenty...,2
1,1,‘Ozempic babies’: Reports of surprise pregnanc...,,https://www.cnn.com/2024/05/08/health/ozempic-...,1cnf2q3,1.715202e+09,89,754,PCOS is a common cause of fertility troubles a...,‘Ozempic babies’: Reports of surprise pregnanc...,ozempic baby report surprise pregnancy raise n...,2
2,2,Generative AI will be designing new drugs all ...,,https://www.cnbc.com/2024/05/05/within-a-few-y...,1clhxj9,1.714999e+09,26,151,"Cool! So meds will be cheaper in the future, r...",Generative AI will be designing new drugs all ...,generative ai designing new drug near future c...,2
3,3,Diabetes drugs test a new side effect: slowing...,,https://english.elpais.com/science-tech/2024-0...,1ci7you,1.714628e+09,0,30,,Diabetes drugs test a new side effect: slowing...,diabetes drug test new side effect slowing sym...,2
4,4,US drug control agency will move to reclassify...,,https://apnews.com/article/marijuana-biden-dea...,1ch317h,1.714509e+09,0,20,,US drug control agency will move to reclassify...,u drug control agency move reclassify marijuan...,1
...,...,...,...,...,...,...,...,...,...,...,...,...
118,118,A New Treatment Rejuvenates Aging Immune Syste...,,https://singularityhub.com/2024/03/29/a-new-tr...,1bteg2a,1.712003e+09,10,200,Welcome to r/science! This is a heavily modera...,A New Treatment Rejuvenates Aging Immune Syste...,new treatment rejuvenates aging immune system ...,2
119,119,U.S. rates of suicide by all methods rose stea...,,https://www.upi.com/Health_News/2024/03/29/sui...,1bsxavp,1.711956e+09,137,1405,Welcome to r/science! This is a heavily modera...,U.S. rates of suicide by all methods rose stea...,u rate suicide method rose steadily adolescent...,2
120,120,Generative AI develops potential new drugs for...,,https://med.stanford.edu/news/all-news/2024/03...,1bs7a8p,1.711881e+09,41,456,Welcome to r/science! This is a heavily modera...,Generative AI develops potential new drugs for...,generative ai develops potential new drug anti...,2
121,121,Researchers at Weill Cornell Medicine identify...,,https://www.cell.com/cancer-cell/abstract/S153...,1bry53r,1.711848e+09,3,73,Welcome to r/science! This is a heavily modera...,Researchers at Weill Cornell Medicine identify...,researcher weill cornell medicine identify mut...,2


In [14]:
distribution = data['sentiment'].value_counts().sort_index()
print(distribution)

sentiment
0     11
1      7
2    105
Name: count, dtype: int64


Tokenisation

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['clean_text'])
y = data['sentiment']


Train Test Split

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

Handling class imbalance

In [22]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42, k_neighbors=4)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


## Naive Bayes and Random Forest

In [23]:
from sklearn.naive_bayes import MultinomialNB

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_resampled, y_train_resampled)


In [24]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_resampled, y_train_resampled)


In [25]:
nb_predictions = nb_classifier.predict(X_test)
rf_predictions = rf_classifier.predict(X_test)


In [26]:
from sklearn.metrics import accuracy_score, classification_report

nb_accuracy = accuracy_score(y_test, nb_predictions)
print("Naive Bayes Accuracy:", nb_accuracy)
print("Naive Bayes Classification Report:\n", classification_report(y_test, nb_predictions))

rf_accuracy = accuracy_score(y_test, rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)
print("Random Forest Classification Report:\n", classification_report(y_test, rf_predictions))


Naive Bayes Accuracy: 0.8064516129032258
Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       1.00      0.50      0.67         2
           2       0.83      0.96      0.89        25

    accuracy                           0.81        31
   macro avg       0.61      0.49      0.52        31
weighted avg       0.73      0.81      0.76        31

Random Forest Accuracy: 0.7741935483870968
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.33      1.00      0.50         2
           2       0.88      0.88      0.88        25

    accuracy                           0.77        31
   macro avg       0.40      0.63      0.46        31
weighted avg       0.73      0.77      0.74        31



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## BERT and RoBERTA

In [27]:
!pip install transformers[torch]



In [28]:
!pip install accelerate>=0.21.0


In [29]:
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

In [30]:
class CustomDataset(Dataset):
  def __init__(self, texts, labels, tokenizer, max_len=512):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    text = str(self.texts[idx])
    label = torch.tensor(self.labels[idx])

    encoding = self.tokenizer(text, truncation=True, padding="max_length",
                              max_length=self.max_len)

    return {
        'input_ids': encoding['input_ids'],
        'attention_mask': encoding['attention_mask'],
        'labels': label
    }

In [31]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = 'bert-base-uncased'
device = "cuda"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [57]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint_roberta = 'roberta-base'
device = "cuda"

tokenizer_roberta = AutoTokenizer.from_pretrained(checkpoint_roberta)
model_roberta = AutoModelForSequenceClassification.from_pretrained(checkpoint_roberta, num_labels=3).to(device)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
X = data['clean_text'].tolist()
y = data['sentiment'].tolist()

dataset = CustomDataset(X, y, tokenizer)

In [33]:
dataset[0].keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

Hadnling class imbalance through class weights

In [34]:
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)

In [35]:
train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, random_state=42)

In [36]:
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(example):
  labels = example.label_ids
  preds = example.predictions.argmax(-1)

  f1 = f1_score(labels, preds, average="weighted")
  acc = accuracy_score(labels, preds)

  return {'accuracy': acc, "f1": f1}

In [41]:
from transformers import Trainer
from torch.nn import CrossEntropyLoss

class CustomTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')
        num_labels = model.config.num_labels
        if self.class_weights is not None:
            loss_fct = CrossEntropyLoss(weight=self.class_weights)
            loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))
        else:
            loss = outputs.loss if isinstance(outputs, dict) else outputs[0]

        return (loss, outputs) if return_outputs else loss


In [42]:
batch_size = 4
model_name = "bert_finetuned_sentiment"

args = TrainingArguments(
    output_dir="output",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=2e-5,
    num_train_epochs=10,
    evaluation_strategy='epoch'
)


In [43]:
trainer = CustomTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    class_weights=class_weights_tensor
)


In [58]:
trainer_roberta = CustomTrainer(
    model=model_roberta,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    class_weights=class_weights_tensor
)


In [44]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.017976,0.76,0.656364
2,No log,1.178768,0.76,0.656364
3,No log,1.538666,0.76,0.656364
4,No log,1.629925,0.76,0.656364
5,No log,1.663764,0.76,0.656364
6,No log,1.545254,0.76,0.656364
7,No log,1.454971,0.8,0.724961
8,No log,1.498875,0.8,0.724961
9,No log,1.503768,0.84,0.767619
10,No log,1.48407,0.84,0.767619


TrainOutput(global_step=250, training_loss=0.744598388671875, metrics={'train_runtime': 104.4751, 'train_samples_per_second': 9.38, 'train_steps_per_second': 2.393, 'total_flos': 257851149373440.0, 'train_loss': 0.744598388671875, 'epoch': 10.0})

In [59]:
trainer_roberta.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.613184,0.76,0.656364
2,No log,1.592454,0.76,0.656364
3,No log,2.232478,0.76,0.656364
4,No log,2.110331,0.76,0.656364
5,No log,1.458706,0.84,0.767619
6,No log,2.238415,0.76,0.656364
7,No log,2.035693,0.8,0.724961
8,No log,2.169079,0.8,0.724961
9,No log,2.174945,0.8,0.724961
10,No log,2.109477,0.8,0.724961


TrainOutput(global_step=250, training_loss=0.9809175415039062, metrics={'train_runtime': 103.3034, 'train_samples_per_second': 9.487, 'train_steps_per_second': 2.42, 'total_flos': 257851149373440.0, 'train_loss': 0.9809175415039062, 'epoch': 10.0})

In [45]:
trainer.save_model(model_name)

In [46]:
tok = AutoTokenizer.from_pretrained(model_name)
mod = AutoModelForSequenceClassification.from_pretrained(model_name)

In [47]:
id2label = {2: 'positive', 1: 'neutral' ,0: 'negative'}

In [48]:
def get_prediction(text):
  input_ids = tok.encode(text, return_tensors='pt')
  output = mod(input_ids)

  preds = torch.nn.functional.softmax(output.logits, dim=-1)

  prob = torch.max(preds).item()

  idx = torch.argmax(preds).item()
  sentiment = id2label[idx]

  return {'sentiment':sentiment, 'prob':prob}

In [50]:
text = "Evaluating clinical trials"
get_prediction(text)

{'sentiment': 'neutral', 'prob': 0.9122727513313293}

In [51]:
!pip install openai

Collecting openai
  Downloading openai-1.28.0-py3-none-any.whl (320 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.1/320.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, httpcore, httpx, openai
Successfully installed h11-0.14.0 httpcore-1.0.5 

In [52]:
from openai import OpenAI

client = OpenAI(
    # api_key = 'sk-proj-1FSeA294vRw5fdIXlIJ0T3BlbkFJ9x5EbYS2cS9R4UBQSOHS'
    api_key = "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"

)

In [53]:
def display_chat_history(messages):
    for message in messages:
        print(f"{message['role'].capitalize()}: {message['content']}")

In [54]:
def get_assistant_response(messages):
    r = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": m["role"], "content": m["content"]} for m in messages],
    )
    response = r.choices[0].message.content
    return response


In [60]:
def get_assistant_response(messages, sentiment):
    prompt = "Generate a personalized message encouraging a user to participate in a clinical trial."
    if sentiment['sentiment'] == 'positive':
        prompt += " The user is positive about clinical trials."
    elif sentiment['sentiment'] == 'neutral':
        prompt += " The user is neutral about clinical trials. Provide more information to encourage participation."
    else:
      prompt += " The user is negative about clinical trials. Provide more information to encourage participation and help debunk any myths."

    r = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": m["role"], "content": m["content"]} for m in messages] + [{"role": "system", "content": prompt}]
    )
    response = r.choices[0].message.content
    return response


messages = [{"role": "assistant", "content": "How can I help you regarding clinical trials?"}]

while True:
    display_chat_history(messages)

    user_input = input("User (press 1 to end): ")
    if user_input == "1":
        print("Conversation ended. Thank you!")
        break

    messages.append({"role": "user", "content": user_input})

    sentiment = get_prediction(user_input)
    assistant_response = get_assistant_response(messages, sentiment)
    messages.append({"role": "assistant", "content": assistant_response})


Assistant: How can I help you regarding clinical trials?
User (press 1 to end): I hate clinical trials
Assistant: How can I help you regarding clinical trials?
User: I hate clinical trials
Assistant: I understand your hesitation about clinical trials, but participating in one can actually be a valuable and meaningful experience. By participating, you have the opportunity to contribute to scientific research that can lead to new treatments and advancements in healthcare. Your involvement can make a real difference in the lives of others and potentially improve your own health outcomes. Additionally, participating in clinical trials allows you to receive access to cutting-edge treatments and expert medical care that you may not otherwise have access to. Your participation is crucial in helping researchers make medical breakthroughs that can benefit patients now and in the future. Thank you for considering the opportunity to be a part of something bigger than yourself.
User (press 1 to en