# IEEE Challenge

In [1]:
import pandas as pd
import numpy as np
from src import paths
from src.read_data import DataReader
from src.data_preprocessor import DataPreprocessor
from src.open_ai import PromptRefiner, client, Models
from src.utils.util_functions import key_with_max_value, ordered_values, unpack, unpack_short

16


In [2]:
labelled = DataReader.get_posts_with_labels()
labelled.head()

Unnamed: 0,post,post_risk
0,Just kill me. Please! Just end my life! I beg ...,ideation
1,"There is no today, nor tommorrow. I dont have ...",behavior
2,Life so full of contradictions that it's not w...,indicator
3,I think I'm going to kill myself soon. I don't...,behavior
4,whats the point of living. no really. is there...,ideation


In [3]:
labelled['post_risk'].value_counts()

post_risk
ideation     190
behavior     140
indicator    129
attempt       41
Name: count, dtype: int64

In [4]:
labelled = DataPreprocessor.preprocess(labelled)
labelled.head()

2024-08-12 18:09:27,989 - INFO: Preprocesssing data
2024-08-12 18:09:27,991 - INFO: Removing HTML, non ASCI etc
2024-08-12 18:09:28,220 - INFO: Counting words
2024-08-12 18:09:28,342 - INFO: Replacing cateogries with one token labels: {'ideation': 'id', 'behavior': 'be', 'indicator': 'in', 'attempt': 'at'}


Unnamed: 0,post,post_risk,word_count
0,Just kill me. Please! Just end my life! I beg ...,id,14
1,"There is no today, nor tommorrow. I dont have ...",be,282
2,Life so full of contradictions that it's not w...,in,311
3,I think I'm going to kill myself soon. I don't...,be,54
4,whats the point of living. no really. is there...,id,142


In [5]:
labelled.groupby('post_risk')['word_count'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
post_risk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
at,41.0,338.341463,427.035046,11.0,55.0,179.0,447.0,2095.0
be,140.0,177.314286,186.416841,2.0,55.0,94.0,230.25,830.0
id,190.0,161.473684,170.525415,6.0,54.0,98.5,214.0,998.0
in,129.0,135.333333,144.509047,1.0,44.0,90.0,188.0,1120.0


In [6]:
labelled['post'] = labelled['post'] + " ->"

In [7]:
from sklearn.model_selection import train_test_split
df_train_labelled, df_test = train_test_split(labelled, test_size=0.1, random_state=42, stratify=labelled["post_risk"])
df_test.head(1)

Unnamed: 0,post,post_risk,word_count
50,I feel so shattered. I don't even know any mor...,in,92


In [8]:
file_name = "data_api"
train_file = f"{file_name}_prepared_train"
validation_file = f"{file_name}_prepared_valid"

import os
for file in [file_name, train_file, validation_file]:
    if os.path.exists(file):
        os.remove(file)

In [13]:
DataPreprocessor.to_json(df_train_labelled, "data_api_prepared_train_raw.jsonl")
DataPreprocessor.to_json(df_test, "data_api_prepared_valid_raw.jsonl")

#### OpenAI API tools
Next we use openAI API tools to prepare the data further

In [None]:
!openai tools fine_tunes.prepare_data -f data_api_prepared_train_raw.jsonl

Next we submit a fine tuning job using the API and the preprocessed data. We have a range of models available:

In [14]:
for model in Models:
    print(model.value)

gpt-3.5-turbo-0125
gpt-3.5-turbo-1106
gpt-3.5-turbo-0613
babbage-002
davinci-002
gpt-4-0613


In [15]:
from src.open_ai import client
train_file = client.files.create(file=open("data_api_prepared_train_raw_prepared.jsonl", "rb"), purpose="fine-tune")
valid_file = client.files.create(file=open("data_api_prepared_valid_raw_prepared.jsonl", "rb"), purpose="fine-tune")

In [34]:
fine_tuning_job = client.fine_tuning.jobs.create(
    training_file=train_file.id,
    validation_file=valid_file.id,  
    model="gpt-4o-mini-2024-07-18", 
    hyperparameters={
    "n_epochs":5
  })
print(fine_tuning_job)

FineTuningJob(id='ftjob-oDuYIWahFMoIKFhi2N4IQiGa', created_at=1723480757, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=5, batch_size='auto', learning_rate_multiplier='auto'), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-FM8nE4u0PEcj6crKCzO0dFT7', result_files=[], seed=644290663, status='validating_files', trained_tokens=None, training_file='file-nnCXuoGxX5KimfVHeLMQqLED', validation_file='file-PLUwcJPtLZQbOO0cDXrGFC4W', estimated_finish=None, integrations=[], user_provided_suffix=None)


Job tuning will take some time. You can view the progress at: https://platform.openai.com/finetune/

In [41]:
still_running = True
fine_tuning_job_id = fine_tuning_job.id
fine_tuning_job_id ="ftjob-hLINDFyTJCcf5ReM23qIRxnM"
import time
while still_running:
    fine_tune_results = client.fine_tuning.jobs.retrieve(fine_tuning_job_id)
    
    if fine_tune_results.finished_at:
        still_running=False
    else:
        print("still running")
        time.sleep(10)

In [43]:
fine_tune_results

FineTuningJob(id='ftjob-hLINDFyTJCcf5ReM23qIRxnM', created_at=1718546568, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:davinci-002:personal::9al1pZiD', finished_at=1718547612, hyperparameters=Hyperparameters(n_epochs=3, batch_size=7, learning_rate_multiplier=16), model='davinci-002', object='fine_tuning.job', organization_id='org-FM8nE4u0PEcj6crKCzO0dFT7', result_files=['file-fdergvs0CfQgt8R1YaaWiatR'], seed=1281101839, status='succeeded', trained_tokens=2298876, training_file='file-tiCcKPKu2BDf3kvEgDDNviTT', validation_file='file-5ia7XcEPhdlZqk0K3VCqNufi', estimated_finish=None, integrations=[], user_provided_suffix=None)

2024-08-12 18:14:47,774 - INFO: Preprocesssing data
2024-08-12 18:14:47,775 - INFO: Removing HTML, non ASCI etc
2024-08-12 18:14:47,792 - INFO: Counting words


Unnamed: 0,post,word_count
0,My mind is hell. After my breakup 6 months ago...,307
1,What's the point?. It's been 548 days since my...,482
2,I'm scared therapy wont work. My parents reali...,115
3,so bye. love you bitches. only a few hours lef...,98
4,Relapsed after 6 months. I don't think I'll ev...,47


In [18]:
fine_tuning_job.id

'ftjob-x0MXI2OzXhUNCWOKT2XMIE0f'

In [22]:
# last job id 'ftjob-26SCSYKJlwPoNupKn5QvasUA'.
fine_tune_results = client.fine_tuning.jobs.retrieve(fine_tuning_job.id)
print(fine_tune_results)

FineTuningJob(id='ftjob-x0MXI2OzXhUNCWOKT2XMIE0f', created_at=1723479245, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:davinci-002:personal::9vS9e5Ow', finished_at=1723479948, hyperparameters=Hyperparameters(n_epochs=3, batch_size=1, learning_rate_multiplier=16), model='davinci-002', object='fine_tuning.job', organization_id='org-FM8nE4u0PEcj6crKCzO0dFT7', result_files=['file-YKeZ3Y6qVUroHs9DBQGfzJWo'], seed=305615375, status='succeeded', trained_tokens=268383, training_file='file-nnCXuoGxX5KimfVHeLMQqLED', validation_file='file-PLUwcJPtLZQbOO0cDXrGFC4W', estimated_finish=None, integrations=[], user_provided_suffix=None)


In [23]:
ft_model = fine_tune_results.fine_tuned_model

In [None]:
from copy import deepcopy
result = deepcopy(res)

In [None]:
test = DataReader.get_test_set()
test = DataPreprocessor.preprocess(test)
test.head()
result = [client.completions.create(model=ft_model, prompt=ex + ' ->', max_tokens=1, temperature=0,logprobs=10) for ex in test['post']]

In [None]:
[res.choices[0].logprobs.top_logprobs[0] for res in result]

In [None]:
probs = [unpack_short(res) for res in result]

In [None]:
probs[:2]

In [None]:
test['suicide risk'] = [key_with_max_value(elem) for elem in probs]
test['probability distribution'] = [ordered_values(elem) for elem in probs]
test

In [None]:
test[['suicide risk', 'probability distribution']].to_excel("Calculators.xlsx", index=True, index_label="index")

In [None]:
test['suicide risk'].value_counts()

Evaluation

In [24]:
result = [client.completions.create(model=ft_model, prompt=ex + ' ->', max_tokens=1, temperature=0,logprobs=10) for ex in df_test['post']]

In [26]:
probs = [unpack_short(res) for res in result]
probs[0]

{'ideation': 0.1368371508786148,
 'behavior': 0.5912782028623174,
 'indicator': 0.21626232514324442,
 'attempt': 0.05166186761519605}

In [27]:
df_test['suicide risk'] = [key_with_max_value(elem) for elem in probs]
df_test['probability distribution'] = [ordered_values(elem) for elem in probs]
df_test.head(1)

Unnamed: 0,post,post_risk,word_count,suicide risk,probability distribution
50,I feel so shattered. I don't even know any mor...,in,92,behavior,"[0.21626, 0.13684, 0.59128, 0.05166]"


In [30]:
mapping = {'ideation':'id', 'behavior': 'be', 'indicator': 'in', 'attempt': 'at'}
df_test['pred'] = df_test['suicide risk'].map(mapping)
df_test.head(1)                                              

Unnamed: 0,post,post_risk,word_count,suicide risk,probability distribution,pred
50,I feel so shattered. I don't even know any mor...,in,92,behavior,"[0.21626, 0.13684, 0.59128, 0.05166]",be


In [32]:
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(df_test["post_risk"],df_test["pred"], labels= list(mapping.values())))

              precision    recall  f1-score   support

          id       0.50      0.26      0.34        19
          be       0.13      0.14      0.14        14
          in       0.24      0.46      0.32        13
          at       0.00      0.00      0.00         4

    accuracy                           0.26        50
   macro avg       0.22      0.22      0.20        50
weighted avg       0.29      0.26      0.25        50



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
