# IEEE Challenge

In [1]:
import pandas as pd
import numpy as np
from src import paths
from src.read_data import DataReader
from src.data_preprocessor import DataPreprocessor
from src.open_ai import PromptRefiner, client, Models
from src.utils.util_functions import key_with_max_value, ordered_values, unpack, unpack_short

16


In [2]:
# Load the labelled posts and preview the first rows.
labelled = DataReader.get_posts_with_labels()
labelled.head(5)

Unnamed: 0,post,post_risk
0,Just kill me. Please! Just end my life! I beg ...,ideation
1,"There is no today, nor tommorrow. I dont have ...",behavior
2,Life so full of contradictions that it's not w...,indicator
3,I think I'm going to kill myself soon. I don't...,behavior
4,whats the point of living. no really. is there...,ideation


In [3]:
# Class balance of the risk labels.
labelled.loc[:, 'post_risk'].value_counts()

post_risk
ideation     190
behavior     140
indicator    129
attempt       41
Name: count, dtype: int64

In [4]:
# Run the project preprocessing on the labelled posts and preview the result.
# NOTE(review): `labelled` is overwritten with its own preprocessed version, so
# re-running this cell preprocesses already-preprocessed data - confirm that is harmless.
labelled = DataPreprocessor.preprocess(labelled)
labelled.head(5)

2024-06-16 14:44:03,513 - INFO: Preprocesssing data
2024-06-16 14:44:03,515 - INFO: Removing HTML, non ASCI etc
2024-06-16 14:44:03,590 - INFO: Counting words
2024-06-16 14:44:03,658 - INFO: Replacing cateogries with one token labels: {'ideation': 'id', 'behavior': 'be', 'indicator': 'in', 'attempt': 'at'}


Unnamed: 0,post,post_risk,word_count
0,Just kill me. Please! Just end my life! I beg ...,id,14
1,"There is no today, nor tommorrow. I dont have ...",be,282
2,Life so full of contradictions that it's not w...,in,311
3,I think I'm going to kill myself soon. I don't...,be,54
4,whats the point of living. no really. is there...,id,142


In [5]:
# Summary statistics of post length (in words) for each risk label.
word_counts_by_risk = labelled.groupby('post_risk')['word_count']
word_counts_by_risk.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
post_risk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
at,41.0,338.341463,427.035046,11.0,55.0,179.0,447.0,2095.0
be,140.0,177.314286,186.416841,2.0,55.0,94.0,230.25,830.0
id,190.0,161.473684,170.525415,6.0,54.0,98.5,214.0,998.0
in,129.0,135.333333,144.509047,1.0,44.0,90.0,188.0,1120.0


In [6]:
from src.open_ai import create_new_posts, client

# Generating the posts is a long series of API calls (the recorded run took ~37 minutes),
# so reuse the CSV written by a previous run when it exists.
# Delete the file to force regeneration.
# NOTE(review): assumes INTERMEDIATE_DATA_PATH is a pathlib.Path and that a CSV
# round-trip preserves the columns used downstream - confirm.
generated_posts_path = paths.INTERMEDIATE_DATA_PATH / "prompt_generated.csv"
if generated_posts_path.exists():
    # The first CSV column is the index written by `to_csv` below.
    new_posts = pd.read_csv(generated_posts_path, index_col=0)
else:
    new_posts = create_new_posts(labelled, client)
    new_posts.to_csv(generated_posts_path)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [37:03<00:00,  4.45s/it, index=499]


In [8]:
# Continue with the generated posts. Work on a copy so that later in-place edits
# to `labelled` do not silently modify `new_posts` as well.
labelled = new_posts.copy()

In [11]:
# Append the " ->" separator that marks the end of each prompt.
# Posts that already end with it are left alone, so re-running this cell
# does not stack a second separator onto the prompts.
has_separator = labelled['post'].str.endswith(" ->", na=False)
labelled['post'] = labelled['post'].where(has_separator, labelled['post'] + " ->")

In [12]:
# File stems (without extension) for the raw export and the train/validation
# files that the data-preparation step is expected to produce.
file_name = "data_api"
train_file = f"{file_name}_prepared_train"
validation_file = f"{file_name}_prepared_valid"

import os
# Remove files left over from a previous run so later cells cannot pick up stale data.
# The names above are stems; the files on disk carry the ".jsonl" extension,
# so the extension must be included when checking for and deleting them.
for stem in [file_name, train_file, validation_file]:
    stale_path = f"{stem}.jsonl"
    if os.path.exists(stale_path):
        os.remove(stale_path)

# Export the labelled posts to JSONL for the preparation tool.
DataPreprocessor.to_json(labelled, f'{file_name}.jsonl')

#### OpenAI API tools
Next, we use the OpenAI API tools to prepare the data further.

In [13]:
!openai tools fine_tunes.prepare_data -f data_api.jsonl -q

Analyzing...

- Your file contains 5000 prompt-completion pairs
- Based on your data it seems like you're trying to fine-tune a model for classification
- For classification, we recommend you try one of the faster and cheaper models, such as `ada`
- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for training
- There are 42 duplicated prompt-completion sets. These are rows: [53, 54, 55, 57, 58, 59, 725, 1786, 1787, 1788, 1789, 1831, 1832, 1833, 1834, 1836, 1837, 1838, 1839, 2478, 2921, 2922, 2923, 2924, 2925, 2926, 2927, 2928, 2929, 2953, 2955, 2957, 2959, 3051, 3057, 3068, 3102, 3103, 3108, 3109, 3596, 4377]
- All prompts end with suffix ` ->`
- The completion should start with a whitespace character (` `). This tends to produce better results due to the tokenization we use. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more details

Based on the analysis we will perform the follo

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x["completion"] = x["completion"].apply(lambda s: ("" if s.startswith(" ") else " ") + s)


Next, we submit a fine-tuning job using the API and the preprocessed data. A range of models is available:

In [14]:
# Print the identifier of every model declared in the project's `Models` enum.
for available_model in Models:
    print(available_model.value)

gpt-3.5-turbo-0125
gpt-3.5-turbo-1106
gpt-3.5-turbo-0613
babbage-002
davinci-002
gpt-4-0613


In [15]:
from src.open_ai import client

# Upload the prepared train/validation files for fine-tuning.
# The training path is built from `file_name` rather than `train_file`, because
# `train_file` is rebound to the upload result below; this keeps the cell re-runnable.
# The `with` blocks make sure the file handles are closed after each upload.
with open(f"{file_name}_prepared_train.jsonl", "rb") as train_handle:
    train_file = client.files.create(file=train_handle, purpose="fine-tune")
with open(f"{validation_file}.jsonl", "rb") as valid_handle:
    valid_file = client.files.create(file=valid_handle, purpose="fine-tune")

In [16]:
# Start a fine-tuning job on the uploaded files and print the job description.
job_params = {
    "training_file": train_file.id,
    "validation_file": valid_file.id,
    "model": "davinci-002",
}
fine_tuning_job = client.fine_tuning.jobs.create(**job_params)
print(fine_tuning_job)

FineTuningJob(id='ftjob-DaX9oOjV9M0ii9YiGSs0QiU4', created_at=1718544621, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='davinci-002', object='fine_tuning.job', organization_id='org-FM8nE4u0PEcj6crKCzO0dFT7', result_files=[], seed=83554818, status='validating_files', trained_tokens=None, training_file='file-GN1vRDMtSH6Tk4r1XOfQ3Gh1', validation_file='file-7JL1cNcs08fityDGa9ggQcp2', estimated_finish=None, integrations=[], user_provided_suffix=None)


The fine-tuning job will take some time. You can view its progress at: https://platform.openai.com/finetune/

In [17]:
# Load the unlabelled test posts, run the same preprocessing, and preview them.
test = DataPreprocessor.preprocess(DataReader.get_test_set())
test.head(5)

2024-06-16 15:47:55,595 - INFO: Preprocesssing data
2024-06-16 15:47:55,599 - INFO: Removing HTML, non ASCI etc
2024-06-16 15:47:55,677 - INFO: Counting words


Unnamed: 0,post,word_count
0,My mind is hell. After my breakup 6 months ago...,307
1,What's the point?. It's been 548 days since my...,482
2,I'm scared therapy wont work. My parents reali...,115
3,so bye. love you bitches. only a few hours lef...,98
4,Relapsed after 6 months. I don't think I'll ev...,47


In [19]:
# Display the id of the fine-tuning job created above.
fine_tuning_job.id

'ftjob-DaX9oOjV9M0ii9YiGSs0QiU4'

In [20]:
# last job id 'ftjob-26SCSYKJlwPoNupKn5QvasUA'.
# Fetch the current state of the fine-tuning job and print it.
job_id = fine_tuning_job.id
fine_tune_results = client.fine_tuning.jobs.retrieve(job_id)
print(fine_tune_results)

FineTuningJob(id='ftjob-DaX9oOjV9M0ii9YiGSs0QiU4', created_at=1718544621, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:davinci-002:personal::9akQ6ELL', finished_at=1718545273, hyperparameters=Hyperparameters(n_epochs=3, batch_size=1, learning_rate_multiplier=16), model='davinci-002', object='fine_tuning.job', organization_id='org-FM8nE4u0PEcj6crKCzO0dFT7', result_files=['file-Ood5Et32elaKCgMrzikgXZC4'], seed=83554818, status='succeeded', trained_tokens=249657, training_file='file-GN1vRDMtSH6Tk4r1XOfQ3Gh1', validation_file='file-7JL1cNcs08fityDGa9ggQcp2', estimated_finish=None, integrations=[], user_provided_suffix=None)


In [22]:
ft_model = fine_tune_results.fine_tuned_model
# `fine_tuned_model` stays None until the job has succeeded, so fail with a clear
# message instead of sending a request without a model.
if ft_model is None:
    raise RuntimeError(
        f"Fine-tuning job {fine_tune_results.id} has status {fine_tune_results.status!r}; "
        "no fine-tuned model is available yet."
    )

# note that this calls the legacy completions api - https://platform.openai.com/docs/api-reference/completions
# Smoke-test the model on a single post. The prompt must be a plain string
# (not a pandas Series) and must end with the same ' ->' separator used in training.
res = client.completions.create(model=ft_model, prompt=test['post'].iloc[0] + ' ->', max_tokens=1, temperature=0, logprobs=10)

In [24]:
# Request a one-token completion (with the top log-probabilities) for every test post.
result = []
for post in test['post']:
    completion = client.completions.create(
        model=ft_model,
        prompt=post + ' ->',
        max_tokens=1,
        temperature=0,
        logprobs=10,
    )
    result.append(completion)

In [25]:
# Top log-probabilities of the first (and only) generated token for each post.
[completion.choices[0].logprobs.top_logprobs[0] for completion in result]

[{' in': -1.0821025,
  ' be': -1.1956604,
  ' id': -1.3661449,
  ' at': -2.4358518,
  ' on': -7.122365},
 {' in': -1.085778,
  ' id': -1.2341831,
  ' be': -1.3483684,
  ' at': -2.354759,
  ' on': -7.052865},
 {' id': -1.1686672,
  ' in': -1.1717838,
  ' be': -1.2583088,
  ' at': -2.517449,
  ' on': -7.205194},
 {' in': -1.0245367,
  ' be': -1.0514237,
  ' id': -1.675327,
  ' at': -2.4214358,
  ' on': -7.1242537},
 {' in': -1.125022,
  ' id': -1.2412816,
  ' be': -1.2432233,
  ' at': -2.4947224,
  ' on': -7.187089},
 {' in': -1.1696413,
  ' id': -1.1737878,
  ' be': -1.2599704,
  ' at': -2.5034196,
  ' on': -7.1956463},
 {' be': -0.4104143,
  ' in': -1.707503,
  ' id': -2.4530034,
  ' at': -3.0143037,
  ' on': -7.063235},
 {' in': -1.0496625,
  ' be': -1.0581225,
  ' id': -1.610937,
  ' at': -2.456006,
  ' on': -7.0595512},
 {' id': -0.027419496,
  ' in': -4.2176585,
  ' at': -5.0011687,
  ' be': -5.2659554,
  ' i': -9.779886},
 {' be': -1.1082782,
  ' in': -1.1188515,
  ' id': -1.39754

In [26]:
# Convert each completion into a per-label probability dict using the project helper.
probs = list(map(unpack_short, result))

In [27]:
# Preview the probability dicts of the first two posts.
probs[0:2]

[{'ideation': 0.259024367443994,
  'behavior': 0.3071716279483455,
  'indicator': 0.3441110895809018,
  'attempt': 0.08887360929007888},
 {'ideation': 0.29575833952441394,
  'behavior': 0.263843836666332,
  'indicator': 0.3430745602433139,
  'attempt': 0.09644441194475144}]

In [28]:
# Attach the predicted label and the probability list to every test post.
test['suicide risk'] = list(map(key_with_max_value, probs))
test['probability distribution'] = list(map(ordered_values, probs))
test

Unnamed: 0,post,word_count,suicide risk,probability distribution
0,My mind is hell. After my breakup 6 months ago...,307,indicator,"[0.34411, 0.25902, 0.30717, 0.08887]"
1,What's the point?. It's been 548 days since my...,482,indicator,"[0.34307, 0.29576, 0.26384, 0.09644]"
2,I'm scared therapy wont work. My parents reali...,115,ideation,"[0.31417, 0.31515, 0.28813, 0.0818]"
3,so bye. love you bitches. only a few hours lef...,98,indicator,"[0.36434, 0.19005, 0.35467, 0.09012]"
4,Relapsed after 6 months. I don't think I'll ev...,47,indicator,"[0.32946, 0.2933, 0.29273, 0.08374]"
...,...,...,...,...
95,"""Help me die. Hello, I'm sorry, I'm going to t...",175,indicator,"[0.34581, 0.24275, 0.32474, 0.08589]"
96,"I'm still here, but I had enough of this. Hey ...",128,indicator,"[0.35416, 0.25849, 0.29101, 0.09555]"
97,having a little bit of money helps but i'm sti...,275,ideation,"[0.30792, 0.32084, 0.28521, 0.08534]"
98,I need therapy more than anything else but I h...,117,ideation,"[0.28982, 0.37505, 0.26472, 0.06981]"


In [30]:
# Write the predictions to the Excel file, keeping the row index in an "index" column.
prediction_columns = ['suicide risk', 'probability distribution']
predictions = test[prediction_columns]
predictions.to_excel("Calculators.xlsx", index=True, index_label="index")

In [29]:
# Distribution of the predicted labels over the test set.
test.loc[:, 'suicide risk'].value_counts()

suicide risk
indicator    51
ideation     26
behavior     23
Name: count, dtype: int64