In [113]:
!pip install --upgrade openai



In [129]:
import pandas as pd
import openai
import matplotlib
import seaborn

In [130]:
# Load the Multitarget-CONAN hate-speech / counter-narrative pairs from a CSV
# located in the current working directory.
df = pd.read_csv('Multitarget-CONAN.csv')

# Exploratory Analysis

In [131]:
df.head()

Unnamed: 0,INDEX,HATE_SPEECH,COUNTER_NARRATIVE,TARGET,VERSION
0,0,Maybe the UN could talk to those asian and afr...,"You're right, ocean pollution is one of the ma...",other,V1
1,1,‘Juice (Jews) are worse that nukes.’ Caption t...,You'd rather be nuked then?,JEWS,V1
2,2,Foreigners on UK benefits leaps 41% in 5 years...,You seem to be worried about government expend...,MIGRANTS,V1
3,3,"Jews cannot be patriots, since their allegianc...",You can have parents and grandparents born els...,JEWS,V1
4,4,"Israel, Zionism and Jewish Supremacist control...",Wouldn't you say its colonialism from many dif...,JEWS,V1


In [132]:
df.keys()

Index(['INDEX', 'HATE_SPEECH', 'COUNTER_NARRATIVE', 'TARGET', 'VERSION'], dtype='object')

In [133]:
df.shape

(5003, 5)

In [134]:
df['TARGET']

0          other
1           JEWS
2       MIGRANTS
3           JEWS
4           JEWS
          ...   
4998       WOMEN
4999       WOMEN
5000       WOMEN
5001       WOMEN
5002       WOMEN
Name: TARGET, Length: 5003, dtype: object

In [135]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5003 entries, 0 to 5002
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   INDEX              5003 non-null   int64 
 1   HATE_SPEECH        5003 non-null   object
 2   COUNTER_NARRATIVE  5003 non-null   object
 3   TARGET             5003 non-null   object
 4   VERSION            5003 non-null   object
dtypes: int64(1), object(4)
memory usage: 195.6+ KB


In [136]:
df['TARGET'].value_counts()

MUSLIMS     1335
MIGRANTS     957
WOMEN        662
LGBT+        617
JEWS         594
POC          352
other        266
DISABLED     220
Name: TARGET, dtype: int64

# Data Preparation

In [137]:
# Discard the index, target-group and version columns so that only the two text
# columns (hate speech and counter-narrative) remain.
unused_columns = ['INDEX', 'TARGET', 'VERSION']
df = df.drop(columns=unused_columns)

In [138]:
df.head()

Unnamed: 0,HATE_SPEECH,COUNTER_NARRATIVE
0,Maybe the UN could talk to those asian and afr...,"You're right, ocean pollution is one of the ma..."
1,‘Juice (Jews) are worse that nukes.’ Caption t...,You'd rather be nuked then?
2,Foreigners on UK benefits leaps 41% in 5 years...,You seem to be worried about government expend...
3,"Jews cannot be patriots, since their allegianc...",You can have parents and grandparents born els...
4,"Israel, Zionism and Jewish Supremacist control...",Wouldn't you say its colonialism from many dif...


In [139]:
# Rename by column name rather than by position, so the prompt/completion labels
# cannot land on the wrong column if the column order or count ever changes.
# prompt = hate speech, completion = counter-narrative.
df = df.rename(columns={'HATE_SPEECH': 'prompt', 'COUNTER_NARRATIVE': 'completion'})

In [140]:
df.head()

Unnamed: 0,prompt,completion
0,Maybe the UN could talk to those asian and afr...,"You're right, ocean pollution is one of the ma..."
1,‘Juice (Jews) are worse that nukes.’ Caption t...,You'd rather be nuked then?
2,Foreigners on UK benefits leaps 41% in 5 years...,You seem to be worried about government expend...
3,"Jews cannot be patriots, since their allegianc...",You can have parents and grandparents born els...
4,"Israel, Zionism and Jewish Supremacist control...",Wouldn't you say its colonialism from many dif...


In [141]:
# Export the frame as JSON Lines: one {"prompt": ..., "completion": ...} object per
# row, which is the file handed to the OpenAI data-preparation tool further down.
df.to_json("hs_cs.jsonl", orient='records', lines=True)

In [142]:
# Total number of space-separated tokens across all completions.
# Computed with pandas string methods instead of a loop accumulating into a variable
# named `sum`, which shadowed the builtin sum() for the rest of the kernel session.
# int(...) keeps the displayed value a plain Python integer.
completion_word_count = int(df['completion'].str.split(' ').str.len().sum())
completion_word_count

123920

In [143]:
# Total number of space-separated tokens across all prompts.
# Computed with pandas string methods instead of a loop accumulating into a variable
# named `sum`, which shadowed the builtin sum() for the rest of the kernel session.
# int(...) keeps the displayed value a plain Python integer.
prompt_word_count = int(df['prompt'].str.split(' ').str.len().sum())
prompt_word_count

66165

## Data Preparation Tool

In [144]:
# Run the OpenAI CLI data-preparation tool on the exported pairs; the split cell
# below reads its result from hs_cs_prepared.jsonl.
# NOTE(review): -q presumably applies the tool's suggested fixes without prompting — confirm.
!openai tools fine_tunes.prepare_data -f hs_cs.jsonl -q

Analyzing...

- Your file contains 5003 prompt-completion pairs
- There are 1 duplicated prompt-completion sets. These are rows: [385]
- Your data does not contain a common separator at the end of your prompts. Having a separator string appended to the end of the prompt makes it clearer to the fine-tuned model where the completion should begin. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples. If you intend to do open-ended generation, then you should leave the prompts empty
- Your data does not contain a common ending at the end of your completions. Having a common ending string appended to the end of the completion makes it clearer to the fine-tuned model where the completion should end. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples.
- The completion should start with a whitespace character (` `). This tends to produce better results due to the tokenization we use.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x["prompt"] += suffix
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x["completion"] += suffix
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x["completion"] = x["completion"].apply(


In [145]:
import json
from sklearn.model_selection import train_test_split


def write_jsonl(path, records):
    """Write `records` to `path` as JSON Lines (one JSON document per line).

    Args:
        path: Destination file; created or overwritten.
        records: Iterable of JSON-serialisable objects.
    """
    # Explicit UTF-8 (matching the read below) so the result does not depend on
    # the platform's default text encoding.
    with open(path, 'w', encoding='utf8') as f:
        for example in records:
            f.write(json.dumps(example) + '\n')


# Load the prepared prompt/completion pairs, one JSON object per line.
# Blank lines (e.g. a trailing newline at end of file) are skipped instead of
# crashing json.loads.
with open('hs_cs_prepared.jsonl', 'r', encoding='utf8') as f:
    data = [json.loads(line) for line in f if line.strip()]

# 80/20 train/validation split; the fixed random_state keeps it reproducible.
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

write_jsonl('hs_cs_train.jsonl', train_data)
write_jsonl('hs_cs_val.jsonl', val_data)

# Fine-tuning

In [146]:
import os
from getpass import getpass

# Never commit a real key into the notebook: reuse OPENAI_API_KEY when the
# environment already provides it, otherwise prompt for it without echoing.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

# The environment variable is what the `!openai ...` shell commands inherit; the
# already-imported Python client gets the key set on the module explicitly.
# NOTE(review): the pre-1.0 `openai` package appears to read OPENAI_API_KEY only once,
# at import time, so setting the variable after `import openai` is not enough — confirm.
openai.api_key = os.environ['OPENAI_API_KEY']

In [169]:
openai.File.list()

<OpenAIObject list at 0x299e73b8f90> JSON: {
  "data": [
    {
      "bytes": 1019363,
      "created_at": 1682380810,
      "filename": "hs_cs_train.jsonl",
      "id": "file-rZ0lJZbusAGAO5PYYKrlKX7f",
      "object": "file",
      "purpose": "fine-tune",
      "status": "processed",
      "status_details": null
    },
    {
      "bytes": 116611,
      "created_at": 1682381611,
      "filename": "compiled_results.csv",
      "id": "file-cF0EG0XKY9H8n8uufhqQYkEZ",
      "object": "file",
      "purpose": "fine-tune-results",
      "status": "processed",
      "status_details": null
    },
    {
      "bytes": 267996,
      "created_at": 1682398423,
      "filename": "compiled_results.csv",
      "id": "file-kwTJSf4rYzYMw7YdzqsjQlGY",
      "object": "file",
      "purpose": "fine-tune-results",
      "status": "processed",
      "status_details": null
    },
    {
      "bytes": 146852,
      "created_at": 1682386164,
      "filename": "compiled_results.csv",
      "id": "file-9ehXUAC

In [172]:
# Start a davinci fine-tune on the uploaded training split. The file id below is the
# "hs_cs_train.jsonl" entry shown in the openai.File.list() output above.
# No validation file is passed, so hs_cs_val.jsonl is not used by this job.
# CLI alternative kept for reference (note that it points at the unsplit prepared file):
# !openai api fine_tunes.create -t "hs_cs_prepared.jsonl" -m davinci
openai.FineTune.create(training_file="file-rZ0lJZbusAGAO5PYYKrlKX7f", model="davinci", suffix="hs_cs_v2")
# Id of the job created by the recorded run of this cell: ft-1tXMipvwdtn4aJfpPvuqR3hN

<FineTune fine-tune id=ft-1tXMipvwdtn4aJfpPvuqR3hN at 0x299e7ab13b0> JSON: {
  "created_at": 1682435178,
  "events": [
    {
      "created_at": 1682435178,
      "level": "info",
      "message": "Created fine-tune: ft-1tXMipvwdtn4aJfpPvuqR3hN",
      "object": "fine-tune-event"
    }
  ],
  "fine_tuned_model": null,
  "hyperparams": {
    "batch_size": null,
    "learning_rate_multiplier": null,
    "n_epochs": 4,
    "prompt_loss_weight": 0.01
  },
  "id": "ft-1tXMipvwdtn4aJfpPvuqR3hN",
  "model": "davinci",
  "object": "fine-tune",
  "organization_id": "org-X0kzGQIW73wa0UvE4Oz6MEyu",
  "result_files": [],
  "status": "pending",
  "training_files": [
    {
      "bytes": 1019363,
      "created_at": 1682380810,
      "filename": "hs_cs_train.jsonl",
      "id": "file-rZ0lJZbusAGAO5PYYKrlKX7f",
      "object": "file",
      "purpose": "fine-tune",
      "status": "processed",
      "status_details": null
    }
  ],
  "updated_at": 1682435178,
  "validation_files": []
}

In [153]:
!openai api fine_tunes.follow -i "ft-cqR54P9XnU4a56juBB0mUrYn"

[2023-04-24 20:09:35] Created fine-tune: ft-cqR54P9XnU4a56juBB0mUrYn
[2023-04-24 20:09:47] Fine-tune costs $2.84
[2023-04-24 20:09:47] Fine-tune enqueued. Queue number: 0
[2023-04-24 20:09:49] Fine-tune started
[2023-04-24 20:15:25] Completed epoch 1/4
[2023-04-24 20:19:59] Completed epoch 2/4
[2023-04-24 20:24:32] Completed epoch 3/4
[2023-04-24 20:29:23] Uploaded model: curie:ft-personal-2023-04-25-01-29-23
[2023-04-24 20:29:24] Uploaded result file: file-9ehXUACHFHwZKlTFjQaGaFpI
[2023-04-24 20:29:24] Fine-tune succeeded


Traceback (most recent call last):
  File "D:\Programs\anaconda3\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "D:\Programs\anaconda3\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "D:\Programs\anaconda3\Scripts\openai.exe\__main__.py", line 7, in <module>
  File "D:\Programs\anaconda3\lib\site-packages\openai\_openai_scripts.py", line 70, in main
    args.func(args)
  File "D:\Programs\anaconda3\lib\site-packages\openai\cli.py", line 516, in follow
    cls._stream_events(args.id)
  File "D:\Programs\anaconda3\lib\site-packages\openai\cli.py", line 559, in _stream_events
    sys.stdout.write("\nJob complete! Status: succeeded \U0001f389")
  File "D:\Programs\anaconda3\lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\U0001f389' in position 34: character maps to <undefined>


In [154]:
!openai api fine_tunes.get -i "ft-cqR54P9XnU4a56juBB0mUrYn"

{
  "created_at": 1682384975,
  "events": [
    {
      "created_at": 1682384975,
      "level": "info",
      "message": "Created fine-tune: ft-cqR54P9XnU4a56juBB0mUrYn",
      "object": "fine-tune-event"
    },
    {
      "created_at": 1682384987,
      "level": "info",
      "message": "Fine-tune costs $2.84",
      "object": "fine-tune-event"
    },
    {
      "created_at": 1682384987,
      "level": "info",
      "message": "Fine-tune enqueued. Queue number: 0",
      "object": "fine-tune-event"
    },
    {
      "created_at": 1682384989,
      "level": "info",
      "message": "Fine-tune started",
      "object": "fine-tune-event"
    },
    {
      "created_at": 1682385325,
      "level": "info",
      "message": "Completed epoch 1/4",
      "object": "fine-tune-event"
    },
    {
      "created_at": 1682385599,
      "level": "info",
      "message": "Completed epoch 2/4",
      "object": "fine-tune-event"
    },
    {
      "created_at": 1682385872,
      "level": "info",


In [171]:
!openai api fine_tunes.list

{
  "data": [
    {
      "created_at": 1682380810,
      "fine_tuned_model": "ada:ft-personal:hatespeech-counterspeech-2023-04-25-00-13-31",
      "hyperparams": {
        "batch_size": 8,
        "learning_rate_multiplier": 0.1,
        "n_epochs": 4,
        "prompt_loss_weight": 0.01
      },
      "id": "ft-bj9mfD3ofGuxY6DGAk7E6bVt",
      "model": "ada",
      "object": "fine-tune",
      "organization_id": "org-X0kzGQIW73wa0UvE4Oz6MEyu",
      "result_files": [
        {
          "bytes": 116611,
          "created_at": 1682381611,
          "filename": "compiled_results.csv",
          "id": "file-cF0EG0XKY9H8n8uufhqQYkEZ",
          "object": "file",
          "purpose": "fine-tune-results",
          "status": "processed",
          "status_details": null
        }
      ],
      "status": "succeeded",
      "training_files": [
        {
          "bytes": 1019363,
          "created_at": 1682380810,
          "filename": "hs_cs_train.jsonl",
          "id": "file-rZ0lJZbusA

In [184]:
!openai api fine_tunes.results -i "ft-1tXMipvwdtn4aJfpPvuqR3hN"

step,elapsed_tokens,elapsed_examples,training_loss,training_sequence_accuracy,training_token_accuracy
1,520,8,1.0157604844857633,0.0,0.43891402714932126
2,1232,16,0.968589484856235,0.0,0.4031620553359684
3,1816,24,1.1223166182561515,0.0,0.46153846153846156
4,2400,32,1.0937338127465288,0.0,0.4074074074074074
5,2984,40,1.164553716138856,0.0,0.45454545454545453
6,3632,48,0.8383720918190238,0.0,0.44144144144144143
7,4216,56,0.9911226583999976,0.0,0.47058823529411764
8,4992,64,0.9359179833330091,0.0,0.4711864406779661
9,5704,72,0.768012167056344,0.0,0.463519313304721
10,6352,80,1.0717841769946026,0.0,0.48
11,7064,88,1.0471723778455369,0.0,0.39492753623188404
12,7520,96,0.8224131713044854,0.0,0.5028248587570622
13,7912,104,1.0638691563808484,0.0,0.48554913294797686
14,8624,112,0.8301719688164915,0.0,0.4666666666666667
15,9656,120,0.6937756579107582,0.0,0.41637010676156583
16,10368,128,0.7304243453750943,0.0,0.4956140350877193
17,11208,136,0.7506639002731302,0.0,0.47058823529411764
18,11728,1

In [None]:
ft_model = "davinci:ft-personal:hs-cs-v2-2023-04-25-16-06-31"

In [225]:
# Generate a counter-narrative for one hate-speech prompt with the fine-tuned model.
# Uses `ft_model` from the cell above instead of repeating the model id literal.
# The prompt ends with " ->"; generation stops at the first of the stop sequences
# or after max_tokens tokens.
res = openai.Completion.create(
    model=ft_model,
    prompt="There can be no world peace until the last muslim has been removed from the earth. ->",
    max_tokens=80,
    stop=['->', '[', '=>'],
    temperature=0.3,
    presence_penalty=-0.8,
    frequency_penalty=2,
)


In [227]:
# Keep the generated text of the first returned choice (the counter-speech output).
cs_out = res['choices'][0]['text']

# Topic Extraction

str

In [198]:
from keybert import KeyBERT
kw_model = KeyBERT()

In [236]:
# Extract single-word keyphrases from the generated counter-speech and print only
# the phrases (first element of each result), separated by spaces.
# Sample text used while developing:
# """The last time I checked, the world was in a state of peace. Maybe you should check your facts."""
doc = cs_out
keywords = kw_model.extract_keywords(
    doc,
    keyphrase_ngram_range=(1, 1),
    stop_words=None,
    highlight=True,
)
print(' '.join(entry[0] for entry in keywords))

peace peaceful world muslims religion


In [None]:
# delicate savage / you'll never hold the cinder / but still you will burn $
# our destination / the skyline of this city / shining horizon

In [249]:
# Extract 1- to 2-word keyphrases from a sample haiku and print the phrases of the
# first two results, comma-separated.
doc = "our destination / the skyline of this city / shining horizon"
keywords = kw_model.extract_keywords(
    doc,
    keyphrase_ngram_range=(1, 2),
    stop_words=None,
    highlight=True,
)
first_two = keywords[:2]
print(','.join(entry[0] for entry in first_two))
# print(keywords[:2])

shining horizon,city shining


In [241]:
from phonemizer import phonemize
from phonemizer.separator import Separator
# Sample three-line haiku; note that the continuation lines keep their leading
# spaces inside the string literal.
doc = """Awaken before dawn.
         I hear the city rising.
         The new day begins."""
# Phonemize with the festival backend, using "|" as the syllable separator and a
# space as the word separator. The recorded run failed with "festival not installed
# on your system", so festival must be installed on the machine before this cell works.
phn = phonemize(doc, language='en-us', backend='festival',
                with_stress=False, separator=Separator(phone=None,
                word=' ', syllable="|"), strip=True)
print(phn)
# Output:

RuntimeError: festival not installed on your system

In [None]:
# Haiku draft, kept as comments: as bare text this code cell is not valid Python
# and would raise SyntaxError when the notebook is run top to bottom.
# Fear and hate spread
# Around the world, unchecked.
# Muslims are not safe.

# Haiku Creation