<a href="https://colab.research.google.com/github/Nathan-Roll1/GreyBox/blob/main/Submission_GreyBox_SemEval_2024_Task_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Progressive Fine-tuning (for Multilingual Detection of Propaganda Techniques)**

**By:** Nathan Roll & Calbert Graham

[![GitHub Repo](https://img.shields.io/badge/GitHub-Nathan--Roll1%2FGreyBox-black?logo=github)](https://github.com/Nathan-Roll1/GreyBox)


## **Initialization**

In [None]:
# Installs
!pip install --upgrade openai langchain sklearn_hierarchical_classification

# Imports
import ast
import json
import os
import warnings  # For suppressing potential warnings
from concurrent.futures import ThreadPoolExecutor

import openai
from sklearn_hierarchical_classification import HierarchicalClassifier
from tqdm import tqdm

In [None]:
# Add OpenAI API key
# Prefer the OPENAI_API_KEY environment variable so a real key never has to be saved
# inside the notebook; the placeholder is only used when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY")

## **Functions**

In [None]:
def load_and_format_data(subtasks, data_dir='/content/drive/Shareddrives/SemEval4/data'):
    """Load the JSON splits of each subtask and tag every item with its origin.

    For every subtask the files ``train.json``, ``validation.json`` and
    ``dev_labeled_en.json`` are read from ``{data_dir}/{subtask}/``. Each file is
    expected to hold a JSON list of dict items; every item gets two extra keys,
    ``'subtask'`` (the subtask name) and ``'set'`` (the split it came from).

    Args:
        subtasks: Iterable of subtask directory names, e.g. ``['subtask1']``.
        data_dir: Root directory that contains one sub-directory per subtask.
            Defaults to the shared-drive location used by this notebook.

    Returns:
        A ``(data, data_dev)`` tuple of lists: ``data`` holds the items of the
        ``train`` and ``validation`` splits, ``data_dev`` those of
        ``dev_labeled_en``, both in subtask/split iteration order.

    Raises:
        FileNotFoundError: If one of the expected split files is missing.
        json.JSONDecodeError: If a split file is not valid JSON.
    """
    training_splits = ('train', 'validation')
    dev_splits = ('dev_labeled_en',)

    data = []
    data_dev = []

    for subtask in subtasks:
        for split in training_splits + dev_splits:
            file_path = f'{data_dir}/{subtask}/{split}.json'
            # Explicit UTF-8: the texts are multilingual, and the platform default
            # encoding is not guaranteed to decode them.
            with open(file_path, 'r', encoding='utf-8') as f:
                subset = json.load(f)

            for item in subset:
                item['subtask'] = subtask
                item['set'] = split

            if split in training_splits:
                data.extend(subset)
            else:
                data_dev.extend(subset)

    return data, data_dev

In [None]:
def create_jsonl_files(data, filename):
    """Write one chat-format fine-tuning example per line to ``filename``.

    Every item of ``data`` must provide ``'text'`` and ``'labels'``. Each output
    line is a JSON object with a ``"messages"`` list of three turns: a fixed
    system instruction, the item's text as the user turn, and ``str(labels)``
    as the assistant turn. The file is overwritten if it already exists.
    """
    system_turn = {"role": "system", "content": "Identify the propaganda labels in the memes"}

    with open(filename, 'w') as jsonl:
        for example in data:
            user_turn = {"role": "user", "content": example['text']}
            # The assistant target is the string form of the labels value.
            assistant_turn = {"role": "assistant", "content": str(example['labels'])}

            json.dump({"messages": [system_turn, user_turn, assistant_turn]}, jsonl)
            jsonl.write('\n')

In [None]:
def predict_labels(text, fine_tuned_model_id):
    """Ask a fine-tuned chat model for the propaganda labels of ``text``.

    Args:
        text: The text to classify; sent as the user message.
        fine_tuned_model_id: Identifier of the fine-tuned model to query.

    Returns:
        The raw text content of the model's first choice, or ``None`` when the
        request fails for any reason (the error is printed, not raised, so one
        failed item does not abort a whole batch).
    """
    try:
        # The request is built from `messages` and the answer is read from
        # `choices[0].message`, which is the chat-completions interface; the plain
        # `Completion` endpoint takes a `prompt` and has no `message` in its choices.
        response = openai.ChatCompletion.create(
            model=fine_tuned_model_id,
            temperature=0,  # Keep temperature at 0!
            messages=[
                {"role": "system", "content": 'Identify the propaganda labels in the memes'},
                {"role": "user", "content": text}
            ]
        )
        return response.choices[0].message.content

    except Exception as e:
        # Deliberately best-effort: report the failure and let the caller decide.
        print(f"Error predicting labels for '{text}': {e}")
        return None

In [None]:
def predict_and_update_data(data, fine_tuned_model_id):
    """Predict labels for every item of ``data`` and store them in place.

    One ``predict_labels`` request per item is submitted to a thread pool of 16
    workers; the results are then collected in input order and written to
    ``item['labels']`` as a list of label strings. Items whose prediction failed
    or came back empty get an empty list.

    Args:
        data: List of dict items, each with a ``'text'`` key. Mutated in place.
        fine_tuned_model_id: Identifier of the fine-tuned model to query.

    Returns:
        None. The predictions are written into ``data``.
    """
    with ThreadPoolExecutor(max_workers=16) as executor:
        futures = [executor.submit(predict_labels, item['text'], fine_tuned_model_id) for item in data]

        for future, item in tqdm(zip(futures, data), total=len(data)):
            item['labels'] = _parse_predicted_labels(future.result())


def _parse_predicted_labels(raw):
    """Convert the model's text answer into a clean list of label strings."""
    if not raw:
        return []  # Handle cases where prediction fails

    text = raw.strip()

    # The training target is the string form of the labels value, so a well-formed
    # answer is a Python literal. Parsing it as one keeps label names that contain
    # commas or apostrophes intact and maps "[]" to an empty list.
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        candidates = [str(label) for label in parsed]
    elif isinstance(parsed, str):
        candidates = [parsed]
    else:
        # Best-effort fallback for malformed answers: drop surrounding brackets
        # if present, remove single quotes and split on commas.
        if text.startswith('[') and text.endswith(']'):
            text = text[1:-1]
        candidates = text.replace("'", '').split(',')

    cleaned = (label.strip() for label in candidates)
    return [label for label in cleaned if label]

In [None]:
def process_test_data(test_data_path, output_path, fine_tuned_model_id):
    """Load unlabeled test data, predict its labels, and save the result.

    Args:
        test_data_path: Path to a JSON file holding a list of items, each with
            at least ``'id'`` and ``'text'``.
        output_path: Path of the JSON file to write; it receives a list of
            ``{'id': ..., 'labels': [...]}`` objects, indented by 4 spaces.
        fine_tuned_model_id: Identifier of the fine-tuned model used to predict.

    Returns:
        None. The predictions are written to ``output_path``.
    """
    # Explicit UTF-8 so non-English test sets decode regardless of platform default.
    with open(test_data_path, 'r', encoding='utf-8') as f:
        test_data = json.load(f)

    # Fills item['labels'] in place; without this step the unlabeled items have
    # no 'labels' key for the dump below.
    predict_and_update_data(test_data, fine_tuned_model_id)

    with open(output_path, 'w') as f:
        json.dump([{'id': x['id'], 'labels': x['labels']} for x in test_data], f, indent=4)

## **Implementation**

### Load the data

In [None]:
# Subtask directory names to load; each one is read from its own folder.
subtasks = ['subtask1', 'subtask2a', 'subtask2b']
# `data` receives the train + validation items, `data_dev` the dev_labeled_en items.
data, data_dev = load_and_format_data(subtasks)

### Format the data for fine-tuning on the OpenAI API

In [None]:
# Training examples (train + validation splits), uploaded as the training file below.
create_jsonl_files(data, 'semeval4_1.jsonl')
# Dev examples, uploaded as the validation file of the fine-tuning job below.
create_jsonl_files(data_dev, 'semeval4_1_dev.jsonl')

### Create fine-tuning jobs (this will charge your OpenAI Account!)

In [None]:
with warnings.catch_warnings():  # Temporarily suppress potential warnings
    warnings.simplefilter('ignore')

    # Upload both JSONL files; the context managers close the local handles
    # once each upload call has returned.
    with open("semeval4_1.jsonl", "rb") as train_fh:
        train_file = openai.File.create(file=train_fh, purpose="fine-tune")
    with open("semeval4_1_dev.jsonl", "rb") as valid_fh:
        valid_file = openai.File.create(file=valid_fh, purpose="fine-tune")

    # Chat models are fine-tuned through the fine-tuning *jobs* endpoint
    # (`FineTuningJob`); the older `FineTune` endpoint does not accept them.
    # On the jobs endpoint the epoch count is passed inside `hyperparameters`.
    fine_tune_job = openai.FineTuningJob.create(
        training_file=train_file["id"],
        validation_file=valid_file["id"],
        model="ft:gpt-3.5-turbo-1106:nathannet:all-subtasks:8ljQqAmF",  # Use a suitable model
        suffix="sub1_noleakage",
        hyperparameters={"n_epochs": 2},
    )

### Predict using fine-tuned model

In [None]:
# `data_v` was never defined in this notebook, so the cell failed on a fresh kernel.
# NOTE(review): it is assumed to be the subtask-1 dev items, because the scoring cell
# compares this output against subtask1/dev_labeled_en.json -- confirm.
# Each item is copied so the predictions do not overwrite the gold labels in `data_dev`.
data_v = [dict(item) for item in data_dev if item['subtask'] == 'subtask1']

predict_and_update_data(data_v, "ft:gpt-3.5-turbo-1106:nathannet:sub1-noleakage:8ln2kucx")

# Save updated data
with open('data_v.json.txt', 'w') as f:
    json.dump([{'id': x['id'], 'labels': x['labels']} for x in data_v], f, indent=4)

### Evaluate model performance on dev set

In [None]:
# Run the scorer script on the predictions saved above (data_v.json.txt),
# using the labeled subtask-1 English dev file as the gold standard.
!python3 drive/Shareddrives/SemEval4/scorer-baseline/subtask_1_2a.py \
    --gold_file_path /content/drive/Shareddrives/SemEval4/data/subtask1/dev_labeled_en.json \
    --pred_file_path data_v.json.txt

### Example Usage

In [None]:
# Model used for every test-set prediction below.
submission_model_id = "ft:gpt-3.5-turbo-1106:nathannet:sub1-noleakage:8ln2kucx"  # Adjust model ID

# (unlabeled test file, prediction file to write) pairs, processed in order.
submission_jobs = [
    (
        '/content/drive/Shareddrives/SemEval4/test_data/english/en_subtask1_test_unlabeled.json',
        'engl_sub1_test.json.txt',
    ),
    (
        '/content/drive/Shareddrives/SemEval4/test_data/north_macedonian/mk_subtask1_test_unlabeled.json',
        'nm_sub1_test.json.txt',
    ),
]

for unlabeled_path, predictions_path in submission_jobs:
    process_test_data(unlabeled_path, predictions_path, submission_model_id)