In [1]:
%%capture
# Install the notebook's dependencies. %%capture keeps pip's output out of the notebook.
# NOTE(review): versions are not pinned, so results may vary between runs/environments.
!pip install --upgrade openai
# !pip install --upgrade langchain
!pip install --upgrade python-dotenv
!pip install datasets -q
!pip install scikit-learn -q

In [3]:
# Clone the Mawqif stance repository (the blind-test CSV is read from it further down)
# and the AraBERT repository (provides arabert.preprocess.ArabertPreprocessor).
# git refuses to clone into an existing non-empty directory, so re-running this cell
# only prints "fatal: destination path ... already exists" and leaves the clones as they are.
!git clone https://github.com/NoraAlt/Mawqif-Arabic-Stance.git
!git clone https://github.com/aub-mind/arabert/

fatal: destination path 'Mawqif-Arabic-Stance' already exists and is not an empty directory.
fatal: destination path 'arabert' already exists and is not an empty directory.


In [119]:
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the Mawqif training split from the Hugging Face Hub as a DataFrame.
train_data = load_dataset("NoraAlt/Mawqif_Stance-Detection", split='train')
df = train_data.to_pandas()

# Tweets without a stance label form the "None" class. fillna covers both
# None and NaN, whichever way the missing labels arrive in the DataFrame.
df['stance'] = df['stance'].fillna('None')

# Integer codes are used only to stratify the split; the frames themselves keep
# the string labels, so nothing has to be mapped back after splitting.
mapping = {'None': 0, 'Favor': 1, 'Against': 2}
stance_codes = df['stance'].map(mapping)

# Fail loudly on labels outside the three known classes instead of letting
# NaN codes reach train_test_split.
unknown_labels = df.loc[stance_codes.isna(), 'stance'].unique()
if len(unknown_labels) > 0:
    raise KeyError(f"Unexpected stance label(s): {list(unknown_labels)}")
stance_codes = stance_codes.astype(int)

# 85% / 15% stratified split with a fixed seed for reproducibility.
train_df, val_df = train_test_split(df, test_size=0.15, stratify=stance_codes, random_state=42)

In [120]:
# Sanity check: (rows, columns) of the training and validation frames.
train_df.shape, val_df.shape

((2976, 14), (526, 14))

In [121]:
# Save the validation set for later evaluation; load_and_format_csv() reads this file back.
# At this point in the notebook no text preprocessing has been applied to val_df.
val_df.to_csv("val_data.csv", index=False)

In [310]:
from arabert.preprocess import ArabertPreprocessor
# Text preprocessor from the AraBERT package, configured with the "bert-base-arabertv02" model name.
# It is applied to the training text and again inside generate_predictions().
arabert_prep = ArabertPreprocessor(model_name="bert-base-arabertv02")

In [185]:
# Run every training tweet through the AraBERT preprocessor.
# val_df is not preprocessed here: generate_predictions() preprocesses the
# text of whichever frame it is given.
# NOTE(review): this overwrites train_df['text'], so re-running the cell
# preprocesses already-preprocessed text.
train_df['text'] = train_df['text'].apply(arabert_prep.preprocess)

In [126]:
# Show the first preprocessed training tweet.
print(train_df['text'].iloc[0])

محافظ سوهاج . . خلو قوائم ” العزل المنزلي ” من مرضى كورونا . . و تطعيم 2 مليون مواطن ضد الفيروس URL


In [315]:
import json

def create_gpt_dataset(df, system_prompt, file_path=None, mode='train'):
    """Convert a DataFrame of tweets into OpenAI chat-format examples.

    Each row becomes ``{"messages": [...]}`` holding a system turn
    (``system_prompt``) and a user turn (``row["text"]``). In ``'train'`` mode
    an assistant turn with ``row["stance"]`` is appended as the target.

    Args:
        df: DataFrame with a ``text`` column, plus a ``stance`` column when
            ``mode == 'train'``.
        system_prompt: Content of the system message used for every example.
        file_path: Where to write the examples as JSON Lines (one example per
            line). If ``None`` (the default) nothing is written.
        mode: ``'train'`` (include the assistant answer) or ``'test'``
            (system + user turns only).

    Returns:
        The list of example dicts, in row order.

    Raises:
        ValueError: If ``mode`` is neither ``'train'`` nor ``'test'``.
    """
    # Validate before touching the filesystem so that a bad call can neither
    # truncate an existing file nor slip through unnoticed on an empty frame.
    if mode not in ('train', 'test'):
        raise ValueError("Invalid mode. Mode must be either 'train' or 'test'.")

    messages = []
    for _, row in df.iterrows():
        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": row["text"]},
        ]
        if mode == 'train':
            conversation.append({"role": "assistant", "content": row["stance"]})
        messages.append({"messages": conversation})

    if file_path is not None:
        # json.dump escapes non-ASCII characters by default, so the file content
        # is plain ASCII; the explicit encoding just makes that platform-independent.
        with open(file_path, 'w', encoding='utf-8') as f:
            for example in messages:
                json.dump(example, f)
                f.write('\n')

    return messages


In [316]:
# System prompt used both in the fine-tuning examples and by format_test() at inference time.
# NOTE(review): the wording is part of the training data; changing it after fine-tuning would
# make inference prompts differ from what the model was trained on.
system_prompt = "You are an assistant that, given an Arabic tweet, detect the writers' stance (Favor, Against, or None). None means there is no evidence in the tweet to judge the author's stance, such as inquiries, or news that does not express any positive or negative position."

# Build the chat-formatted training examples and write them to train_instances.jsonl,
# the file that is uploaded for fine-tuning further down.
train_messages = create_gpt_dataset(train_df, system_prompt, file_path='train_instances.jsonl')
# val_messages = create_gpt_dataset(val_df, system_prompt, file_path='val_instances.jsonl', mode='test')

In [327]:
# Peek at the first five chat-formatted training examples.
for sample in train_messages[:5]:
    print(sample)

{'messages': [{'role': 'system', 'content': "You are an assistant that, given an Arabic tweet, detect the writers' stance (Favor, Against, or None). None means there is no evidence in the tweet to judge the author's stance, such as inquiries, or news that does not express any positive or negative position."}, {'role': 'user', 'content': 'محافظ سوهاج . . خلو قوائم ” العزل المنزلي ” من مرضى كورونا . . و تطعيم 2 مليون مواطن ضد الفيروس URL'}, {'role': 'assistant', 'content': 'None'}]}
{'messages': [{'role': 'system', 'content': "You are an assistant that, given an Arabic tweet, detect the writers' stance (Favor, Against, or None). None means there is no evidence in the tweet to judge the author's stance, such as inquiries, or news that does not express any positive or negative position."}, {'role': 'user', 'content': 'قوة التكنولوجيا و استعمال الذكاء الاصطناعي اللي موجودة لسه محدود الانتشار . . التحول الالكتروني حاليا متعطل علي قدرة و شجاعة الناس في التغيير . . التغيير في انك تتخلص من القد

In [136]:
from openai import OpenAI
from dotenv import load_dotenv
import os

# Load variables from a local .env file, if one exists, into the process environment.
# Without this call the imported load_dotenv is unused and a key stored in .env is never
# seen by os.getenv. Variables already set in the environment are not overridden.
load_dotenv()

# OpenAI client authenticated with the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [138]:
# Upload the training examples for fine-tuning. The context manager closes the
# file handle once the upload call returns instead of leaving it open.
with open('train_instances.jsonl', 'rb') as train_file:
    file_response = client.files.create(file=train_file, purpose='fine-tune')

In [330]:
# file_response

In [143]:
# Extract the file ID from the file_response.
# Placeholder: replace with the real ID of the uploaded training file before creating the job.
file_id = 'your_train_file_id'

In [144]:
# Start a fine-tuning job for gpt-3.5-turbo on the uploaded training file.
response = client.fine_tuning.jobs.create(
    training_file=file_id,
    model="gpt-3.5-turbo",
)

In [331]:
# response

In [146]:
# Extract the job ID from the response.
# Placeholder: replace with the real fine-tuning job ID before retrieving the job.
job_id = 'your_job_id'

In [332]:
# client.fine_tuning.jobs.retrieve(job_id)

In [None]:
# Placeholder: replace with the fine-tuned model's ID. To find it, run
# client.fine_tuning.jobs.retrieve(job_id) and copy the value of its `fine_tuned_model` key.
model_id = 'your_ft_model_id'

In [334]:
def set_openai_params(
    model="gpt-3.5-turbo",
    temperature=0,
    max_tokens=2,
    top_p=1,
):
    """Bundle chat-completion settings into a single dict.

    Args:
        model: Model identifier to query.
        temperature: Sampling temperature.
        max_tokens: Maximum number of tokens to generate.
        top_p: Nucleus-sampling probability mass.

    Returns:
        A dict with the keys ``model``, ``temperature``, ``max_tokens`` and
        ``top_p``, in that order.
    """
    return dict(
        model=model,
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
    )

def predict(params, messages):
    """Send one chat-completion request and return the reply text.

    Args:
        params: Dict with the keys ``model``, ``temperature``, ``max_tokens``
            and ``top_p`` (as produced by ``set_openai_params``).
        messages: List of chat messages to send.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    request_settings = {
        key: params[key]
        for key in ('model', 'temperature', 'max_tokens', 'top_p')
    }
    completion = client.chat.completions.create(messages=messages, **request_settings)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [333]:
# Load the blind test set shipped in the cloned Mawqif repository and show its shape.
blind_test_df = pd.read_csv('Mawqif-Arabic-Stance/Data/Mawqif_AllTargets_Blind Test.csv')
blind_test_df.shape

(619, 3)

In [309]:
# Request settings for the fine-tuned model; everything except `model` keeps
# the defaults declared by set_openai_params().
params = set_openai_params(model=model_id)

def format_test(row):
    """Build the chat messages for a single unlabeled tweet.

    Args:
        row: Mapping-like record exposing the tweet under the ``"text"`` key.

    Returns:
        A two-element list: the system turn (module-level ``system_prompt``)
        followed by the user turn holding ``row["text"]``.
    """
    system_turn = {"role": "system", "content": system_prompt}
    user_turn = {"role": "user", "content": row["text"]}
    return [system_turn, user_turn]

def generate_predictions(params, test_df, output_csv_path):
    """Predict a stance for every row of ``test_df`` and save the result.

    Works on a copy of ``test_df``: the text is run through ``arabert_prep``,
    each row is sent to the model via ``predict``, and missing predictions are
    replaced by ``'NONE'`` before all predictions are upper-cased.

    Args:
        params: Request settings passed through to ``predict``.
        test_df: DataFrame with ``ID``, ``target`` and ``text`` columns.
        output_csv_path: Destination file; written tab-separated with the
            header ``ID``, ``Target``, ``Tweet``, ``Stance`` and no index.
    """
    working = test_df.copy()

    working['text'] = working['text'].apply(arabert_prep.preprocess)
    working['prediction'] = None

    # One request per row; the answer is stored under the row's index label.
    for row_label, record in working.iterrows():
        chat = format_test(record)
        working.at[row_label, 'prediction'] = predict(params, messages=chat)

    # Missing answers count as the "NONE" class; labels are upper-cased.
    working['prediction'] = working['prediction'].fillna('NONE').str.upper()

    submission = working[['ID', 'target', 'text', 'prediction']].rename(
        columns={'target': 'Target', 'text': 'Tweet', 'prediction': 'Stance'}
    )
    submission.to_csv(output_csv_path, index=None, header=True, sep='\t')

# Predict on the held-out validation split. val_df's text was not preprocessed earlier
# in the notebook (that line is commented out); generate_predictions() does it on a copy.
generate_predictions(params, val_df, output_csv_path='gpt3.5_predictions.csv')

In [337]:
def load_and_format_csv(input_csv_path, output_csv_path):
    """Re-save a labelled CSV in the tab-separated layout used for evaluation.

    Missing ``stance`` values are replaced by ``'None'`` and every label is
    upper-cased. Only the ``ID``, ``target``, ``text`` and ``stance`` columns
    are kept; they are written under the header ``ID``, ``Target``, ``Tweet``,
    ``Stance`` without an index column.

    Args:
        input_csv_path: Comma-separated input file.
        output_csv_path: Destination of the tab-separated output file.
    """
    raw = pd.read_csv(input_csv_path)
    upper_labels = raw['stance'].fillna('None').str.upper()

    formatted = (
        raw.assign(stance=upper_labels)[['ID', 'target', 'text', 'stance']]
        .rename(columns={'target': 'Target', 'text': 'Tweet', 'stance': 'Stance'})
    )
    formatted.to_csv(output_csv_path, index=None, header=True, sep='\t')


# Convert the saved validation split into the tab-separated gold file passed to stanceEval.py.
load_and_format_csv('val_data.csv', 'validation.csv')

In [339]:
# Result on a 15% validation set.
# Both files were written relative to the working directory, so they are read
# from there too instead of from a hardcoded Colab path (/content/...).
# NOTE(review): stanceEval.py must be present in the working directory.

!python stanceEval.py validation.csv gpt3.5_predictions.csv



Results for Target: Women empowerment
FAVOR     precision: 0.8889 recall: 0.9677 f-score: 0.9266
AGAINST   precision: 0.8913 recall: 0.8723 f-score: 0.8817
------------
Macro F: 0.9042




Results for Target: Covid Vaccine
FAVOR     precision: 0.8889 recall: 0.9474 f-score: 0.9172
AGAINST   precision: 0.8617 recall: 0.9101 f-score: 0.8852
------------
Macro F: 0.9012




Results for Target: Digital Transformation
FAVOR     precision: 0.8855 recall: 0.9431 f-score: 0.9134
AGAINST   precision: 0.5000 recall: 0.4118 f-score: 0.4516
------------
Macro F: 0.6825




Overall Macro F1-score across all targets: 0.8293



In [None]:
# Generate predictions for the blind test data using the model fine-tuned on 85% of the
# training data. The output is written tab-separated to gpt3.5_blindTest_pred.csv.

generate_predictions(params, blind_test_df, output_csv_path='gpt3.5_blindTest_pred.csv')