In [6]:
import os
import json
import requests
from tqdm.auto import tqdm

In [7]:
import urllib3
# Silence urllib3's InsecureRequestWarning, which would otherwise be reported
# for the API calls below that are sent with verify=False.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Exploring Allam APIs

In [3]:
# Read the API bearer token from the environment instead of hardcoding it, so the
# secret never ends up in version control or in shared copies of this notebook.
# Set it before starting Jupyter, e.g. `export ALLAM_ACCESS_TOKEN=...`.
# NOTE(review): the token that used to be committed on this line should be
# treated as leaked and rotated.
ACCESS_TOKEN = os.environ.get("ALLAM_ACCESS_TOKEN", "")
if not ACCESS_TOKEN:
    print("ALLAM_ACCESS_TOKEN is not set; API requests will be sent without a valid token.")

In [4]:
def get_chat_completion(messages, temperature=0.0, model="allam", timeout=150):
    """Send one chat-completion request to the Allam endpoint.

    Args:
        messages: List of chat messages, each a dict with 'role' and
            'content' keys.
        temperature: Sampling temperature sent in the payload (default 0.0).
        model: Model name sent in the payload (default "allam").
        timeout: Timeout in seconds passed to `requests.post` (default 150).

    Returns:
        The decoded JSON response body when the server answers with HTTP 200.
        The generated text is found under
        ``['choices'][0]['message']['content']``.
        On any other status code the status and response body are printed and
        None is returned.

    Note:
        Exceptions raised by `requests.post` itself (e.g. on timeout) are not
        caught here and propagate to the caller.
    """
    payload = {
        "messages": messages,
        "temperature": temperature,
        "stream": False,
        "model": model,
        "n": 1,
        "add_generation_prompt": True,
        "echo": False,
        "stop": " </s>",
    }
    response = requests.post(
        'https://vllm-v19.allam.ai/v1/chat/completions',
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {ACCESS_TOKEN}"
        },
        data=json.dumps(payload),
        timeout=timeout,
        # NOTE(review): TLS certificate verification is disabled, which exposes
        # the bearer token to man-in-the-middle attacks. Prefer verify=True (or a
        # CA bundle path) once the endpoint's certificate chain is trusted.
        verify=False,
    )
    # Anything other than HTTP 200 is reported and signalled with None.
    if response.status_code != 200:
        print(f"Request failed with status code {response.status_code}")
        print(response.text)
        return None
    return response.json()

Test the API.

In [5]:
# Smoke test: send a single user question and print the raw API response.
sample_messages = [{'role': 'user', 'content': 'ما هو لون نجوم السماء؟'}]
print(get_chat_completion(sample_messages))

{'id': 'cmpl-e0fc83c59353419ab8472405862135c3', 'object': 'chat.completion', 'created': 1730303012, 'model': 'allam', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': ' لون النجوم في السماء يعتمد على درجة حرارة سطح النجم. النجوم الأكثر سخونة تظهر باللون الأزرق، بينما النجوم الأقل حرارة تظهر باللون الأحمر. النجوم الصفراء هي الأكثر شيوعاً وتمثل النجوم ذات درجات حرارة سطح متوسطة. إذا نظرت إلى السماء ليلاً، سترى مجموعة متنوعة من الألوان بسبب درجات الحرارة المختلفة للنجوم. '}, 'logprobs': None, 'finish_reason': 'stop', 'stop_reason': None}], 'usage': {'prompt_tokens': 15, 'total_tokens': 78, 'completion_tokens': 63}}


# Exploring the dataset

In [6]:
import datasets

In [7]:
# Load the AraSum dataset; the last expression displays its splits and columns.
DATASET_NAME = 'arbml/AraSum'
dataset = datasets.load_dataset(DATASET_NAME)
dataset

DatasetDict({
    train: Dataset({
        features: ['index', 'summary', 'article'],
        num_rows: 49603
    })
})

In [8]:
# Pull the article and summary columns out of the 'train' split as two
# parallel sequences and display their lengths.
train_split = dataset['train']
articles, summaries = train_split['article'], train_split['summary']
len(articles), len(summaries)

(49603, 49603)

In [9]:
# Max / min / mean number of whitespace-separated words per summary.
summary_word_counts = [len(text.split()) for text in summaries]
max_summaries_words = max(summary_word_counts)
min_summaries_words = min(summary_word_counts)
avg_summaries_words = sum(summary_word_counts) / len(summaries)
max_summaries_words, min_summaries_words, avg_summaries_words

(45, 2, 33.53730621131786)

In [10]:
# Max / min / mean number of whitespace-separated words per article.
article_word_counts = [len(text.split()) for text in articles]
max_articles_words = max(article_word_counts)
min_articles_words = min(article_word_counts)
avg_articles_words = sum(article_word_counts) / len(articles)
max_articles_words, min_articles_words, avg_articles_words

(2898, 33, 376.54426143580025)

In [11]:
# Keep only the (article, summary) pairs whose article has 101-499 words and
# whose summary has 21-49 words (both bounds are exclusive in the comparison).
kept_pairs = [
    (article_text, summary_text)
    for article_text, summary_text in zip(articles, summaries)
    if 100 < len(article_text.split()) < 500
    and 20 < len(summary_text.split()) < 50
]
considered_articles = [pair[0] for pair in kept_pairs]
considered_summaries = [pair[1] for pair in kept_pairs]
len(considered_articles), len(considered_summaries)

(36873, 36873)

In [12]:
# Work on the first 3000 filtered pairs only. Note that `articles` and
# `summaries` are rebound here and no longer refer to the full dataset.
SAMPLE_SIZE = 3000
articles, summaries = (
    considered_articles[:SAMPLE_SIZE],
    considered_summaries[:SAMPLE_SIZE],
)
len(articles), len(summaries)

(3000, 3000)

# Generating the dataset

First, we generate new articles by asking the model to polish (rephrase and proofread) each original article.

In [13]:
def save_articles_to_jsonl(articles, file_path):
    """Write `articles` to `file_path` as JSON Lines, replacing any existing file.

    Args:
        articles: Iterable of JSON-serialisable objects; each becomes one line.
        file_path: Destination path. Missing parent directories are created.

    The file is opened in write mode, so previous contents are overwritten
    (not appended to). Non-ASCII text is written as-is in UTF-8.
    """
    # Make sure the target directory exists; open() would otherwise fail with
    # FileNotFoundError. A bare filename has no parent directory to create.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'w', encoding='utf-8') as f:
        for article in articles:
            f.write(json.dumps(article, ensure_ascii=False) + '\n')

def load_articles_from_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        A list with one decoded object per non-blank line, in file order.
        An empty list is returned when `file_path` does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not os.path.exists(file_path):
        return []
    with open(file_path, 'r', encoding='utf-8') as f:
        # Skip blank / whitespace-only lines (e.g. a stray trailing newline),
        # which json.loads would reject.
        return [json.loads(line) for line in f if line.strip()]

In [20]:
def _build_polishing_prompt(article):
    """Return the Arabic rephrase-and-proofread instruction followed by `article`."""
    return f"""
قم بإعادة صياغة هذه المقالة التالية، مراعيا تدقيقها لغويا ونحويا وبنائيا ومعنى، المقالة كتبت لتنشر في الصحافة.
قم بإعادة الصياغة فقط دون إضافة أي تعبيرات أخرى قبل إعادة الصياغة أو بعدها
المقالة:
{article}
    """.strip()


# One prompt per selected article, in the same order as `articles`.
articles_by_polishing_prompts = [
    _build_polishing_prompt(article_text) for article_text in articles
]


# Main process: generate one polished article per prompt, resuming from the
# records already stored in `file_path` by a previous run.
file_path = "generated_arabic_datasets/allam/arasum/generated_articles_from_polishing.jsonl"

# Portable replacement for the former `!mkdir -p generated_allam_data`.
os.makedirs("generated_allam_data", exist_ok=True)
# The output file lives in a different directory than the one above, so create
# its parent directory too; otherwise the first write fails on a fresh checkout.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

generated_articles_from_polishing = load_articles_from_jsonl(file_path)
# Resume is position-based: record k in the file corresponds to articles[k].
already_done = len(generated_articles_from_polishing)

for i, prompt in tqdm(
    enumerate(articles_by_polishing_prompts[already_done:], start=already_done),
    total=len(articles),
    initial=already_done,
):
    generated = get_chat_completion([{"role": "user", "content": prompt}])
    if generated is None:
        # Do not persist a failed call: because resume is position-based, a
        # stored None would never be retried. Stop so a re-run picks up here.
        print(f"Generation failed for article {i}; stopping. Re-run this cell to resume.")
        break
    output_article = {
        "original_article": articles[i],
        "original_article_summary": summaries[i],
        "generated_article": generated,
    }
    generated_articles_from_polishing.append(output_article)
    # Append only the new record instead of rewriting the whole file on every
    # iteration; the resulting file is the same, without quadratic I/O and
    # without truncating earlier results if the run is interrupted mid-write.
    with open(file_path, "a", encoding="utf-8") as f:
        f.write(json.dumps(output_article, ensure_ascii=False) + "\n")

  0%|          | 15/3000 [00:00<?, ?it/s]