In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from groq import Groq
from time import sleep
import re
from sklearn.metrics import mean_absolute_error
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
import random
random.seed(42)

In [36]:
def dprint(s, debug):
    if debug:
        print(s)

In [None]:

YOUR_GROQ_API_KEY = ''  # Get from https://console.groq.com/keys
groq_client = Groq(api_key=YOUR_GROQ_API_KEY)

def call_groq_api(prompt, student_configs, pre_processing, post_processing, model='llama3-70b-8192', debug=False):
    prompt = pre_processing(prompt)
    
    groq_params = {
        'messages': [{'role': 'user', 'content': prompt}],
        'model': model,
        'max_tokens': student_configs.get('max_tokens', 512),
        'temperature': student_configs.get('temperature', 0.7),
        'top_p': student_configs.get('top_p', 0.7),
        'stop': student_configs.get('stop', None),
    }
    
    output = groq_client.chat.completions.create(**groq_params)
    
    dprint('*****prompt*****', debug)
    dprint(prompt, debug)
    dprint('*****result*****', debug)
    res = output.choices[0].message.content
    dprint(res, debug)
    dprint('*****output*****', debug)
    labels_only = post_processing(res)
    dprint('POST PROCESSED', debug)
    dprint(labels_only, debug)
    dprint('=========', debug)
    return labels_only

In [None]:
# model_names = [
#     'togethercomputer/llama-2-7b', #LLaMa-2-7B
#     'togethercomputer/llama-2-13b', #LLaMa-2-13B
#     'togethercomputer/llama-2-70b', #LLaMa-2-70B
#     'togethercomputer/llama-2-70b-chat', #LLaMa-2-70B-Chat
# ]

In [4]:
# Most likely current Groq model names:
model_names = [
    'llama-3.1-8b-instant',     
    "llama-3.3-70b-versatile",              
]



In [78]:
def get_train_df(topn = 10):
    train_df = pd.read_excel('train_split.xlsx')
    return train_df[:topn]


In [79]:
def get_test_set(topn=10):
    test_df = pd.read_excel('test_split.xlsx')
    return test_df.sample(topn)
    

In [20]:
def get_eval_df(topn = 5):
    eval_df = pd.read_excel('dev_split.xlsx')
    return eval_df.sample(topn)
    

In [97]:
def test_range(df, prompt_configs, prompt_prefix, examples, prompt_suffix,
               pre_processing=lambda x:x, post_processing=lambda y:y,
               model='llama-3.3-70b-versatile', debug=False):
    text_ids = []
    answers = []
    model_responses = []
    corrected_model_responses = []
    text_list = []
    
    for idx, row in tqdm(df.iterrows()):
        text_ids.append(idx)  # Use index as ID if no ID column
        fixed_prompt = row['text'] + "\n"
        text_list.append(row['text'])
        fixed_prompt = pre_processing(fixed_prompt)
        prompt = prompt_prefix + examples + fixed_prompt + prompt_suffix
        answer = row['label']
        answers.append(answer)
        model_response = call_groq_api(prompt, prompt_configs, pre_processing, lambda y:y, model=model, debug=debug) 
        corrected_model_response = post_processing(model_response)
        corrected_model_responses.append(corrected_model_response)
        model_responses.append(model_response)
        sleep(1)
    
    result_df = pd.DataFrame({
        'text_id': text_ids, 
        'text': text_list, 
        'model_responses': model_responses, 
        'corrected_model_responses': corrected_model_responses, 
        'true_label': answers
    })
    return result_df

## Zero Shot Prompt

In [49]:
def get_all_categories():
    train_df = pd.read_excel('train_split.xlsx')
    
    all_categories = sorted(train_df['label'].unique())
    print("Found categories:", all_categories)
    print("Total categories:", len(all_categories))
    
    return all_categories

categories = get_all_categories()

Found categories: ['business', 'education', 'finance', 'health', 'medical', 'shopping', 'social', 'sports', 'tech']
Total categories: 9


In [72]:
categories_list = get_all_categories()
categories_text = ", ".join(categories_list)

prompt_prefix_zs = f'''
Classify the following Combiend Arabic and English text into one of these categories: 
{categories_text}.

Your output should only be one of these exact category names:
{chr(10).join(categories_list)}
'''

prompt_examples_zs = "Input Text: "
prompt_suffix_zs = "Output: "


Found categories: ['business', 'education', 'finance', 'health', 'medical', 'shopping', 'social', 'sports', 'tech']
Total categories: 9


In [57]:
def your_pre_processing_zs(input_string):
    return re.sub(r"@user","", input_string).strip()

def your_post_processing_zs(output_string):
    output_clean = output_string.strip().lower()
    
    if 'business' in output_clean:
        return 'business'
    elif 'shopping' in output_clean:
        return 'shopping'
    elif 'finance' in output_clean:
        return 'finance'
    elif 'education' in output_clean:
        return 'education'
    elif 'tech' in output_clean :
        return 'tech'
    elif 'technology' in output_clean or "tech" in output_clean:
        return 'tech'
    elif 'sports' in output_clean:
        return 'sports'
    elif 'medical' in output_clean :
        return 'medical'
    elif  'social' in output_clean:
        return 'social'
    else:
        return output_string.strip()  

In [73]:
prompt_config_zs = {
    'max_tokens': 3,
    'temperature': 0.4,
    'top_p': 0.7,
    'stop': []
}

model = 'llama-3.3-70b-versatile'
print(model)

eval_df = get_eval_df(50)

llama-3.3-70b-versatile


In [74]:
results_df = test_range(eval_df, prompt_config_zs, prompt_examples_zs, prompt_prefix_zs, prompt_suffix_zs, pre_processing=your_pre_processing_zs, post_processing=your_post_processing_zs, model=model, debug=False)
print(results_df)

50it [01:43,  2.07s/it]

    text_id                                               text  \
0      1139  هل تعتقد ان fintech innovations ستغير الطريقه ...   
1      5759  عند اعداد corporate strategy يجب ان ناخذ بعين ...   
2       981  اذا كنت تستخدم discount codes يمكنك توفير الكث...   
3      6584  عندما تشتري عبر الانترنت يجب ان تبحث عن best d...   
4      5372  اذا تم اكتشاف مرض مبكرا يمكن ان يودي ذلك الي b...   
5       114  قم بتطوير مهاراتك في trading لتحسين فرص نجاحك ...   
6      4692  اذا لم تستثمر الشركات في digital marketing ستف...   
7      1848  في عالم medicine اليوم نحن بحاجه الي المزيد من...   
8      2280  هل جربت ecommerce platforms الجديده هناك الكثي...   
9      3087  لنحقق الاهداف يجب ان نعمل معا كفريق ونعتمد علي...   
10     2842  اذا كنت ترغب في تحسين nutrition يجب ان تتناول ...   
11     8183  يجب ان نستخدم social media platforms بشكل فعال...   
12     8206  عندما تجد exclusive offers في المتاجر يجب ان ت...   
13     2855  هل تعتقد ان blockchain technology ستغير طريقه ...   
14     802




In [75]:
sum(results_df['corrected_model_responses']==results_df['true_label'].values)

42

In [76]:
f1_score(results_df['true_label'], results_df['corrected_model_responses'],average='weighted')

0.8389865036923859

In [31]:
test_df = get_test_set()
results_df = test_range(test_df, prompt_config_zs, prompt_examples_zs, prompt_prefix_zs, prompt_suffix_zs, pre_processing=your_pre_processing_zs, post_processing=your_post_processing_zs, model=model, debug=False)
results_df.to_excel('zsl_test_results.xlsx', index=False)

10it [00:12,  1.24s/it]


In [32]:
print(classification_report(results_df['true_label'], results_df['corrected_model_responses'], digits=4))

              precision    recall  f1-score   support

   education     1.0000    1.0000    1.0000         1
     finance     1.0000    1.0000    1.0000         1
      health     0.5000    1.0000    0.6667         1
     medical     0.0000    0.0000    0.0000         2
    shopping     1.0000    1.0000    1.0000         1
      social     0.0000    0.0000    0.0000         0
      sports     0.0000    0.0000    0.0000         1
        tech     0.7500    1.0000    0.8571         3

    accuracy                         0.7000        10
   macro avg     0.5312    0.6250    0.5655        10
weighted avg     0.5750    0.7000    0.6238        10



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [46]:
joined_df = results_df.set_index("text_id").join(test_df.set_index('text'), lsuffix='_caller', rsuffix='_other')

In [48]:
def is_codemixed(text):
    return bool(re.search(r'[A-Za-z]', text)) and bool(re.search(r'[\u0600-\u06FF]', text))
joined_df['codemixed'] = joined_df['text'].apply(is_codemixed)


In [49]:
cm = joined_df.loc[joined_df['codemixed'] == 1]

In [53]:
cm

Unnamed: 0_level_0,text,model_responses,corrected_model_responses,true_label,label,codemixed
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
8343,عندما نستخدم fintech يجب ان نكون حذرين من data...,finance,finance,finance,,True
11960,اذا كنت ترغب في تحسين صحتك يجب ان تتناول balan...,health,health,health,,True
7816,هل تعتقد ان tennis يحتاج الي المزيد من female ...,social,social,sports,,True
15573,هل تعتقد ان استخدام virtual reality يمكن ان يس...,health,health,medical,,True
20563,تطبيقات cloud computing تجعل من السهل تطوير so...,tech,tech,tech,,True
19752,الذكاء الاصطناعي يمكن ان يساعد في تحسين softwa...,tech,tech,tech,,True
17119,هل تعتقد ان machine learning يمكن ان يساعد في ...,tech,tech,tech,,True
16890,اذا اردنا تحسين curriculum development يجب ان ...,education,education,education,,True
7792,اذا تم استخدام advanced surgical techniques فا...,tech,tech,medical,,True
8347,عند الذهاب الي retail stores يجب ان تتاكد من ا...,shopping,shopping,shopping,,True


In [54]:
print(classification_report(cm['true_label'], cm['corrected_model_responses'], digits=4,zero_division=0))

              precision    recall  f1-score   support

   education     1.0000    1.0000    1.0000         1
     finance     1.0000    1.0000    1.0000         1
      health     0.5000    1.0000    0.6667         1
     medical     0.0000    0.0000    0.0000         2
    shopping     1.0000    1.0000    1.0000         1
      social     0.0000    0.0000    0.0000         0
      sports     0.0000    0.0000    0.0000         1
        tech     0.7500    1.0000    0.8571         3

    accuracy                         0.7000        10
   macro avg     0.5312    0.6250    0.5655        10
weighted avg     0.5750    0.7000    0.6238        10



In [55]:
mono = joined_df.loc[joined_df['codemixed'] == 0]


In [None]:
#Empty 
#print(classification_report(mono['true_label'], mono['corrected_model_responses'], digits=4,zero_division=0))

ValueError: max() iterable argument is empty

## Few Shot Prompting (In Context Learning)
Useful to fix output format

In [80]:
train_df = get_train_df(50)

In [81]:
def create_example(row):
    line1 = "Input Text: " + row['text'] + "\n"
    label = row['label']  
    line2 = "Output: " + label + "\n"
    return line1 + line2

In [82]:
prompt_examples_icl = ""
for idx,row in train_df.iterrows():
    ex = create_example(row)
    prompt_examples_icl += ex

In [83]:
prompt_examples_icl = prompt_examples_icl + "Input Text: "

In [86]:
categories = get_all_categories()
categories_text = ", ".join(categories)

prompt_prefix_icl = f'''
Classify the following Combined Arabic and English text into one of these categories: 
{categories_text}.

Your output should only be one of these exact category names like in the examples below.
'''

prompt_suffix_icl = "Output: "


Found categories: ['business', 'education', 'finance', 'health', 'medical', 'shopping', 'social', 'sports', 'tech']
Total categories: 9


In [87]:
results_df = test_range(eval_df, prompt_config_zs, prompt_examples_icl, prompt_prefix_icl, prompt_suffix_icl, pre_processing=your_pre_processing_zs, post_processing=your_post_processing_zs, model=model, debug=False)
print(results_df)

50it [06:19,  7.59s/it]

    text_id                                               text  \
0      1139  هل تعتقد ان fintech innovations ستغير الطريقه ...   
1      5759  عند اعداد corporate strategy يجب ان ناخذ بعين ...   
2       981  اذا كنت تستخدم discount codes يمكنك توفير الكث...   
3      6584  عندما تشتري عبر الانترنت يجب ان تبحث عن best d...   
4      5372  اذا تم اكتشاف مرض مبكرا يمكن ان يودي ذلك الي b...   
5       114  قم بتطوير مهاراتك في trading لتحسين فرص نجاحك ...   
6      4692  اذا لم تستثمر الشركات في digital marketing ستف...   
7      1848  في عالم medicine اليوم نحن بحاجه الي المزيد من...   
8      2280  هل جربت ecommerce platforms الجديده هناك الكثي...   
9      3087  لنحقق الاهداف يجب ان نعمل معا كفريق ونعتمد علي...   
10     2842  اذا كنت ترغب في تحسين nutrition يجب ان تتناول ...   
11     8183  يجب ان نستخدم social media platforms بشكل فعال...   
12     8206  عندما تجد exclusive offers في المتاجر يجب ان ت...   
13     2855  هل تعتقد ان blockchain technology ستغير طريقه ...   
14     802




In [88]:
f1_score(results_df['true_label'], results_df['corrected_model_responses'],average='weighted')

0.8971145510835913

In [65]:
test_df = get_test_set()
results_df = test_range(test_df, prompt_config_zs, prompt_examples_icl, prompt_prefix_icl, prompt_suffix_icl, pre_processing=your_pre_processing_zs, post_processing=your_post_processing_zs, model=model, debug=False)
results_df.to_excel('icl_test_results_50ex.xlsx', index=False)

10it [00:12,  1.27s/it]


In [66]:
print(classification_report(results_df['true_label'], results_df['corrected_model_responses'], digits=4))

              precision    recall  f1-score   support

   education     0.0000    0.0000    0.0000         0
      health     0.7500    1.0000    0.8571         3
     medical     1.0000    0.5000    0.6667         2
    shopping     1.0000    1.0000    1.0000         3
      social     0.0000    0.0000    0.0000         1
      sports     1.0000    1.0000    1.0000         1

    accuracy                         0.8000        10
   macro avg     0.6250    0.5833    0.5873        10
weighted avg     0.8250    0.8000    0.7905        10



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [68]:
joined_df = results_df.set_index("text_id").join(test_df.set_index('text'), lsuffix='_caller', rsuffix='_other')

In [69]:
joined_df['codemixed'] = joined_df['text'].apply(is_codemixed)


In [70]:
cm = joined_df.loc[joined_df['codemixed'] == 1]

In [71]:
print(classification_report(cm['true_label'], cm['corrected_model_responses'], digits=4))

              precision    recall  f1-score   support

   education     0.0000    0.0000    0.0000         0
      health     0.7500    1.0000    0.8571         3
     medical     1.0000    0.5000    0.6667         2
    shopping     1.0000    1.0000    1.0000         3
      social     0.0000    0.0000    0.0000         1
      sports     1.0000    1.0000    1.0000         1

    accuracy                         0.8000        10
   macro avg     0.6250    0.5833    0.5873        10
weighted avg     0.8250    0.8000    0.7905        10



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [72]:
mono= joined_df.loc[joined_df['codemixed'] == 0]

In [73]:
print(classification_report(mono['true_label'], mono['corrected_model_responses'], digits=4))

ValueError: max() iterable argument is empty

## Chain of Thought Prompting

In [8]:
business_texts = [
    "تطوير business strategy فعاله يعتمد علي فهم market trends بشكل شامل ودقيق",
    "الشركات الناجحة تضع خطط استراتيجية واضحة لتحقيق اهدافها في السوق"
]
justification_business = [
    "This text is about business because it discusses company strategies, market analysis, and commercial planning",
    "This belongs to business category as it focuses on organizational management and market competition"
]

education_texts = [
    "في حال تم تطبيق effective curriculum development سيزيد ذلك من مستوي student engagement في الفصول الدراسيه",
    "استخدام virtual classrooms يعزز فرص collaboration بين الطلاب من مختلف انحاء العالم"
]
justification_education = [
    "This text is about education because it mentions curriculum development and student engagement in classrooms",
    "This belongs to education category as it discusses virtual learning and student collaboration"
]

finance_texts = [
    "اذا كنت تريد النجاح في trading عليك ان تتجنب emotional decisions وتركز علي market analysis",
    "تعتبر fintech من المجالات الرايده حيث تقدم حلولا مبتكره لتحسين banking services"
]
justification_finance = [
    "This text is about finance because it covers trading strategies and market analysis",
    "This belongs to finance category as it discusses fintech innovations and banking services"
]

health_texts = [
    "اذا كنت ترغب في تحسين mental health يجب عليك ممارسه mindfulness بانتظام وتناول طعام صحي",
    "ممارسة الرياضة اليومية تساعد في الحفاظ على صحة القلب والجسم بشكل عام"
]
justification_health = [
    "This text is about health because it discusses mental health, mindfulness, and healthy eating",
    "This belongs to health category as it covers physical exercise and heart health"
]

medical_texts = [
    "تعتبر nutrition السليمه اساسا لنجاح اي medical treatment لذا يجب التركيز علي الاطعمه الغنيه بالفيتامينات",
    "هل يمكن ان توثر pharmaceutical advancements علي نسبه الشفاء من الامراض المزمنه بشكل كبير"
]
justification_medical = [
    "This text is about medical because it discusses nutrition in medical treatment and vitamins",
    "This belongs to medical category as it covers pharmaceutical advancements and chronic disease treatment"
]

shopping_texts = [
    "اذا كنت ترغب في شراء consumer electronics فمن الافضل دايما مراجعه customer reviews قبل اتخاذ قرارك",
    "مقارنة الاسعار بين المتاجر المختلفة تساعد في اتخاذ قرار شراء أفضل"
]
justification_shopping = [
    "This text is about shopping because it mentions consumer electronics and customer reviews",
    "This belongs to shopping category as it discusses price comparison and purchase decisions"
]

social_texts = [
    "تفاعل الافراد في المجتمع يساهم في بناء علاقات قوية وتحسين جودة الحياة الاجتماعية",
    "المبادرات المجتمعية تلعب دوراً هاماً في حل المشكلات الاجتماعية وتعزيز التكافل"
]
justification_social = [
    "This text is about social because it discusses community interactions and social relationships",
    "This belongs to social category as it covers community initiatives and social problem-solving"
]

sports_texts = [
    "اللاعبون المميزون في cricket يعرفون كيف يستخدمون strategic planning للفوز بالمباريات المهمه",
    "التدريب المستمر والالتزام بالبرنامج الرياضي اساسي لتحقيق النتائج في المسابقات"
]
justification_sports = [
    "This text is about sports because it mentions cricket players and strategic planning in games",
    "This belongs to sports category as it discusses continuous training and sports competitions"
]

tech_texts = [
    "اذا استثمرت الشركات في cloud computing ستتمكن من تحسين كفاءه عملياتها وتقليل التكاليف",
    "الذكاء الاصطناعي يساهم في تطوير قطاعات متعددة من الصناعة والخدمات"
]
justification_tech = [
    "This text is about tech because it covers cloud computing and operational efficiency",
    "This belongs to tech category as it discusses artificial intelligence and industry development"
]

In [9]:
import random

def create_examples(all_texts, all_justifications, all_labels):
    examples = []

    combined_data = list(zip(all_texts, all_justifications, all_labels))
    random.shuffle(combined_data)

    for text, justification, label in combined_data:
        line1 = "Input Text: " + text + "\n"
        justification_line = "Justification: " + justification + "\n"
        line2 = "Output: " + label + "\n"
        examples.append(line1 + justification_line + line2)

    return ''.join(examples)

In [10]:
all_topic_texts = (business_texts + education_texts + finance_texts + 
                   health_texts + medical_texts + shopping_texts + 
                   social_texts + sports_texts + tech_texts)

all_topic_justifications = (justification_business + justification_education + justification_finance +
                           justification_health + justification_medical + justification_shopping +
                           justification_social + justification_sports + justification_tech)

all_topic_labels = (['business'] * len(business_texts) + 
                   ['education'] * len(education_texts) +
                   ['finance'] * len(finance_texts) +
                   ['health'] * len(health_texts) +
                   ['medical'] * len(medical_texts) +
                   ['shopping'] * len(shopping_texts) +
                   ['social'] * len(social_texts) +
                   ['sports'] * len(sports_texts) +
                   ['tech'] * len(tech_texts))

In [11]:
prompt_examples_cot = create_examples(all_topic_texts, all_topic_justifications, all_topic_labels) + "Input Text: "

In [12]:
print(prompt_examples_cot)

Input Text: مقارنة الاسعار بين المتاجر المختلفة تساعد في اتخاذ قرار شراء أفضل
Justification: This belongs to shopping category as it discusses price comparison and purchase decisions
Output: shopping
Input Text: اللاعبون المميزون في cricket يعرفون كيف يستخدمون strategic planning للفوز بالمباريات المهمه
Justification: This text is about sports because it mentions cricket players and strategic planning in games
Output: sports
Input Text: اذا كنت ترغب في شراء consumer electronics فمن الافضل دايما مراجعه customer reviews قبل اتخاذ قرارك
Justification: This text is about shopping because it mentions consumer electronics and customer reviews
Output: shopping
Input Text: تعتبر nutrition السليمه اساسا لنجاح اي medical treatment لذا يجب التركيز علي الاطعمه الغنيه بالفيتامينات
Justification: This text is about medical because it discusses nutrition in medical treatment and vitamins
Output: medical
Input Text: التدريب المستمر والالتزام بالبرنامج الرياضي اساسي لتحقيق النتائج في المسابقات
Justifica

In [13]:
prompt_prefix_cot = f'''
Classify the following Arabic and English text into one of these categories: 
{", ".join(categories)}.

Analyze the text and provide a justification before giving the final category.
Your output should include a justification and then the category name like in the examples below.
'''

prompt_suffix_cot = "Justification: "

NameError: name 'categories' is not defined

In [None]:
def your_pre_processing_cot(input_string):
    return input_string.strip()  

def your_post_processing_cot(output_string):
    categories = ['business', 'education', 'finance', 'health', 'medical', 'shopping', 'social', 'sports', 'tech']
    output_lower = output_string.lower()
    
    for category in categories:
        if category in output_lower:
            return category
    
    return output_string.split()[0] if output_string.strip() else 'unknown'

In [None]:
prompt_config_cot = {
    'max_tokens': 80,  
    'temperature': 0.2, 
    'top_k': 0,       
    'top_p': 0.9,
    'repetition_penalty': 1,
    'stop': ['\nCategory:'] 
}
model = 'llama-3.1-8b-instant'
print(model)

eval_df = get_eval_df(50)

llama-3.1-8b-instant


In [None]:
results_df = test_range(eval_df, prompt_config_cot, prompt_examples_cot, prompt_prefix_cot, prompt_suffix_cot, pre_processing=your_pre_processing_cot, post_processing=your_post_processing_cot, model=model, debug=False)
print(results_df)

50it [07:54,  9.49s/it]

    text_id                                               text  \
0      3679  قد يساعدنا تعزيز community development في تحسي...   
1      4089  كيف يمكننا استخدام artificial intelligence لتح...   
2      8338  اذا كان لديك خطه عمل واضحه يمكنك increase your...   
3      1334  هل تعتقد ان cultural trends الحاليه تعكس التغي...   
4       444  عندما يتحدث اللاعبون عن motivation يتوجب عليهم...   
5      4397  بينما تتطور computer networks يجب ان نكون علي ...   
6      3946  اذا كانت لديك معلومات جيده عن market forecasts...   
7      5017  كيف يمكن ان توثر blockchain technology علي نظا...   
8      5586  هل لديك خطه واضحه لتعزيز employee engagement ف...   
9      1429  يجب علي المطورين استخدام software development ...   
10     2472  من المهم ان نفهم كيف تعمل artificial intellige...   
11     4108  هل تعتقد ان healthcare access يجب ان يكون حقا ...   
12     2812  حينما تفكر في retirement planning عليك ان تاخذ...   
13     5897  اذا كنت تبحث عن fashion trends فلا تنس زياره ا...   
14     726




In [None]:
results_df.to_excel('error_analysis.xlsx', index=False)

In [None]:
results_df['corrected_model_responses'] = results_df['model_responses'].apply(your_post_processing_cot)

In [None]:
f1_score(results_df['true_label'], results_df['corrected_model_responses'],average='weighted')

0.9465672105672105

In [None]:
print(classification_report(results_df['true_label'], results_df['corrected_model_responses']))

                precision    recall  f1-score   support

Justification:       0.00      0.00      0.00         0
      business       1.00      1.00      1.00         7
     education       0.83      1.00      0.91         5
       finance       1.00      0.80      0.89         5
        health       0.86      1.00      0.92         6
       medical       1.00      0.67      0.80         6
      shopping       1.00      1.00      1.00         4
        social       1.00      1.00      1.00         5
        sports       1.00      1.00      1.00         6
          tech       1.00      1.00      1.00         6

      accuracy                           0.94        50
     macro avg       0.87      0.85      0.85        50
  weighted avg       0.97      0.94      0.95        50



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## CARP

In [89]:
category_features = {
    'business': ['company strategies', 'market analysis', 'commercial planning', 'business growth', 'market trends'],
    'education': ['learning', 'teaching', 'curriculum', 'students', 'school', 'education', 'classroom', 'virtual learning'],
    'finance': ['money', 'investment', 'trading', 'banking', 'financial', 'markets', 'fintech', 'economic'],
    'health': ['fitness', 'wellness', 'mental health', 'exercise', 'healthy lifestyle', 'physical health'],
    'medical': ['treatment', 'medicine', 'pharmaceutical', 'medical care', 'doctors', 'nutrition', 'vitamins'],
    'shopping': ['purchase', 'buying', 'products', 'reviews', 'price comparison', 'consumer', 'customer'],
    'social': ['community', 'relationships', 'social interactions', 'society', 'social life', 'community initiatives'],
    'sports': ['athletes', 'games', 'training', 'competitions', 'sports', 'cricket', 'players', 'matches'],
    'tech': ['technology', 'software', 'computers', 'AI', 'digital', 'innovation', 'cloud computing', 'artificial intelligence']
}

In [90]:
def create_carp_examples_format(all_texts, all_justifications, all_labels, category_features):

    examples = []
    combined_data = list(zip(all_texts, all_justifications, all_labels))
    
    example_subset = combined_data[:6] 
    
    for text, justification, label in example_subset:
        features = category_features.get(label, [])
        
        carp_justification = f"{justification} CARP Reasoning: Text contains features like {', '.join(features[:2])} confirming {label} category."
        
        line1 = "Input Text: " + text + "\n"
        justification_line = "Justification: " + carp_justification + "\n"
        line2 = "Output: " + label + "\n"
        examples.append(line1 + justification_line + line2)

    return ''.join(examples)

prompt_examples_carp = create_carp_examples_format(all_topic_texts, all_topic_justifications, all_topic_labels, category_features)

In [114]:
prompt_prefix_carp = f"""
Classify the following Combined Arabic and English text into one of these categories:
{", ".join(category_features.keys())}.

Use the CARP method internally (analyze topic, list expected features, verify features, decide), 
but **do NOT show your reasoning**.

Output format: Only write the final category name with no explanation, no extra text, and no punctuation.
"""
prompt_suffix_carp = "Justification: "

In [92]:
def your_pre_processing_carp(input_string):
    return input_string.strip()

def your_post_processing_carp(output_string):

    categories = list(category_features.keys())
    output_lower = output_string.lower()
    
    if "output:" in output_lower:
        output_part = output_lower.split("output:")[1].strip()
        for category in categories:
            if category in output_part.split()[0]:  
                return category
    
    for category in categories:
        if f" {category} " in f" {output_lower} ":
            return category
    
    return output_string.split()[0] if output_string.strip() else 'unknown'

In [94]:
prompt_config_carp = {
    'max_tokens': 150,  
    'temperature': 0.1,  
    'top_k': 0,       
    'top_p': 0.9,
    'repetition_penalty': 1.1, 
    'stop': ['\n---', 'Input Text:']
}

In [93]:
model = 'llama-3.3-70b-versatile'
eval_df = get_eval_df(50)


In [115]:
results_df = test_range(eval_df, prompt_config_carp, prompt_examples_carp, prompt_prefix_carp, prompt_suffix_carp, pre_processing=your_pre_processing_carp, post_processing=your_post_processing_carp, model=model, debug=False)
print(results_df)

50it [02:21,  2.82s/it]

    text_id                                               text  \
0      2728  هل تعتقد ان youth culture اليوم تتاثر اكثر بال...   
1      7946  اذا حصلت علي exclusive discounts يمكنك شراء ال...   
2      1794  عندما يتم تبني educational technology يمكن ان ...   
3      4389  تعتبر digital marketing اداه مهمه لجذب العملاء...   
4      7503  اذا استثمرت الشركات في big data technologies س...   
5      3955  عند اجراء surgery يعتبر استخدام minimally inva...   
6      2512  الطب الحديث يعتمد بشكل كبير علي advanced techn...   
7       687  في عالم cryptocurrency يجب ان نكون حذرين من ma...   
8      5283  لتحسين interpersonal relationships يجب ان نتبن...   
9       315  هل تعلم ان tennis يحتاج الي mental toughness ا...   
10     3777  لشراء افضل electronics يجب ان تقارن الاسعار عل...   
11     5503  اذا كنت ترغب في تحسين clinical outcomes يجب ال...   
12     1273  تعتبر data analytics اداه قويه في marketing st...   
13      699  عندما نتحدث عن blockchain يجب ان نفكر في كيفيه...   
14     788




In [112]:
results_df.to_excel('carp_analysis.xlsx', index=False)

In [100]:
results_df['corrected_model_responses'] = results_df['model_responses'].apply(your_post_processing_carp)

In [116]:
f1_score(results_df['true_label'], results_df['corrected_model_responses'],average='weighted')

0.8408880661821838

In [43]:
print(classification_report(results_df['true_label'], results_df['corrected_model_responses']))

              precision    recall  f1-score   support

    business       1.00      1.00      1.00         1
     finance       1.00      1.00      1.00         1
      health       0.00      0.00      0.00         0
     medical       0.00      0.00      0.00         1
    shopping       1.00      1.00      1.00         1
      sports       1.00      1.00      1.00         1

    accuracy                           0.80         5
   macro avg       0.67      0.67      0.67         5
weighted avg       0.80      0.80      0.80         5



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Test Set Inference

In [87]:
test_df = get_test_set()

In [88]:
results_df = test_range(test_df, prompt_config_cot, prompt_examples_cot, prompt_prefix_cot, prompt_suffix_cot, pre_processing=your_pre_processing_cot, post_processing=your_post_processing_cot, model=model, debug=False)

10it [01:16,  7.63s/it]


In [89]:
results_df['corrected_model_responses'] = results_df['model_responses'].apply(your_post_processing_cot)

In [90]:
print(classification_report(results_df['true_label'], results_df['corrected_model_responses']))

              precision    recall  f1-score   support

   education       1.00      0.50      0.67         2
      health       1.00      1.00      1.00         5
     medical       1.00      1.00      1.00         1
    shopping       1.00      1.00      1.00         2
         عند       0.00      0.00      0.00         0

    accuracy                           0.90        10
   macro avg       0.80      0.70      0.73        10
weighted avg       1.00      0.90      0.93        10



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [92]:
results_df.to_excel('test_results.xlsx', index=False)

In [93]:
results_df = test_range(test_df, prompt_config_cot, prompt_examples_cot, prompt_prefix_cot, prompt_suffix_cot, pre_processing=your_pre_processing_cot, post_processing=your_post_processing_cot, model=model, debug=False)

10it [00:49,  4.91s/it]


In [94]:
results_df['corrected_model_responses'] = results_df['model_responses'].apply(your_post_processing_cot)

In [95]:
results_df = pd.read_excel('test_results.xlsx')

In [96]:
print(classification_report(results_df['true_label'], results_df['corrected_model_responses'], digits=4))

              precision    recall  f1-score   support

   education     1.0000    0.5000    0.6667         2
      health     1.0000    1.0000    1.0000         5
     medical     1.0000    1.0000    1.0000         1
    shopping     1.0000    1.0000    1.0000         2
         عند     0.0000    0.0000    0.0000         0

    accuracy                         0.9000        10
   macro avg     0.8000    0.7000    0.7333        10
weighted avg     1.0000    0.9000    0.9333        10



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [98]:
joined_df = results_df.set_index("text_id").join(test_df.set_index('text'), lsuffix='_caller', rsuffix='_other')

In [99]:
joined_df['codemixed'] = joined_df['text'].apply(is_codemixed)
cm = joined_df.loc[joined_df['codemixed'] == 1]
mono = joined_df.loc[joined_df['codemixed'] == 0]

In [100]:
print(classification_report(mono['true_label'], mono['corrected_model_responses'], digits=4))

ValueError: max() iterable argument is empty

In [101]:
print(classification_report(cm['true_label'], cm['corrected_model_responses'], digits=4))

              precision    recall  f1-score   support

   education     1.0000    0.5000    0.6667         2
      health     1.0000    1.0000    1.0000         5
     medical     1.0000    1.0000    1.0000         1
    shopping     1.0000    1.0000    1.0000         2
         عند     0.0000    0.0000    0.0000         0

    accuracy                         0.9000        10
   macro avg     0.8000    0.7000    0.7333        10
weighted avg     1.0000    0.9000    0.9333        10



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## OOD Set Inference

In [103]:
# There is No Data for This
def get_ood_set(topn=10):
    ood_df = pd.read_excel('combined.xlsx')
    return ood_df.sample(topn)

In [104]:
ood_df = get_ood_set()

In [105]:
model = 'llama-3.1-8b-instant'

In [106]:
results_df = test_range(ood_df, prompt_config_cot, prompt_examples_cot, prompt_prefix_cot, prompt_suffix_cot, pre_processing=your_pre_processing_cot, post_processing=your_post_processing_cot, model=model, debug=False)

10it [00:48,  4.88s/it]


In [107]:
results_df['corrected_model_responses'] = results_df['model_responses'].apply(your_post_processing_cot)

In [108]:
print(classification_report(results_df['true_label'], results_df['corrected_model_responses']))

                precision    recall  f1-score   support

Justification:       0.00      0.00      0.00         0
     bussiness       0.00      0.00      0.00         1
     education       1.00      0.67      0.80         3
       finance       1.00      1.00      1.00         1
        health       0.00      0.00      0.00         1
      shopping       1.00      1.00      1.00         3
        social       1.00      1.00      1.00         1
           عند       0.00      0.00      0.00         0
         عندما       0.00      0.00      0.00         0

      accuracy                           0.70        10
     macro avg       0.44      0.41      0.42        10
  weighted avg       0.80      0.70      0.74        10



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [109]:
results_df = test_range(ood_df, prompt_config_cot, prompt_examples_cot, prompt_prefix_cot, prompt_suffix_cot, pre_processing=your_pre_processing_cot, post_processing=your_post_processing_cot, model=model, debug=False)

10it [01:39,  9.96s/it]


In [110]:
results_df['corrected_model_responses'] = results_df['model_responses'].apply(your_post_processing_cot)

In [111]:
results_df.to_excel('ood_results.xlsx', index=False)

In [113]:
results_df = pd.read_excel('ood_results.xlsx')

In [114]:
print(classification_report(results_df['true_label'], results_df['corrected_model_responses'], digits=4))

                precision    recall  f1-score   support

Justification:     0.0000    0.0000    0.0000         0
     bussiness     0.0000    0.0000    0.0000         1
     education     1.0000    1.0000    1.0000         3
       finance     1.0000    1.0000    1.0000         1
        health     0.0000    0.0000    0.0000         1
      shopping     1.0000    1.0000    1.0000         3
        social     0.0000    0.0000    0.0000         1
         عندما     0.0000    0.0000    0.0000         0

      accuracy                         0.7000        10
     macro avg     0.3750    0.3750    0.3750        10
  weighted avg     0.7000    0.7000    0.7000        10



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [115]:
joined_df = results_df.set_index('text_id').join(ood_df.set_index('text'), lsuffix='_caller', rsuffix='_other')

In [116]:
rgn = joined_df.loc[joined_df['domain']=='religion']
gen = joined_df.loc[joined_df['domain']=='gender']
ori = joined_df.loc[joined_df['domain']=='orientation']

KeyError: 'domain'

In [None]:
print(classification_report(rgn['offense_caller'], rgn['corrected_model_responses'], digits=4))
print(classification_report(gen['offense_caller'], gen['corrected_model_responses'], digits=4))
print(classification_report(ori['offense_caller'], ori['corrected_model_responses'], digits=4))


## ZSL OOD

In [None]:
results_df = test_range(ood_df, prompt_config_zs, prompt_examples_zs, prompt_prefix_zs, prompt_suffix_zs, pre_processing=your_pre_processing_zs, post_processing=your_post_processing_zs, model=model, debug=False)


In [None]:
results_df.to_csv('zsl_ood_results.csv', index=False)

In [None]:
joined_df = results_df.set_index('text_id').join(ood_df.set_index('id'), lsuffix='_caller', rsuffix='_other')

In [None]:
rgn = joined_df.loc[joined_df['domain']=='religion']
gen = joined_df.loc[joined_df['domain']=='gender']
ori = joined_df.loc[joined_df['domain']=='orientation']

In [None]:
print(classification_report(rgn['offense_caller'], rgn['corrected_model_responses'], digits=4))
print(classification_report(gen['offense_caller'], gen['corrected_model_responses'], digits=4))
print(classification_report(ori['offense_caller'], ori['corrected_model_responses'], digits=4))
print(classification_report(joined_df['offense_caller'], joined_df['corrected_model_responses'], digits=4))

## ICL OOD

In [None]:
results_df = test_range(ood_df, prompt_config_zs, prompt_examples_icl, prompt_prefix_icl, prompt_suffix_icl, pre_processing=your_pre_processing_zs, post_processing=your_post_processing_zs, model=model, debug=False)
results_df.to_csv('icl_ood_results_50ex.csv', index=False)

In [None]:
joined_df = results_df.set_index('text_id').join(ood_df.set_index('id'), lsuffix='_caller', rsuffix='_other')

In [None]:
rgn = joined_df.loc[joined_df['domain']=='religion']
gen = joined_df.loc[joined_df['domain']=='gender']
ori = joined_df.loc[joined_df['domain']=='orientation']

In [None]:
print(classification_report(rgn['offense_caller'], rgn['corrected_model_responses'], digits=4))
print(classification_report(gen['offense_caller'], gen['corrected_model_responses'], digits=4))
print(classification_report(ori['offense_caller'], ori['corrected_model_responses'], digits=4))
print(classification_report(joined_df['offense_caller'], joined_df['corrected_model_responses'], digits=4))