In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from groq import Groq
from time import sleep
import re
from sklearn.metrics import mean_absolute_error
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
import random
random.seed(42)

In [2]:
def dprint(s, debug):
    if debug:
        print(s)

In [None]:

YOUR_GROQ_API_KEY = ''  # Get from https://console.groq.com/keys
groq_client = Groq(api_key=YOUR_GROQ_API_KEY)

def call_groq_api(prompt, student_configs, pre_processing, post_processing, model='llama3-70b-8192', debug=False):
    prompt = pre_processing(prompt)
    
    groq_params = {
        'messages': [{'role': 'user', 'content': prompt}],
        'model': model,
        'max_tokens': student_configs.get('max_tokens', 512),
        'temperature': student_configs.get('temperature', 0.7),
        'top_p': student_configs.get('top_p', 0.7),
        'stop': student_configs.get('stop', None),
    }
    
    output = groq_client.chat.completions.create(**groq_params)
    
    dprint('*****prompt*****', debug)
    dprint(prompt, debug)
    dprint('*****result*****', debug)
    res = output.choices[0].message.content
    dprint(res, debug)
    dprint('*****output*****', debug)
    labels_only = post_processing(res)
    dprint('POST PROCESSED', debug)
    dprint(labels_only, debug)
    dprint('=========', debug)
    return labels_only

In [None]:
# model_names = [
#     'togethercomputer/llama-2-7b', #LLaMa-2-7B
#     'togethercomputer/llama-2-13b', #LLaMa-2-13B
#     'togethercomputer/llama-2-70b', #LLaMa-2-70B
#     'togethercomputer/llama-2-70b-chat', #LLaMa-2-70B-Chat
# ]

In [4]:
# Most likely current Groq model names:
model_names = [
    'llama-3.1-8b-instant',     
    "llama-3.3-70b-versatile",              
]



In [5]:
def get_train_df(topn = 10):
    train_df = pd.read_excel('train_split.xlsx')
    return train_df[:topn]


In [6]:
def get_test_set(topn=10):
    test_df = pd.read_excel('test_split.xlsx')
    return test_df.sample(topn)
    

In [7]:
def get_eval_df(topn = 5):
    eval_df = pd.read_excel('dev_split.xlsx')
    return eval_df.sample(topn)
    

In [8]:
def test_range(df, prompt_configs, prompt_prefix, examples, prompt_suffix,
               pre_processing=lambda x:x, post_processing=lambda y:y,
               model='llama-3.3-70b-versatile', debug=False):
    text_ids = []
    answers = []
    model_responses = []
    corrected_model_responses = []
    text_list = []
    
    for idx, row in tqdm(df.iterrows()):
        text_ids.append(idx)  # Use index as ID if no ID column
        fixed_prompt = row['text'] + "\n"
        text_list.append(row['text'])
        fixed_prompt = pre_processing(fixed_prompt)
        prompt = prompt_prefix + examples + fixed_prompt + prompt_suffix
        answer = row['label']
        answers.append(answer)
        model_response = call_groq_api(prompt, prompt_configs, pre_processing, lambda y:y, model=model, debug=debug) 
        corrected_model_response = post_processing(model_response)
        corrected_model_responses.append(corrected_model_response)
        model_responses.append(model_response)
        sleep(1)
    
    result_df = pd.DataFrame({
        'text_id': text_ids, 
        'text': text_list, 
        'model_responses': model_responses, 
        'corrected_model_responses': corrected_model_responses, 
        'true_label': answers
    })
    return result_df

In [50]:
data=get_eval_df(50)

## Zero Shot Prompt

In [10]:
def get_all_categories():
    train_df = pd.read_excel('train_split.xlsx')
    
    all_categories = sorted(train_df['label'].unique())
    print("Found categories:", all_categories)
    print("Total categories:", len(all_categories))
    
    return all_categories

categories = get_all_categories()

Found categories: ['business', 'education', 'finance', 'health', 'medical', 'shopping', 'social', 'sports', 'tech']
Total categories: 9


In [11]:
categories_list = get_all_categories()
categories_text = ", ".join(categories_list)

prompt_prefix_zs = f'''
Classify the following Combiend Arabic and English text into one of these categories: 
{categories_text}.

Your output should only be one of these exact category names:
{chr(10).join(categories_list)}
'''

prompt_examples_zs = "Input Text: "
prompt_suffix_zs = "Output: "


Found categories: ['business', 'education', 'finance', 'health', 'medical', 'shopping', 'social', 'sports', 'tech']
Total categories: 9


In [12]:
def your_pre_processing_zs(input_string):
    return re.sub(r"@user","", input_string).strip()

def your_post_processing_zs(output_string):
    output_clean = output_string.strip().lower()
    
    if 'business' in output_clean:
        return 'business'
    elif 'shopping' in output_clean:
        return 'shopping'
    elif 'finance' in output_clean:
        return 'finance'
    elif 'education' in output_clean:
        return 'education'
    elif 'tech' in output_clean :
        return 'tech'
    elif 'technology' in output_clean or "tech" in output_clean:
        return 'tech'
    elif 'sports' in output_clean:
        return 'sports'
    elif 'medical' in output_clean :
        return 'medical'
    elif  'social' in output_clean:
        return 'social'
    else:
        return output_string.strip()  

In [13]:
prompt_config_zs = {
    'max_tokens': 3,
    'temperature': 0.4,
    'top_p': 0.7,
    'stop': []
}

model = 'llama-3.3-70b-versatile'
print(model)

eval_df = get_eval_df(50)

llama-3.3-70b-versatile


In [51]:
results_df = test_range(data, prompt_config_zs, prompt_examples_zs, prompt_prefix_zs, prompt_suffix_zs, pre_processing=your_pre_processing_zs, post_processing=your_post_processing_zs, model=model, debug=False)
print(results_df)

50it [01:42,  2.06s/it]

    text_id                                               text  \
0      9953  تعتبر pharmacodynamics من العناصر الاساسيه لفه...   
1      9774  اذا كان لديك investment portfolio متنوع فمن ال...   
2      3361  في كل مره اتابع مباراه basketball اجد نفسي مشد...   
3      5123  كيف يمكن ان تسهم biotechnology في تطوير target...   
4      1735  لكي تنجح في risk management عليك استخدام data ...   
5      4228  اذا كنت تريد الفوز في basketball عليك التركيز ...   
6      3295  تعتمد public health علي التعليم الجيد حول اهمي...   
7      4993  التكيف مع market trends هو عنصر اساسي في نجاح ...   
8      1668  للوقايه من الامراض يجب علينا فهم اهميه vaccina...   
9      2229  ما هي اساليب pedagogy التي يمكن استخدامها لتعز...   
10     7344  اذا استثمرت في employee training programs ستلا...   
11     7431  هل تعتقد ان tennis يحتاج الي new regulations ل...   
12     1244  عندما نتحدث عن family values يجب ان ناخذ بعين ...   
13     9837  عندما تختار grocery shopping عبر الانترنت تاكد...   
14     200




In [52]:
sum(results_df['corrected_model_responses']==results_df['true_label'].values)

48

In [53]:
f1_score(results_df['true_label'], results_df['corrected_model_responses'],average='weighted')

0.9573333333333334

In [54]:
print(classification_report(results_df['true_label'], results_df['corrected_model_responses'], digits=4))

              precision    recall  f1-score   support

    business     1.0000    1.0000    1.0000        10
   education     1.0000    1.0000    1.0000         5
     finance     1.0000    1.0000    1.0000         5
      health     1.0000    0.6667    0.8000         6
     medical     0.8750    1.0000    0.9333         7
    shopping     1.0000    1.0000    1.0000         6
      social     1.0000    1.0000    1.0000         3
      sports     0.8750    1.0000    0.9333         7
        tech     1.0000    1.0000    1.0000         1

    accuracy                         0.9600        50
   macro avg     0.9722    0.9630    0.9630        50
weighted avg     0.9650    0.9600    0.9573        50



In [None]:
# test_df = get_test_set()
# results_df = test_range(test_df, prompt_config_zs, prompt_examples_zs, prompt_prefix_zs, prompt_suffix_zs, pre_processing=your_pre_processing_zs, post_processing=your_post_processing_zs, model=model, debug=False)
# results_df.to_excel('zsl_test_results.xlsx', index=False)

10it [00:18,  1.87s/it]


In [56]:
#joined_df = results_df.set_index("text_id").join(test_df.set_index('text'), lsuffix='_caller', rsuffix='_other')

In [None]:
# def is_codemixed(text):
#     return bool(re.search(r'[A-Za-z]', text)) and bool(re.search(r'[\u0600-\u06FF]', text))
# joined_df['codemixed'] = joined_df['text'].apply(is_codemixed)


In [None]:
# cm = joined_df.loc[joined_df['codemixed'] == 1]

In [59]:
# cm

In [58]:
# print(classification_report(cm['true_label'], cm['corrected_model_responses'], digits=4,zero_division=0))

In [57]:
# mono = joined_df.loc[joined_df['codemixed'] == 0]


In [24]:
#Empty 
#print(classification_report(mono['true_label'], mono['corrected_model_responses'], digits=4,zero_division=0))

## Few Shot Prompting (In Context Learning)
Useful to fix output format

In [19]:
train_df = get_train_df(50)

In [17]:
def create_example(row):
    line1 = "Input Text: " + row['text'] + "\n"
    label = row['label']  
    line2 = "Output: " + label + "\n"
    return line1 + line2

In [20]:
prompt_examples_icl = ""
for idx,row in train_df.iterrows():
    ex = create_example(row)
    prompt_examples_icl += ex

In [21]:
prompt_examples_icl = prompt_examples_icl + "Input Text: "

In [22]:
categories = get_all_categories()
categories_text = ", ".join(categories)

prompt_prefix_icl = f'''
Classify the following Combined Arabic and English text into one of these categories: 
{categories_text}.

Your output should only be one of these exact category names like in the examples below.
'''

prompt_suffix_icl = "Output: "


Found categories: ['business', 'education', 'finance', 'health', 'medical', 'shopping', 'social', 'sports', 'tech']
Total categories: 9


In [61]:
results_df = test_range(data, prompt_config_zs, prompt_examples_icl, prompt_prefix_icl, prompt_suffix_icl, pre_processing=your_pre_processing_zs, post_processing=your_post_processing_zs, model=model, debug=False)
print(results_df)

50it [06:17,  7.55s/it]

    text_id                                               text  \
0      9953  تعتبر pharmacodynamics من العناصر الاساسيه لفه...   
1      9774  اذا كان لديك investment portfolio متنوع فمن ال...   
2      3361  في كل مره اتابع مباراه basketball اجد نفسي مشد...   
3      5123  كيف يمكن ان تسهم biotechnology في تطوير target...   
4      1735  لكي تنجح في risk management عليك استخدام data ...   
5      4228  اذا كنت تريد الفوز في basketball عليك التركيز ...   
6      3295  تعتمد public health علي التعليم الجيد حول اهمي...   
7      4993  التكيف مع market trends هو عنصر اساسي في نجاح ...   
8      1668  للوقايه من الامراض يجب علينا فهم اهميه vaccina...   
9      2229  ما هي اساليب pedagogy التي يمكن استخدامها لتعز...   
10     7344  اذا استثمرت في employee training programs ستلا...   
11     7431  هل تعتقد ان tennis يحتاج الي new regulations ل...   
12     1244  عندما نتحدث عن family values يجب ان ناخذ بعين ...   
13     9837  عندما تختار grocery shopping عبر الانترنت تاكد...   
14     200




In [62]:
f1_score(results_df['true_label'], results_df['corrected_model_responses'],average='weighted')

0.9173095238095237

In [63]:
print(classification_report(results_df['true_label'], results_df['corrected_model_responses'], digits=4))

              precision    recall  f1-score   support

    business     0.9091    1.0000    0.9524        10
   education     1.0000    1.0000    1.0000         5
     finance     1.0000    0.6000    0.7500         5
      health     1.0000    0.6667    0.8000         6
     medical     0.7778    1.0000    0.8750         7
    shopping     1.0000    1.0000    1.0000         6
      social     1.0000    1.0000    1.0000         3
      sports     1.0000    1.0000    1.0000         7
        tech     0.5000    1.0000    0.6667         1

    accuracy                         0.9200        50
   macro avg     0.9097    0.9185    0.8938        50
weighted avg     0.9407    0.9200    0.9173        50



In [64]:
# test_df = get_test_set()
# results_df = test_range(test_df, prompt_config_zs, prompt_examples_icl, prompt_prefix_icl, prompt_suffix_icl, pre_processing=your_pre_processing_zs, post_processing=your_post_processing_zs, model=model, debug=False)
# results_df.to_excel('icl_test_results_50ex.xlsx', index=False)

In [None]:
# joined_df = results_df.set_index("text_id").join(test_df.set_index('text'), lsuffix='_caller', rsuffix='_other')

In [None]:
# joined_df['codemixed'] = joined_df['text'].apply(is_codemixed)


In [None]:
# cm = joined_df.loc[joined_df['codemixed'] == 1]

In [65]:
# print(classification_report(cm['true_label'], cm['corrected_model_responses'], digits=4))

In [66]:
# mono= joined_df.loc[joined_df['codemixed'] == 0]

In [67]:
# print(classification_report(mono['true_label'], mono['corrected_model_responses'], digits=4))

## Chain of Thought Prompting

In [68]:
business_texts = [
    "تطوير business strategy فعاله يعتمد علي فهم market trends بشكل شامل ودقيق",
    "الشركات الناجحة تضع خطط استراتيجية واضحة لتحقيق اهدافها في السوق"
]
justification_business = [
    "This text is about business because it discusses company strategies, market analysis, and commercial planning",
    "This belongs to business category as it focuses on organizational management and market competition"
]

education_texts = [
    "في حال تم تطبيق effective curriculum development سيزيد ذلك من مستوي student engagement في الفصول الدراسيه",
    "استخدام virtual classrooms يعزز فرص collaboration بين الطلاب من مختلف انحاء العالم"
]
justification_education = [
    "This text is about education because it mentions curriculum development and student engagement in classrooms",
    "This belongs to education category as it discusses virtual learning and student collaboration"
]

finance_texts = [
    "اذا كنت تريد النجاح في trading عليك ان تتجنب emotional decisions وتركز علي market analysis",
    "تعتبر fintech من المجالات الرايده حيث تقدم حلولا مبتكره لتحسين banking services"
]
justification_finance = [
    "This text is about finance because it covers trading strategies and market analysis",
    "This belongs to finance category as it discusses fintech innovations and banking services"
]

health_texts = [
    "اذا كنت ترغب في تحسين mental health يجب عليك ممارسه mindfulness بانتظام وتناول طعام صحي",
    "ممارسة الرياضة اليومية تساعد في الحفاظ على صحة القلب والجسم بشكل عام"
]
justification_health = [
    "This text is about health because it discusses mental health, mindfulness, and healthy eating",
    "This belongs to health category as it covers physical exercise and heart health"
]

medical_texts = [
    "تعتبر nutrition السليمه اساسا لنجاح اي medical treatment لذا يجب التركيز علي الاطعمه الغنيه بالفيتامينات",
    "هل يمكن ان توثر pharmaceutical advancements علي نسبه الشفاء من الامراض المزمنه بشكل كبير"
]
justification_medical = [
    "This text is about medical because it discusses nutrition in medical treatment and vitamins",
    "This belongs to medical category as it covers pharmaceutical advancements and chronic disease treatment"
]

shopping_texts = [
    "اذا كنت ترغب في شراء consumer electronics فمن الافضل دايما مراجعه customer reviews قبل اتخاذ قرارك",
    "مقارنة الاسعار بين المتاجر المختلفة تساعد في اتخاذ قرار شراء أفضل"
]
justification_shopping = [
    "This text is about shopping because it mentions consumer electronics and customer reviews",
    "This belongs to shopping category as it discusses price comparison and purchase decisions"
]

social_texts = [
    "تفاعل الافراد في المجتمع يساهم في بناء علاقات قوية وتحسين جودة الحياة الاجتماعية",
    "المبادرات المجتمعية تلعب دوراً هاماً في حل المشكلات الاجتماعية وتعزيز التكافل"
]
justification_social = [
    "This text is about social because it discusses community interactions and social relationships",
    "This belongs to social category as it covers community initiatives and social problem-solving"
]

sports_texts = [
    "اللاعبون المميزون في cricket يعرفون كيف يستخدمون strategic planning للفوز بالمباريات المهمه",
    "التدريب المستمر والالتزام بالبرنامج الرياضي اساسي لتحقيق النتائج في المسابقات"
]
justification_sports = [
    "This text is about sports because it mentions cricket players and strategic planning in games",
    "This belongs to sports category as it discusses continuous training and sports competitions"
]

tech_texts = [
    "اذا استثمرت الشركات في cloud computing ستتمكن من تحسين كفاءه عملياتها وتقليل التكاليف",
    "الذكاء الاصطناعي يساهم في تطوير قطاعات متعددة من الصناعة والخدمات"
]
justification_tech = [
    "This text is about tech because it covers cloud computing and operational efficiency",
    "This belongs to tech category as it discusses artificial intelligence and industry development"
]

In [69]:
import random

def create_examples(all_texts, all_justifications, all_labels):
    examples = []

    combined_data = list(zip(all_texts, all_justifications, all_labels))
    random.shuffle(combined_data)

    for text, justification, label in combined_data:
        line1 = "Input Text: " + text + "\n"
        justification_line = "Justification: " + justification + "\n"
        line2 = "Output: " + label + "\n"
        examples.append(line1 + justification_line + line2)

    return ''.join(examples)

In [70]:
all_topic_texts = (business_texts + education_texts + finance_texts + 
                   health_texts + medical_texts + shopping_texts + 
                   social_texts + sports_texts + tech_texts)

all_topic_justifications = (justification_business + justification_education + justification_finance +
                           justification_health + justification_medical + justification_shopping +
                           justification_social + justification_sports + justification_tech)

all_topic_labels = (['business'] * len(business_texts) + 
                   ['education'] * len(education_texts) +
                   ['finance'] * len(finance_texts) +
                   ['health'] * len(health_texts) +
                   ['medical'] * len(medical_texts) +
                   ['shopping'] * len(shopping_texts) +
                   ['social'] * len(social_texts) +
                   ['sports'] * len(sports_texts) +
                   ['tech'] * len(tech_texts))

In [71]:
prompt_examples_cot = create_examples(all_topic_texts, all_topic_justifications, all_topic_labels) + "Input Text: "

In [72]:
print(prompt_examples_cot)

Input Text: مقارنة الاسعار بين المتاجر المختلفة تساعد في اتخاذ قرار شراء أفضل
Justification: This belongs to shopping category as it discusses price comparison and purchase decisions
Output: shopping
Input Text: اللاعبون المميزون في cricket يعرفون كيف يستخدمون strategic planning للفوز بالمباريات المهمه
Justification: This text is about sports because it mentions cricket players and strategic planning in games
Output: sports
Input Text: اذا كنت ترغب في شراء consumer electronics فمن الافضل دايما مراجعه customer reviews قبل اتخاذ قرارك
Justification: This text is about shopping because it mentions consumer electronics and customer reviews
Output: shopping
Input Text: تعتبر nutrition السليمه اساسا لنجاح اي medical treatment لذا يجب التركيز علي الاطعمه الغنيه بالفيتامينات
Justification: This text is about medical because it discusses nutrition in medical treatment and vitamins
Output: medical
Input Text: التدريب المستمر والالتزام بالبرنامج الرياضي اساسي لتحقيق النتائج في المسابقات
Justifica

In [73]:
prompt_prefix_cot = f'''
Classify the following Combined Arabic and English text into one of these categories: 
{", ".join(categories)}.

Analyze the text and provide a justification before giving the final category.
Your output should include a justification and then the category name like in the examples below.
'''

prompt_suffix_cot = "Justification: "

In [74]:
def your_pre_processing_cot(input_string):
    return input_string.strip()  

def your_post_processing_cot(output_string):
    categories = ['business', 'education', 'finance', 'health', 'medical', 'shopping', 'social', 'sports', 'tech']
    output_lower = output_string.lower()
    
    for category in categories:
        if category in output_lower:
            return category
    
    return output_string.split()[0] if output_string.strip() else 'unknown'

In [75]:
prompt_config_cot = {
    'max_tokens': 80,  
    'temperature': 0.2, 
    'top_k': 0,       
    'top_p': 0.9,
    'repetition_penalty': 1,
    'stop': ['\nCategory:'] 
}
model = 'llama-3.3-70b-versatile'
print(model)

eval_df = get_eval_df(50)

llama-3.3-70b-versatile


In [78]:
results_df = test_range(data, prompt_config_cot, prompt_examples_cot, prompt_prefix_cot, prompt_suffix_cot, pre_processing=your_pre_processing_cot, post_processing=your_post_processing_cot, model=model, debug=False)
print(results_df)

50it [03:33,  4.27s/it]

    text_id                                               text  \
0      9953  تعتبر pharmacodynamics من العناصر الاساسيه لفه...   
1      9774  اذا كان لديك investment portfolio متنوع فمن ال...   
2      3361  في كل مره اتابع مباراه basketball اجد نفسي مشد...   
3      5123  كيف يمكن ان تسهم biotechnology في تطوير target...   
4      1735  لكي تنجح في risk management عليك استخدام data ...   
5      4228  اذا كنت تريد الفوز في basketball عليك التركيز ...   
6      3295  تعتمد public health علي التعليم الجيد حول اهمي...   
7      4993  التكيف مع market trends هو عنصر اساسي في نجاح ...   
8      1668  للوقايه من الامراض يجب علينا فهم اهميه vaccina...   
9      2229  ما هي اساليب pedagogy التي يمكن استخدامها لتعز...   
10     7344  اذا استثمرت في employee training programs ستلا...   
11     7431  هل تعتقد ان tennis يحتاج الي new regulations ل...   
12     1244  عندما نتحدث عن family values يجب ان ناخذ بعين ...   
13     9837  عندما تختار grocery shopping عبر الانترنت تاكد...   
14     200




In [79]:
results_df.to_excel('error_analysis.xlsx', index=False)

In [80]:
results_df['corrected_model_responses'] = results_df['model_responses'].apply(your_post_processing_cot)

In [81]:
f1_score(results_df['true_label'], results_df['corrected_model_responses'],average='weighted')

0.9192485292485293

In [82]:
print(classification_report(results_df['true_label'], results_df['corrected_model_responses']))

              precision    recall  f1-score   support

    business       0.91      1.00      0.95        10
   education       0.83      1.00      0.91         5
     finance       1.00      0.80      0.89         5
      health       0.71      0.83      0.77         6
     medical       1.00      0.71      0.83         7
    shopping       1.00      1.00      1.00         6
      social       1.00      1.00      1.00         3
      sports       1.00      1.00      1.00         7
        tech       1.00      1.00      1.00         1

    accuracy                           0.92        50
   macro avg       0.94      0.93      0.93        50
weighted avg       0.93      0.92      0.92        50



## CARP

In [83]:
category_features = {
    'business': ['company strategies', 'market analysis', 'commercial planning', 'business growth', 'market trends'],
    'education': ['learning', 'teaching', 'curriculum', 'students', 'school', 'education', 'classroom', 'virtual learning'],
    'finance': ['money', 'investment', 'trading', 'banking', 'financial', 'markets', 'fintech', 'economic'],
    'health': ['fitness', 'wellness', 'mental health', 'exercise', 'healthy lifestyle', 'physical health'],
    'medical': ['treatment', 'medicine', 'pharmaceutical', 'medical care', 'doctors', 'nutrition', 'vitamins'],
    'shopping': ['purchase', 'buying', 'products', 'reviews', 'price comparison', 'consumer', 'customer'],
    'social': ['community', 'relationships', 'social interactions', 'society', 'social life', 'community initiatives'],
    'sports': ['athletes', 'games', 'training', 'competitions', 'sports', 'cricket', 'players', 'matches'],
    'tech': ['technology', 'software', 'computers', 'AI', 'digital', 'innovation', 'cloud computing', 'artificial intelligence']
}

In [84]:
def create_carp_examples_format(all_texts, all_justifications, all_labels, category_features):

    examples = []
    combined_data = list(zip(all_texts, all_justifications, all_labels))
    
    example_subset = combined_data[:6] 
    
    for text, justification, label in example_subset:
        features = category_features.get(label, [])
        
        carp_justification = f"{justification} CARP Reasoning: Text contains features like {', '.join(features[:2])} confirming {label} category."
        
        line1 = "Input Text: " + text + "\n"
        justification_line = "Justification: " + carp_justification + "\n"
        line2 = "Output: " + label + "\n"
        examples.append(line1 + justification_line + line2)

    return ''.join(examples)

prompt_examples_carp = create_carp_examples_format(all_topic_texts, all_topic_justifications, all_topic_labels, category_features)

In [85]:
prompt_prefix_carp = f"""
Classify the following Combined Arabic and English text into one of these categories:
{", ".join(category_features.keys())}.

Use the CARP method internally (analyze topic, list expected features, verify features, decide), 
but **do NOT show your reasoning**.

Output format: Only write the final category name with no explanation, no extra text, and no punctuation.
"""
prompt_suffix_carp = "Justification: "

In [86]:
def your_pre_processing_carp(input_string):
    return input_string.strip()

def your_post_processing_carp(output_string):

    categories = list(category_features.keys())
    output_lower = output_string.lower()
    
    if "output:" in output_lower:
        output_part = output_lower.split("output:")[1].strip()
        for category in categories:
            if category in output_part.split()[0]:  
                return category
    
    for category in categories:
        if f" {category} " in f" {output_lower} ":
            return category
    
    return output_string.split()[0] if output_string.strip() else 'unknown'

In [87]:
prompt_config_carp = {
    'max_tokens': 150,  
    'temperature': 0.1,  
    'top_k': 0,       
    'top_p': 0.9,
    'repetition_penalty': 1.1, 
    'stop': ['\n---', 'Input Text:']
}

In [88]:
model = 'llama-3.3-70b-versatile'
eval_df = get_eval_df(50)


In [89]:
results_df = test_range(data, prompt_config_carp, prompt_examples_carp, prompt_prefix_carp, prompt_suffix_carp, pre_processing=your_pre_processing_carp, post_processing=your_post_processing_carp, model=model, debug=False)
print(results_df)

50it [01:47,  2.14s/it]

    text_id                                               text  \
0      9953  تعتبر pharmacodynamics من العناصر الاساسيه لفه...   
1      9774  اذا كان لديك investment portfolio متنوع فمن ال...   
2      3361  في كل مره اتابع مباراه basketball اجد نفسي مشد...   
3      5123  كيف يمكن ان تسهم biotechnology في تطوير target...   
4      1735  لكي تنجح في risk management عليك استخدام data ...   
5      4228  اذا كنت تريد الفوز في basketball عليك التركيز ...   
6      3295  تعتمد public health علي التعليم الجيد حول اهمي...   
7      4993  التكيف مع market trends هو عنصر اساسي في نجاح ...   
8      1668  للوقايه من الامراض يجب علينا فهم اهميه vaccina...   
9      2229  ما هي اساليب pedagogy التي يمكن استخدامها لتعز...   
10     7344  اذا استثمرت في employee training programs ستلا...   
11     7431  هل تعتقد ان tennis يحتاج الي new regulations ل...   
12     1244  عندما نتحدث عن family values يجب ان ناخذ بعين ...   
13     9837  عندما تختار grocery shopping عبر الانترنت تاكد...   
14     200




In [90]:
results_df.to_excel('carp_analysis.xlsx', index=False)

In [91]:
results_df['corrected_model_responses'] = results_df['model_responses'].apply(your_post_processing_carp)

In [92]:
f1_score(results_df['true_label'], results_df['corrected_model_responses'],average='weighted')

0.9181991341991342

In [93]:
print(classification_report(results_df['true_label'], results_df['corrected_model_responses']))

              precision    recall  f1-score   support

    business       1.00      1.00      1.00        10
   education       1.00      1.00      1.00         5
     finance       1.00      1.00      1.00         5
      health       0.62      0.83      0.71         6
     medical       1.00      0.57      0.73         7
    shopping       1.00      1.00      1.00         6
      social       1.00      1.00      1.00         3
      sports       0.88      1.00      0.93         7
        tech       1.00      1.00      1.00         1

    accuracy                           0.92        50
   macro avg       0.94      0.93      0.93        50
weighted avg       0.94      0.92      0.92        50



# self consistency

In [94]:
def majority_vote(predictions):
    from collections import Counter
    
    if not predictions:
        return 'unknown'    
    prediction_counts = Counter(predictions)
    most_common = prediction_counts.most_common()
    
    if len(most_common) == 1:
        return most_common[0][0]
    else:
        return most_common[0][0]

def self_consistency_post_processing(output_string):

    categories = ['business', 'education', 'finance', 'health', 'medical', 
                 'shopping', 'social', 'sports', 'tech']
    
    output_lower = output_string.lower()
    final_answer_patterns = [
        'final answer:',
        'answer:',
        'category:',
        'classification:'
    ]
    
    for pattern in final_answer_patterns:
        if pattern in output_lower:
            parts = output_lower.split(pattern, 1)
            if len(parts) > 1:
                answer_text = parts[1].strip()
                for category in categories:
                    if category in answer_text.split()[0] if answer_text else False:
                        return category
    
    for category in categories:
        if f" {category} " in f" {output_lower} ":
            return category
    
    return output_string.split()[0] if output_string.strip() else 'unknown'

In [95]:
def self_consistency_prompting(df, prompt_config, prompt_prefix, examples, prompt_suffix,
                             pre_processing=lambda x:x, post_processing=lambda y:y,
                             model='llama-3.3-70b-versatile', 
                             num_samples=3, 
                             debug=False):

    
    text_ids = []
    answers = []
    all_model_responses = []  
    final_predictions = []    
    text_list = []
    individual_predictions_list = []
    
    for idx, row in tqdm(df.iterrows()):
        text_ids.append(idx)
        fixed_prompt = row['text'] + "\n"
        text_list.append(row['text'])
        fixed_prompt = pre_processing(fixed_prompt)
        
        reasoning_prompt = prompt_prefix + examples + fixed_prompt + prompt_suffix
        
        answer = row['label']
        answers.append(answer)
        
        reasoning_samples = []
        predictions = []
        
        for sample_num in range(num_samples):
            sample_config = prompt_config.copy()
            sample_config['temperature'] = min(0.8, prompt_config.get('temperature', 0.7) + 0.1 * sample_num)
            
            model_response = call_groq_api(
                reasoning_prompt, 
                sample_config, 
                pre_processing, 
                lambda y: y, 
                model=model, 
                debug=debug
            )
            
            reasoning_samples.append(model_response)
            
            prediction = post_processing(model_response)
            predictions.append(prediction)
            
            sleep(0.5)  
        
        final_prediction = majority_vote(predictions)
        all_model_responses.append(reasoning_samples)
        individual_predictions_list.append(predictions)
        final_predictions.append(final_prediction)
        
        sleep(1)  
    
    result_df = pd.DataFrame({
        'text_id': text_ids, 
        'text': text_list, 
        'all_reasoning_samples': all_model_responses,  
        'individual_predictions': individual_predictions_list,
        'corrected_model_responses': final_predictions, 
        'true_label': answers
    })
    return result_df

In [96]:
def create_self_consistency_prompt():

    categories = get_all_categories()
    categories_text = ", ".join(categories)
    
    prompt_prefix_sc = f"""
Classify the following Combined Arabic and English text into one of these categories:
{categories_text}.

Analyze the text and decide the most suitable category.

Output format: Only write the final category name with no explanation, no extra text, and no punctuation.
"""

    
    examples_sc = '''
Example 1:
Input Text: تطوير business strategy فعاله يعتمد علي فهم market trends بشكل شامل
Reasoning: This text discusses business strategy development and market trends analysis, which are core business concepts. The keywords "business strategy" and "market trends" clearly indicate this belongs to business category.
Final Answer: business

Example 2:
Input Text: اذا كنت ترغب في تحسين mental health يجب عليك ممارسه mindfulness بانتظام
Reasoning: The text focuses on mental health improvement through mindfulness practice. Keywords like "mental health" and "mindfulness" are health-related topics about psychological well-being.
Final Answer: health

Example 3:
Input Text: استخدام machine learning في data analysis يحسن دقة التنبؤات
Reasoning: This text discusses machine learning and data analysis for improving prediction accuracy. These are technical concepts related to technology and artificial intelligence.
Final Answer: tech
'''
    
    prompt_suffix_sc = "Reasoning: "
    
    return prompt_prefix_sc, examples_sc, prompt_suffix_sc

prompt_config_sc = {
    'max_tokens': 150, 
    'temperature': 0.7,  
    'top_p': 0.9,
    'stop': ['\nInput Text:', 'Example:']  # Stop conditions
}

In [99]:

prompt_prefix_sc, prompt_examples_sc, prompt_suffix_sc = create_self_consistency_prompt()

model = 'llama-3.3-70b-versatile'
print(model)

eval_df = get_eval_df(50)  

results_df = self_consistency_prompting(
    data, 
    prompt_config_sc, 
    prompt_examples_sc, 
    prompt_prefix_sc, 
    prompt_suffix_sc,
    pre_processing=your_pre_processing_zs,
    post_processing=self_consistency_post_processing,
    model=model,
    num_samples=3, 
    debug=False
)

print(results_df[['text_id', 'text', 'corrected_model_responses', 'true_label']])

Found categories: ['business', 'education', 'finance', 'health', 'medical', 'shopping', 'social', 'sports', 'tech']
Total categories: 9
llama-3.3-70b-versatile


50it [06:54,  8.30s/it]

    text_id                                               text  \
0      9953  تعتبر pharmacodynamics من العناصر الاساسيه لفه...   
1      9774  اذا كان لديك investment portfolio متنوع فمن ال...   
2      3361  في كل مره اتابع مباراه basketball اجد نفسي مشد...   
3      5123  كيف يمكن ان تسهم biotechnology في تطوير target...   
4      1735  لكي تنجح في risk management عليك استخدام data ...   
5      4228  اذا كنت تريد الفوز في basketball عليك التركيز ...   
6      3295  تعتمد public health علي التعليم الجيد حول اهمي...   
7      4993  التكيف مع market trends هو عنصر اساسي في نجاح ...   
8      1668  للوقايه من الامراض يجب علينا فهم اهميه vaccina...   
9      2229  ما هي اساليب pedagogy التي يمكن استخدامها لتعز...   
10     7344  اذا استثمرت في employee training programs ستلا...   
11     7431  هل تعتقد ان tennis يحتاج الي new regulations ل...   
12     1244  عندما نتحدث عن family values يجب ان ناخذ بعين ...   
13     9837  عندما تختار grocery shopping عبر الانترنت تاكد...   
14     200




In [100]:
results_df.to_excel('self_analysis.xlsx', index=False)

In [101]:
f1_score(results_df['true_label'], results_df['corrected_model_responses'],average='weighted')

0.9822222222222221

In [102]:
print(classification_report(results_df['true_label'], results_df['corrected_model_responses']))

              precision    recall  f1-score   support

    business       1.00      1.00      1.00        10
   education       1.00      1.00      1.00         5
     finance       1.00      0.80      0.89         5
      health       1.00      1.00      1.00         6
     medical       1.00      1.00      1.00         7
    shopping       1.00      1.00      1.00         6
      social       1.00      1.00      1.00         3
      sports       1.00      1.00      1.00         7
        tech       0.50      1.00      0.67         1

    accuracy                           0.98        50
   macro avg       0.94      0.98      0.95        50
weighted avg       0.99      0.98      0.98        50



## Test Set Inference

In [None]:
test_df = get_test_set()

In [None]:
results_df = test_range(test_df, prompt_config_cot, prompt_examples_cot, prompt_prefix_cot, prompt_suffix_cot, pre_processing=your_pre_processing_cot, post_processing=your_post_processing_cot, model=model, debug=False)

10it [00:14,  1.50s/it]


In [None]:
results_df['corrected_model_responses'] = results_df['model_responses'].apply(your_post_processing_cot)

In [None]:
print(classification_report(results_df['true_label'], results_df['corrected_model_responses']))

              precision    recall  f1-score   support

    business       0.00      0.00      0.00         0
   education       1.00      1.00      1.00         1
      health       1.00      1.00      1.00         1
     medical       1.00      1.00      1.00         1
    shopping       1.00      1.00      1.00         1
      social       1.00      1.00      1.00         2
      sports       1.00      1.00      1.00         1
        tech       1.00      0.33      0.50         3

    accuracy                           0.80        10
   macro avg       0.88      0.79      0.81        10
weighted avg       1.00      0.80      0.85        10



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
results_df.to_excel('test_results.xlsx', index=False)

In [None]:
results_df = test_range(test_df, prompt_config_cot, prompt_examples_cot, prompt_prefix_cot, prompt_suffix_cot, pre_processing=your_pre_processing_cot, post_processing=your_post_processing_cot, model=model, debug=False)

8it [00:24,  3.03s/it]


RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.3-70b-versatile` in organization `org_01k77w45vhfccawyrk59k06nvc` service tier `on_demand` on tokens per day (TPD): Limit 100000, Used 99046, Requested 1041. Please try again in 1m15.168s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}

In [None]:
results_df['corrected_model_responses'] = results_df['model_responses'].apply(your_post_processing_cot)

In [None]:
results_df = pd.read_excel('test_results.xlsx')

In [None]:
print(classification_report(results_df['true_label'], results_df['corrected_model_responses'], digits=4))

              precision    recall  f1-score   support

   education     1.0000    0.5000    0.6667         2
      health     1.0000    1.0000    1.0000         5
     medical     1.0000    1.0000    1.0000         1
    shopping     1.0000    1.0000    1.0000         2
         عند     0.0000    0.0000    0.0000         0

    accuracy                         0.9000        10
   macro avg     0.8000    0.7000    0.7333        10
weighted avg     1.0000    0.9000    0.9333        10



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
joined_df = results_df.set_index("text_id").join(test_df.set_index('text'), lsuffix='_caller', rsuffix='_other')

In [None]:
joined_df['codemixed'] = joined_df['text'].apply(is_codemixed)
cm = joined_df.loc[joined_df['codemixed'] == 1]
mono = joined_df.loc[joined_df['codemixed'] == 0]

In [None]:
print(classification_report(mono['true_label'], mono['corrected_model_responses'], digits=4))

ValueError: max() iterable argument is empty

In [None]:
print(classification_report(cm['true_label'], cm['corrected_model_responses'], digits=4))

              precision    recall  f1-score   support

   education     1.0000    0.5000    0.6667         2
      health     1.0000    1.0000    1.0000         5
     medical     1.0000    1.0000    1.0000         1
    shopping     1.0000    1.0000    1.0000         2
         عند     0.0000    0.0000    0.0000         0

    accuracy                         0.9000        10
   macro avg     0.8000    0.7000    0.7333        10
weighted avg     1.0000    0.9000    0.9333        10



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## OOD Set Inference

In [None]:
# There is No Data for This
def get_ood_set(topn=10):
    ood_df = pd.read_excel('combined.xlsx')
    return ood_df.sample(topn)

In [None]:
ood_df = get_ood_set()

In [None]:
model = 'llama-3.1-8b-instant'

In [None]:
results_df = test_range(ood_df, prompt_config_cot, prompt_examples_cot, prompt_prefix_cot, prompt_suffix_cot, pre_processing=your_pre_processing_cot, post_processing=your_post_processing_cot, model=model, debug=False)

10it [00:48,  4.88s/it]


In [None]:
results_df['corrected_model_responses'] = results_df['model_responses'].apply(your_post_processing_cot)

In [None]:
print(classification_report(results_df['true_label'], results_df['corrected_model_responses']))

                precision    recall  f1-score   support

Justification:       0.00      0.00      0.00         0
     bussiness       0.00      0.00      0.00         1
     education       1.00      0.67      0.80         3
       finance       1.00      1.00      1.00         1
        health       0.00      0.00      0.00         1
      shopping       1.00      1.00      1.00         3
        social       1.00      1.00      1.00         1
           عند       0.00      0.00      0.00         0
         عندما       0.00      0.00      0.00         0

      accuracy                           0.70        10
     macro avg       0.44      0.41      0.42        10
  weighted avg       0.80      0.70      0.74        10



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
results_df = test_range(ood_df, prompt_config_cot, prompt_examples_cot, prompt_prefix_cot, prompt_suffix_cot, pre_processing=your_pre_processing_cot, post_processing=your_post_processing_cot, model=model, debug=False)

10it [01:39,  9.96s/it]


In [None]:
results_df['corrected_model_responses'] = results_df['model_responses'].apply(your_post_processing_cot)

In [None]:
results_df.to_excel('ood_results.xlsx', index=False)

In [None]:
results_df = pd.read_excel('ood_results.xlsx')

In [None]:
print(classification_report(results_df['true_label'], results_df['corrected_model_responses'], digits=4))

                precision    recall  f1-score   support

Justification:     0.0000    0.0000    0.0000         0
     bussiness     0.0000    0.0000    0.0000         1
     education     1.0000    1.0000    1.0000         3
       finance     1.0000    1.0000    1.0000         1
        health     0.0000    0.0000    0.0000         1
      shopping     1.0000    1.0000    1.0000         3
        social     0.0000    0.0000    0.0000         1
         عندما     0.0000    0.0000    0.0000         0

      accuracy                         0.7000        10
     macro avg     0.3750    0.3750    0.3750        10
  weighted avg     0.7000    0.7000    0.7000        10



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
joined_df = results_df.set_index('text_id').join(ood_df.set_index('text'), lsuffix='_caller', rsuffix='_other')

In [None]:
rgn = joined_df.loc[joined_df['domain']=='religion']
gen = joined_df.loc[joined_df['domain']=='gender']
ori = joined_df.loc[joined_df['domain']=='orientation']

KeyError: 'domain'

In [None]:
print(classification_report(rgn['offense_caller'], rgn['corrected_model_responses'], digits=4))
print(classification_report(gen['offense_caller'], gen['corrected_model_responses'], digits=4))
print(classification_report(ori['offense_caller'], ori['corrected_model_responses'], digits=4))


## ZSL OOD

In [None]:
results_df = test_range(ood_df, prompt_config_zs, prompt_examples_zs, prompt_prefix_zs, prompt_suffix_zs, pre_processing=your_pre_processing_zs, post_processing=your_post_processing_zs, model=model, debug=False)


In [None]:
results_df.to_csv('zsl_ood_results.csv', index=False)

In [None]:
joined_df = results_df.set_index('text_id').join(ood_df.set_index('id'), lsuffix='_caller', rsuffix='_other')

In [None]:
rgn = joined_df.loc[joined_df['domain']=='religion']
gen = joined_df.loc[joined_df['domain']=='gender']
ori = joined_df.loc[joined_df['domain']=='orientation']

In [None]:
print(classification_report(rgn['offense_caller'], rgn['corrected_model_responses'], digits=4))
print(classification_report(gen['offense_caller'], gen['corrected_model_responses'], digits=4))
print(classification_report(ori['offense_caller'], ori['corrected_model_responses'], digits=4))
print(classification_report(joined_df['offense_caller'], joined_df['corrected_model_responses'], digits=4))

## ICL OOD

In [None]:
results_df = test_range(ood_df, prompt_config_zs, prompt_examples_icl, prompt_prefix_icl, prompt_suffix_icl, pre_processing=your_pre_processing_zs, post_processing=your_post_processing_zs, model=model, debug=False)
results_df.to_csv('icl_ood_results_50ex.csv', index=False)

In [None]:
joined_df = results_df.set_index('text_id').join(ood_df.set_index('id'), lsuffix='_caller', rsuffix='_other')

In [None]:
rgn = joined_df.loc[joined_df['domain']=='religion']
gen = joined_df.loc[joined_df['domain']=='gender']
ori = joined_df.loc[joined_df['domain']=='orientation']

In [None]:
print(classification_report(rgn['offense_caller'], rgn['corrected_model_responses'], digits=4))
print(classification_report(gen['offense_caller'], gen['corrected_model_responses'], digits=4))
print(classification_report(ori['offense_caller'], ori['corrected_model_responses'], digits=4))
print(classification_report(joined_df['offense_caller'], joined_df['corrected_model_responses'], digits=4))