# WVQ Wave 7 - Data preprocessing process

This process is specifically for Tigrigna (Ethiopia), to process other language provide the country name, the language and the language code (abbreviation)

In [None]:
import pandas as pd

def preprocess_data_wave7(country_name, language_name, language_code):
    # Load dataset
    df = pd.read_csv(
        f'WVS_original_dataset/F00013128-WVS_Wave_7_{country_name}_Csv_v5.0.csv',
        delimiter=';' 
    )

    # Remove any duplicate columns
    df = df.loc[:, ~df.columns.duplicated()]

    # Rename metadata columns
    column_rename_map = {
        'LNGE_ISO': 'language',
        'B_COUNTRY': 'country',
        'B_COUNTRY_ALPHA': 'country_alpha',
    }

    # Question list (excluding Q260 and Q262)
    q_list = [
        '1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','27', '28', '29', '30', '31', '32', '33', '34', '35', '37', '38', '39', '40', '41','43','44','45', '122', '123', '124', '125', '126', '127', '128', '129', '132', '133', '134', '135', '136', '137', '138','142','143', '146', '147', '148','152','158', '159','160','161', '162', '169','170','224', '225', '226', '227', '228', '229', '230', '231', '232', '233','234','235'
    ]

    q_columns = [f'Q{num}' for num in q_list]
    required_columns = list(column_rename_map.keys()) + [col for col in q_columns if col in df.columns]

    # Subset and rename
    df_filtered = df[required_columns].copy()
    df_filtered.rename(columns=column_rename_map, inplace=True)
    df_filtered = df_filtered.loc[:, ~df_filtered.columns.duplicated()]

    # Group by country, country_alpha, and language, and calculate the average of the questions
    group_columns = ['country', 'country_alpha', 'language']
    aggregated_df = df_filtered.groupby(group_columns).mean(numeric_only=True).round().astype(int).reset_index()

    Tigrigna_df = aggregated_df[aggregated_df['language'] == language_code]

    Tigrigna_df.to_csv(f'data/{language_name}/WVQ_{language_name}.csv', index=False)

    print('success!')

preprocess_data_wave7('Ethiopia', 'Tigrigna', 'tig')

# WVS Wave 6 - Data preprocessing process

### WVS Wave 6 South Africa dataset

In [2]:
# Wave 6 South Africa
import pandas as pd

pd.options.display.max_columns = None
df = pd.read_csv('/Users/Jamie/AfricaLLM/WVS_original_dataset/WV6_Data_South_Africa_Csv_v20221117.1.csv', delimiter=';', index_col=False)

# Remove any duplicate columns
df = df.loc[:, ~df.columns.duplicated()]

# Rename metadata columns *before* using the new names
column_rename_map = {
    'C_COW_ALPHA': 'country_alpha',
    'V2': 'country',
    'V247': 'language'
}
df.rename(columns=column_rename_map, inplace=True)

# Now 'language' exists as a column
# Combine Northern and Southern Sotho entries by mapping both to 'Sotho'
df['language'] = df['language'].replace({
    3965: 'Sotho',  # Northern Sotho
    3985: 'Sotho'   # Southern Sotho
})

language_map = {
    40: 'Afrikaans',
    1240: 'English',
    3232: 'Ndebele',
    4090: 'Swazi',
    4335: 'Tsonga/Shangaan',
    4340: 'Tswana',
    4460: 'Venda',
    4570: 'Xhosa',
    4660: 'Zulu',
    9000: 'Other',
    'Sotho': 'Sotho'  # To keep 'Sotho' as-is
}
df['language'] = df['language'].map(language_map)

v_list = [
    '4','5','6','7','8','9','24','45','46','47','48','49','50','51','52','53','54','67','68','69','70','71','72','73','74','75','76','77','78','79','102','103','104','105','106','107','127','128','129','130','142','143','171','172','173','174','175','181','182','183','184','185','186','187','188','189','190','191','211','212','216','228A','228B','228C','228D','228E','228F','228G','228H','228I','228J','228K'
]

v_columns = [f'V{num}' for num in v_list]
required_columns = ['country_alpha', 'country', 'language'] + [col for col in v_columns if col in df.columns]

# Subset
df_filtered = df[required_columns].copy()
df_filtered = df_filtered.loc[:, ~df_filtered.columns.duplicated()]

# Replace invalid country codes
df_filtered['country_alpha'] = df_filtered['country_alpha'].replace({-4: 'ZAF'})

# Group and average, round to nearest int
group_columns = ['country', 'country_alpha', 'language']
aggregated_df = df_filtered.groupby(group_columns).mean(numeric_only=True).round().astype(int).reset_index()

aggregated_df.head(12)

sotho_df = aggregated_df[aggregated_df['language'] == 'Sotho']
afrikaans_df = aggregated_df[aggregated_df['language'] == 'Afrikaans']
tswana_df = aggregated_df[aggregated_df['language'] == 'Tswana']
xhosa_df = aggregated_df[aggregated_df['language'] == 'Xhosa']
zulu_df = aggregated_df[aggregated_df['language'] == 'Zulu']

sotho_df.to_csv(f'data/Sotho/WVQ_Sotho.csv', index=False)
afrikaans_df.to_csv(f'data/Afrikaans/WVQ_Afrikaans.csv', index=False)
tswana_df.to_csv(f'data/Tswana/WVQ_Tswana.csv', index=False)
xhosa_df.to_csv(f'data/Xhosa/WVQ_Xhosa.csv', index=False)
zulu_df.to_csv(f'data/Zulu/WVQ_Zulu.csv', index=False)

### WVS Wave 6 Rwanda dataset

In [None]:

import pandas as pd

pd.options.display.max_columns = None

# Load Rwanda data
df = pd.read_csv('WVS_original_dataset/WV6_Data_Rwanda_Csv_v20221117.1.csv', delimiter=';', index_col=False)

# Remove duplicate columns
df = df.loc[:, ~df.columns.duplicated()]

# Rename columns
column_rename_map = {
    'C_COW_ALPHA': 'country_alpha',
    'V2': 'country',
    'V247': 'language'
}

language_map = {
    605: 'Afrikaans',
    1240: 'English',
    3232: 'Ndebele',
    4090: 'Swazi',
    4335: 'Tsonga/Shangaan',
    4340: 'Tswana',
    4460: 'Venda',
    4570: 'Xhosa',
    4660: 'Zulu',
    9000: 'Other',
    'Sotho': 'Sotho'  # To keep 'Sotho' as-is
}

df.rename(columns=column_rename_map, inplace=True)

v_list = [
'24','45','46','47','48','49','50','51','52','53','54','67','68','69','70','71','72','73','74','75','76','77','78','79','102','103','104','105','106','107','127','128','129','130','142','143','171','172','173','174','175','181','182','183','184','185','186','187','188','189','190','191','211','212','216','228A','228B','228C','228D','228E','228F','228G','228H','228I','228J','228K'
]

v_columns = [f'V{num}' for num in v_list]
required_columns = ['country_alpha', 'country', 'language'] + [col for col in v_columns if col in df.columns]

# Filter and deduplicate columns
df_filtered = df[required_columns].copy()
df_filtered = df_filtered.loc[:, ~df_filtered.columns.duplicated()]

# Optionally, fix country code
df_filtered['country_alpha'] = df_filtered['country_alpha'].replace({-4: 'RWA'})

# Group and aggregate
group_columns = ['country', 'country_alpha', 'language']
aggregated_df = df_filtered.groupby(group_columns).mean(numeric_only=True).round().astype(int).reset_index()

Ndebele_df = aggregated_df[aggregated_df['language'] == 'Ndebele']

Ndebele_df.to_csv('WVQ_Ndebele.csv', index=False)


Unnamed: 0,country,country_alpha,language,V4,V5,V6,V7,V8,V9,V24,V45,V46,V47,V48,V49,V50,V51,V52,V53,V54,V66,V67,V68,V69,V70,V71,V72,V73,V74,V75,V76,V77,V78,V79,V102,V103,V104,V105,V106,V107,V127,V128,V129,V130,V142,V143,V144,V171,V172,V173,V174,V175,V176,V177,V178,V181,V182,V183,V184,V185,V186,V187,V188,V189,V190,V191,V211,V212,V216,V228A,V228B,V228C,V228D,V228E,V228F,V228G,V228H,V228I,V228J,V228K
0,646.0,RWA,1240.0,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,3,2,1,4,3,3,5,2,4,6,3,4,4,1,1,1,3,2,2,2,2,2,1,2,2,50000000,4,4,2,4,2,5,5,5,0,0,0,0,0,2,2,4,4,4,2,2,3,2,0,-1,-1,-1,-1,-1,-1,-1,-1,2,2
1,646.0,RWA,1400.0,1,2,2,2,2,2,2,2,2,2,2,2,3,2,3,3,2,1,3,2,2,3,3,3,4,3,3,3,3,3,3,2,2,2,2,2,3,2,3,3,2,2,2,44347833,2,3,3,3,3,4,4,5,1,1,1,1,1,2,2,2,3,3,2,1,2,2,2,3,2,2,2,2,2,2,3,2,2
2,646.0,RWA,2300.0,1,1,2,2,1,2,2,2,2,2,2,2,3,2,3,3,3,1,3,1,1,3,3,3,5,3,3,3,3,3,3,1,2,2,3,3,3,3,3,2,1,2,2,29283761,3,2,2,2,2,4,4,5,1,2,1,1,1,2,2,2,3,2,2,1,2,2,1,1,1,1,1,1,1,1,1,1,2
3,646.0,RWA,4075.0,1,1,2,3,1,2,2,2,2,2,2,2,3,3,3,3,2,1,3,1,2,3,3,3,4,3,3,4,4,3,3,1,2,2,3,3,3,3,3,2,1,2,2,15000000,3,2,2,3,2,5,4,5,1,2,1,1,1,2,2,2,3,2,2,1,1,2,1,2,2,1,2,2,1,2,2,2,2


### WVS Wave 6 Ghana dataset

In [6]:
# Wave 6 Ghana
import pandas as pd

pd.options.display.max_columns = None

# Load Rwanda data
df = pd.read_csv('WVS_original_dataset/WV6_Data_Ghana_Csv_v20221117.1.csv', delimiter=';', index_col=False)

# Remove duplicate columns
df = df.loc[:, ~df.columns.duplicated()]

# Rename columns
column_rename_map = {
    'C_COW_ALPHA': 'country_alpha',
    'V2': 'country',
    'V247': 'language'
}

df.rename(columns=column_rename_map, inplace=True)

v_list = [
    '4','5','6','7','8','9','24','45','46','47','48','49','50','51','52','53','54','67','68','69','70','71','72','73','74','75','76','77','78','79','102','103','104','105','106','107','127','128','129','130','142','143','171','172','173','174','175','181','182','183','184','185','186','187','188','189','190','191','211','212','216','228A','228B','228C','228D','228E','228F','228G','228H','228I','228J','228K'
]

language_map = {
    605: 'Twi',
    1080: 'Ewe', 
}

df['language'] = df['language'].map(language_map)

v_columns = [f'V{num}' for num in v_list]
required_columns = ['country_alpha', 'country', 'language'] + [col for col in v_columns if col in df.columns]

# Filter and deduplicate columns
df_filtered = df[required_columns].copy()
df_filtered = df_filtered.loc[:, ~df_filtered.columns.duplicated()]

# Optionally, fix country code
df_filtered['country_alpha'] = df_filtered['country_alpha'].replace({-4: 'GHA'})

# Group and aggregate
group_columns = ['country', 'country_alpha', 'language']
aggregated_df = df_filtered.groupby(group_columns).mean(numeric_only=True).round().astype(int).reset_index()

ewe_df = aggregated_df[aggregated_df['language'] == 'Ewe']
twi_df = aggregated_df[aggregated_df['language'] == 'Twi']

ewe_df.to_csv('WVQ_Ewe.csv', index=False)
twi_df.to_csv('WVQ_Twi.csv', index=False)

# Data Augumentation - CultureLLM's semantic data augumentation
you need OpenAI API key for it to work

In [None]:
import os
import random
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import jsonlines
import re
import time

from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()

os.environ["TOKENIZERS_PARALLELISM"] = "false"

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def getResponse(prompt, model_text):
    msg = [{"role": "user", "content": prompt}]
    print('Msg: ', msg)
    client = OpenAI(api_key="OPENAI_API_KEY")
    times = 0

    output = None
    while output is None and times <= 10:
        try:
            times += 1  
            response = client.chat.completions.create(
                model=model_text,
                messages=msg,
                temperature=0.7
                )
            output = response.choices[0].message.content
        except Exception as e:
            print(e)
            print('Retrying...')
            time.sleep(5)
    if times >= 10:
        print('Failed! Model Input: ', prompt)
        output = ''

    return output

def getPrompt(s, n):
    prompt = ("Could you generate " + str(n) 
                + " sentences that (1) of different sentence structures and (2) of the same meaning with the following sentence: "
                + s # need n
                + ". Please number the generated sentences from 1 to " + str(n) + "."
                )
    return prompt

def getSynonymsPrompt(word, w_class):
    prompt = (f"Please generate 5 Synonyms for the word: {word}."
                + f"This is {w_class}" # need n
                + ". Please number the generated sentences from 1 to 5."
                )
    return prompt

def postProcess(s, model):
    if model == 'gpt4':
        s_list = s.split('\n')
        new_s_list = []
        for item in s_list:
            index = item.find('.')
            item = item[index+1:].strip()
            new_s_list.append(item)
    elif model == 'llama2':
        s_list = s.split('\n')[1:]
        new_s_list = []
        for item in s_list:
            nums = re.findall(r"\d+", item)
            if len(nums) > 0:
                index = item.find('.')
                item = item[index+1:].strip()
                new_s_list.append(item)
            else:
                new_s_list.append(item)

    return new_s_list

def calculate_similarity(sentence1, sentence2):
    def get_sentence_embedding(sentence):
        inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
            sentence_embedding = torch.mean(outputs.last_hidden_state, dim=1)
        return sentence_embedding
    
    embedding1 = get_sentence_embedding(sentence1)
    embedding2 = get_sentence_embedding(sentence2)
    similarity = cosine_similarity(embedding1, embedding2)
    return similarity

def is_short_sentence(sentence, min_words=2):
    return len(sentence.split()) < min_words


def sentence_filter(seed_s, sent_new, T=0.80):
    
    if calculate_similarity(seed_s, sent_new) > T:
        return 1
    return 0

def run(n=5, m=10):
    num = 0
    with open("data/WVQ6.jsonl", "r+", encoding="utf8") as f:
        for row in jsonlines.Reader(f):
            ori_content = row['q_content']
            s = row['q_content']
            print('Num: ', num)

            num += 1
            print(s)

            times = 0
            filtered_s_list = []
            new_prompt = getPrompt(s, n)
            while len(filtered_s_list) < n and times < 10:
                times += 1
                output = getResponse(new_prompt, 'gpt-4')
                s_list = postProcess(output, 'gpt4')
                print(s_list)
                for item in s_list:
                    if sentence_filter(s, item) == 1:
                        filtered_s_list.append(item)

            filtered_s_list = filtered_s_list[:n]
            print(len(filtered_s_list))
            print('Filtered S List: ', filtered_s_list)

            # Step 2: replace with synonyms
            final_sentences = []
            if len(filtered_s_list) == 0:
                filtered_s_list = [s]
            cur_len = int(m / len(filtered_s_list))
            print('Cur len: ', cur_len)
            for sentence in filtered_s_list:
                cur_list = []
                ori_words = []
                w_synonyms = []
                for sent in nltk.sent_tokenize(sentence):
                    tagged_words = nltk.pos_tag(nltk.word_tokenize(sent))
                    for word, tag in tagged_words:
                        if tag in ['NN', 'NNS', 'NNP', 'JJ', 'RB', 'VB', 'VBD', 'VBG', 'VBN']:
                            ori_words.append((word, tag))
                for i in range(10):
                    if i >= 5 and len(cur_list) > cur_len:
                        break
                    new_sentence = sentence
                    for j in range(len(ori_words)):
                        word = ori_words[j]
                        if len(w_synonyms) < j + 1:
                            prompt = getSynonymsPrompt(word[0], word[1])
                            # output = get_response_from_llm('gpt4', [prompt])
                            output = getResponse(prompt, 'gpt-4')
                            # print('Sys: ', output)
                            synonyms = postProcess(output, 'gpt4')
                            if len(synonyms) > 0 and '.' not in synonyms[0]:
                                print('Synonyms: ', synonyms)
                                w_synonyms.append(synonyms)
                            else:
                                continue
                        if len(w_synonyms) > j:
                            synonyms = w_synonyms[j]
                            i = random.randint(0, len(synonyms) + 1)
                            if i < len(synonyms):
                                new_sentence = new_sentence.replace(word[0], synonyms[i].lower())
                                if sentence_filter(sentence, new_sentence) == 1:
                                    cur_list.append(new_sentence)
                
                cur_list = list(set(cur_list))
                sim = [calculate_similarity(s, new_s) for new_s in cur_list]
                cur_list_f = [x for _, x in sorted(zip(sim, cur_list), reverse=True)][: cur_len]
                # cur_list_f = [x for _, x in sorted(zip(sim, cur_list))][: cur_len]
                final_sentences.extend(cur_list_f)                       
            print(len(final_sentences))
            # final_sentences = filtered_s_list
            print(final_sentences)
            item = row
            item['ori_content'] = ori_content

            path = 'data/new_WVQ6.jsonl'
            with jsonlines.open(path, mode='a') as writer:
                for new_s in final_sentences:
                    item['q_content'] = new_s
                    writer.write(item)

if __name__ == '__main__':
    run()

[nltk_data] Downloading package punkt to /Users/Jamie/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/Jamie/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/Jamie/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/Jamie/nltk_data...
[nltk_data]   Package words is already up-to-date!


Num:  0
Generally speaking, would you say that most people can be trusted or that you need to be very careful in dealing with people? 
Msg:  [{'role': 'user', 'content': 'Could you generate 5 sentences that (1) of different sentence structures and (2) of the same meaning with the following sentence: Generally speaking, would you say that most people can be trusted or that you need to be very careful in dealing with people? . Please number the generated sentences from 1 to 5.'}]
['In your opinion, is it safe to trust most people or is it more prudent to exercise caution when interacting with them?', "Would you generally agree that people are trustworthy, or do you think it's necessary to be highly cautious when dealing with them?", "Do you believe that, as a rule, most people are reliable, or do you feel it's critical to remain guarded when dealing with individuals?", "Is it your general view that it's safe to trust the majority of people, or should one always be very careful in their i

## Generate Finetune Dataset with Reasoning
you need OpenAI api key for this to work

In [None]:
import jsonlines
import re, random, os, csv
from openai import OpenAI # Import the OpenAI library
from dotenv import load_dotenv

load_dotenv()

q_list = [
    '1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','27','28', '29', '30', '31', '32', '33', '34', '35', '37', '38', '39', '40', '41','43','44','45', '122', '123', '124', '125', '126', '127', '128', '129', '132', '133', '134', '135', '136', '137', '138','142','143', '146', '147', '148','152','158', '159','160','161', '162', '169','170','224', '225', '226', '227', '228', '229', '230', '231', '232', '233','234','235'
]

# Initialize OpenAI client (make sure your OPENAI_API_KEY is set in environment variables)
client = OpenAI()
def get_reasoning_from_gpt4o(user_prompt, country, language, answer, option_text):
    """
    Calls GPT-4o to generate reasoning for a given answer in the specified format.
    """
    system_message = (
        f"You are a helpful and insightful chatbot that provides clear and concise reasoning. "
        f"You are acting as a {country} citizen who speaks {language} and understands the {country} people and their culture very well. "
        f"When asked a question, you will provide a numerical answer and then explain your reasoning based on your persona's perspective. "
        f"The answer must be exactly '{answer}'. Your task is to provide the reasoning for why a {country} person who speaks {language} might give this answer."
    )
    
    # Craft a specific prompt for GPT-4o to generate reasoning in the required format
    reasoning_prompt = (
        f"Based on the following question and the provided answer, explain the reasoning behind this answer from the perspective of a {country} citizen: "
        f"\n\nQuestion: {user_prompt}"
        f"\n\nAnswer: {answer}"
        f"\n\nFormat your response EXACTLY like this example:"
        f"\n\n**1. Very important**"
        f"\n\nIn Hausa culture, family (iyali) is the foundation of society, providing economic support, childcare, and social security through extended family networks. Family determines one's identity, social status, and cultural values. Religious teachings emphasize honoring family as a fundamental duty. The extended family system creates mutual obligations and collective decision-making that are essential for survival and wellbeing. Therefore, family is considered \"very important\" in Hausa-Nigerian culture."
        f"\n\nNow format your response as:"
        f"\n**{answer}. {option_text}**"
        f"\n\nThen provide a single paragraph explaining the cultural reasoning behind this choice. "
        f"Use specific cultural references, traditions, values, and social structures from {country} culture. "
        f"Keep the explanation concise but comprehensive, around 3-4 sentences."
    )
    
    try:
        response = client.chat.completions.create(
            model="gpt-4.1", # Use GPT-4o
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": reasoning_prompt}
            ],
            max_tokens=300, # Increased slightly for the formatted response
            temperature=0.7 # Adjust temperature for creativity/determinism in reasoning
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error generating reasoning with GPT-4o: {e}")
        return f"**{answer}. {option_text}**\n\nReasoning could not be generated." # Fallback in case of API error

def getPrompt(item, t, hasContext=False):
    content = item['q_content']
    option = item['option']
    nums = re.findall(r"\d+",option)
    if '?' in content:
        prompt = f"Give me the answer from {min(nums)} to {max(nums)}: {content} {option}. You can only choose one option."
    else:
        prompt = f"Give me the answer from {min(nums)} to {max(nums)}: Do you agree with {content}? {option}. You can only choose one option."
 
    return prompt

def extract_option_text(option_string, answer_num):
    """
    Extract the text corresponding to the answer number from the option string.
    """
    # Parse the option string to extract the text for the given answer
    options = re.findall(r'(\d+)\.\s*([^0-9]+?)(?=\d+\.|$)', option_string)
    for num, text in options:
        if int(num) == int(answer_num):
            return text.strip()
    return "Answer option"  # Fallback if not found

def generateFinetuneData(country, language):
    dir_path = f"data/{language}/Finetune"
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    
    with jsonlines.open(f"{dir_path}/WVQ_{language}.jsonl", "w") as writer:
        with open(f'data/{language}/WVQ_{language}.csv', encoding='utf-8-sig') as f:
            csv_reader = csv.DictReader(f, skipinitialspace=True)
            all_answers = []
            for row in csv_reader:
                respondent_answers = {'country': row['country'], 'country_alpha': row['country_alpha'], 'language': row['language']}
                for q in q_list:
                    k = 'Q' + q
                    respondent_answers[k] = int(float(row[k]))
                all_answers.append(respondent_answers)
        
        # Dictionary to store generated reasonings by q_id for reuse
        generated_reasonings = {}
        
        with open("data/WVQ.jsonl", "r", encoding="utf8") as question_file:
            questions = list(jsonlines.Reader(question_file))
            for ans_item in all_answers:
                t = 0
                for item in questions:
                    prompt = getPrompt(item, t)
                    ans = ans_item['Q' + item['q_id']]
                    ans = abs(ans)
                    
                    # Extract the option text for this answer
                    option_text = extract_option_text(item['option'], ans)
                    
                    # --- Generate reasoning using GPT-4o and store it ---
                    reasoning = get_reasoning_from_gpt4o(prompt, country, language, str(ans), option_text)
                    generated_reasonings[item['q_id']] = reasoning
                    # -------------------------------------
                    
                    system_msg = (
                        f"You are a {country} chatbot that speaks {language} and understands the {country} people very well. "
                    )
                    
                    # Use the formatted reasoning directly
                    assistant_content = reasoning
                    
                    new_item = {
                        "messages": [
                            {"role": "system", "content": system_msg},
                            {"role": "user", "content": prompt},
                            {"role": "assistant", "content": assistant_content}
                        ]
                    }
                    writer.write(new_item)
                    t += 1
        
        # Process new_WVQ6.jsonl using the SAME generated reasonings from WVQ6
        with open("data/new_WVQ.jsonl", "r", encoding="utf8") as f:
            new_questions = list(jsonlines.Reader(f))
            
            # Process each respondent's answers for new_WVQ6
            for ans_item in all_answers:
                for item in new_questions:
                    prompt = getPrompt(item, 0)  # t doesn't matter here
                    q_id = item['q_id']
                    
                    # Use the SAME reasoning generated for this q_id from WVQ6
                    if q_id in generated_reasonings:
                        assistant_content = generated_reasonings[q_id]
                    else:
                        # Fallback if q_id not found (shouldn't happen if data is consistent)
                        ans = ans_item['Q' + q_id]
                        if ans < 0:
                            ans = 0 - ans
                        option_text = extract_option_text(item['option'], ans)
                        assistant_content = f"**{ans}. {option_text}**\n\nReasoning not available for this question."
                    
                    system_msg = (
                        f"You are a {country} chatbot that speaks {language} and understands the {country} people very well. "
                    )
                    
                    new_item = {
                        "messages": [
                            {"role": "system", "content": system_msg},
                            {"role": "user", "content": prompt},
                            {"role": "assistant", "content": assistant_content}
                        ]
                    }
                    writer.write(new_item)
    
    print(f'Fine-tuning data generated for {country} ({language}) with GPT-4o reasoning!')


# Your calls to generate data
# generateFinetuneData('Ethiopian','Tigrigna')
# generateFinetuneData('Ethiopian', 'Amharic')
# generateFinetuneData('Ethiopian', 'Oromo')

# generateFinetuneData('Nigerian', 'Hausa')
# generateFinetuneData('Nigerian', 'Igbo')
# generateFinetuneData('Nigerian', 'Yoruba')

# generateFinetuneData('Kenyan', 'Swahili')

# generateFinetuneData('Zimbabwean', 'Shona')
# generateFinetuneData('Zimbabwean', 'Ndebele')

Fine-tuning data generated for Ethiopian (Tigrigna) with GPT-4o reasoning!
Fine-tuning data generated for Ethiopian (Amharic) with GPT-4o reasoning!
Fine-tuning data generated for Ethiopian (Oromo) with GPT-4o reasoning!
Fine-tuning data generated for Nigerian (Hausa) with GPT-4o reasoning!
Fine-tuning data generated for Nigerian (Igbo) with GPT-4o reasoning!
Fine-tuning data generated for Nigerian (Yoruba) with GPT-4o reasoning!
Fine-tuning data generated for Kenyan (Swahili) with GPT-4o reasoning!
Fine-tuning data generated for Zimbabwean (Shona) with GPT-4o reasoning!
Fine-tuning data generated for Zimbabwean (Ndebele) with GPT-4o reasoning!


In [None]:
import jsonlines
import re, random, os, csv
from openai import OpenAI # Import the OpenAI library
from dotenv import load_dotenv

load_dotenv()

q_list = [
    '4','5','6','7','8','9','24','45','46','47','48','49','50','51','52','53','54','67','68','69','70','71','72','73','74','75','76','77','78','79','102','103','104','105','106','107','127','128','129','130','143','171','172','173','174','175','181','182','183','184','185','186','228A','228B','228C','228D','228E','228F','228G','228H','228I','228J','228K'
]

# Initialize OpenAI client (make sure your OPENAI_API_KEY is set in environment variables)
client = OpenAI()
def get_reasoning_from_gpt4o(user_prompt, country, language, answer, option_text):
    """
    Calls GPT-4o to generate reasoning for a given answer in the specified format.
    """
    system_message = (
        f"You are a helpful and insightful chatbot that provides clear and concise reasoning. "
        f"You are acting as a {country} citizen who speaks {language} and understands the {country} people and their culture very well. "
        f"When asked a question, you will provide a numerical answer and then explain your reasoning based on your persona's perspective. "
        f"The answer must be exactly '{answer}'. Your task is to provide the reasoning for why a {country} person who speaks {language} might give this answer."
    )
    
    # Craft a specific prompt for GPT-4o to generate reasoning in the required format
    reasoning_prompt = (
        f"Based on the following question and the provided answer, explain the reasoning behind this answer from the perspective of a {country} citizen: "
        f"\n\nQuestion: {user_prompt}"
        f"\n\nAnswer: {answer}"
        f"\n\nFormat your response EXACTLY like this example:"
        f"\n\n**1. Very important**"
        f"\n\nIn Hausa culture, family (iyali) is the foundation of society, providing economic support, childcare, and social security through extended family networks. Family determines one's identity, social status, and cultural values. Religious teachings emphasize honoring family as a fundamental duty. The extended family system creates mutual obligations and collective decision-making that are essential for survival and wellbeing. Therefore, family is considered \"very important\" in Hausa-Nigerian culture."
        f"\n\nNow format your response as:"
        f"\n**{answer}. {option_text}**"
        f"\n\nThen provide a single paragraph explaining the cultural reasoning behind this choice. "
        f"Use specific cultural references, traditions, values, and social structures from {country} culture. "
        f"Keep the explanation concise but comprehensive, around 3-4 sentences."
    )
    
    try:
        response = client.chat.completions.create(
            model="gpt-4.1", # Use GPT-4o
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": reasoning_prompt}
            ],
            max_tokens=300, # Increased slightly for the formatted response
            temperature=0.7 # Adjust temperature for creativity/determinism in reasoning
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error generating reasoning with GPT-4o: {e}")
        return f"**{answer}. {option_text}**\n\nReasoning could not be generated." # Fallback in case of API error

def getPrompt(item, t, hasContext=False):
    content = item['q_content']
    option = item['option']
    nums = re.findall(r"\d+",option)
    if '?' in content:
        prompt = f"Give me the answer from {min(nums)} to {max(nums)}: {content} {option}. You can only choose one option."
    else:
        prompt = f"Give me the answer from {min(nums)} to {max(nums)}: Do you agree with {content}? {option}. You can only choose one option."
 
    return prompt

def extract_option_text(option_string, answer_num):
    """
    Extract the text corresponding to the answer number from the option string.
    """
    # Parse the option string to extract the text for the given answer
    options = re.findall(r'(\d+)\.\s*([^0-9]+?)(?=\d+\.|$)', option_string)
    for num, text in options:
        if int(num) == int(answer_num):
            return text.strip()
    return "Answer option"  # Fallback if not found

def generateFinetuneData(country, language):
    dir_path = f"data/{language}/Finetune"
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    
    with jsonlines.open(f"{dir_path}/WVQ_{language}.jsonl", "w") as writer:
        with open(f'data/{language}/WVQ_{language}.csv', encoding='utf-8-sig') as f:
            csv_reader = csv.DictReader(f, skipinitialspace=True)
            all_answers = []
            for row in csv_reader:
                respondent_answers = {'country': row['country'], 'country_alpha': row['country_alpha'], 'language': row['language']}
                for q in q_list:
                    k = 'V' + q
                    respondent_answers[k] = int(float(row[k]))
                all_answers.append(respondent_answers)
        
        # Dictionary to store generated reasonings by q_id for reuse
        generated_reasonings = {}
        
        with open("data/WVQ6.jsonl", "r", encoding="utf8") as question_file:
            questions = list(jsonlines.Reader(question_file))
            for ans_item in all_answers:
                t = 0
                for item in questions:
                    prompt = getPrompt(item, t)
                    ans = ans_item['V' + item['q_id']]
                    ans = abs(ans)
                    
                    # Extract the option text for this answer
                    option_text = extract_option_text(item['option'], ans)
                    
                    # --- Generate reasoning using GPT-4o and store it ---
                    reasoning = get_reasoning_from_gpt4o(prompt, country, language, str(ans), option_text)
                    generated_reasonings[item['q_id']] = reasoning
                    # -------------------------------------
                    
                    system_msg = (
                        f"You are a {country} chatbot that speaks {language} and understands the {country} people very well. "
                    )
                    
                    # Use the formatted reasoning directly
                    assistant_content = reasoning
                    
                    new_item = {
                        "messages": [
                            {"role": "system", "content": system_msg},
                            {"role": "user", "content": prompt},
                            {"role": "assistant", "content": assistant_content}
                        ]
                    }
                    writer.write(new_item)
                    t += 1
        
        # Process new_WVQ6.jsonl using the SAME generated reasonings from WVQ6
        with open("data/new_WVQ6.jsonl", "r", encoding="utf8") as f:
            new_questions = list(jsonlines.Reader(f))
            
            # Process each respondent's answers for new_WVQ6
            for ans_item in all_answers:
                for item in new_questions:
                    prompt = getPrompt(item, 0)  # t doesn't matter here
                    q_id = item['q_id']
                    
                    # Use the SAME reasoning generated for this q_id from WVQ6
                    if q_id in generated_reasonings:
                        assistant_content = generated_reasonings[q_id]
                    else:
                        # Fallback if q_id not found (shouldn't happen if data is consistent)
                        ans = ans_item['V' + q_id]
                        if ans < 0:
                            ans = 0 - ans
                        option_text = extract_option_text(item['option'], ans)
                        assistant_content = f"**{ans}. {option_text}**\n\nReasoning not available for this question."
                    
                    system_msg = (
                        f"You are a {country} chatbot that speaks {language} and understands the {country} people very well. "
                    )
                    
                    new_item = {
                        "messages": [
                            {"role": "system", "content": system_msg},
                            {"role": "user", "content": prompt},
                            {"role": "assistant", "content": assistant_content}
                        ]
                    }
                    writer.write(new_item)
    
    print(f'Fine-tuning data generated for {country} ({language}) with GPT-4o reasoning!')

# generateFinetuneData('South African', 'Sotho')
# generateFinetuneData('South African', 'Afrikaans')
# generateFinetuneData('South African', 'Tswana')
# generateFinetuneData('South African', 'Xhosa')
# generateFinetuneData('South African', 'Zulu')

# generateFinetuneData('Ghanian', 'Ewe')
# generateFinetuneData('Ghanian', 'Twi')

# generateFinetuneData('Rwandan', 'kinyarwanda')

Fine-tuning data generated for South African (Sotho) with GPT-4o reasoning!


## Merge data

In [None]:
import os
import jsonlines
lan_list = ['Afrikaans', 'Amharic', 'Oromo', 'Ewe', 'Hausa', 'Igbo', 'kinyarwanda', 'Ndebele', 'Shona', 'Swahili', 'Tigrigna', 'Tswana', 'Twi', 'Xhosa', 'Yoruba', 'Zulu']
model_list = ['Afrikaans', 'Amharic', 'Oromo', 'Ewe', 'Hausa', 'Igbo', 'kinyarwanda', 'Ndebele', 'Shona', 'Swahili', 'Tigrigna', 'Tswana', 'Twi', 'Xhosa', 'Yoruba', 'Zulu']


# Create Finetune directory if it doesn't exist
finetune_dir = "data/Finetune"
if not os.path.exists(finetune_dir):
    os.makedirs(finetune_dir)
    
with jsonlines.open(f"data/Finetune/WVQ_all.jsonl", "a") as writer:
    for i in range(len(lan_list)):
        lan = lan_list[i]
        data = model_list[i]
        file_path = f'data/{lan}/Finetune/WVQ_{data}.jsonl'
        with open(file_path, "r+", encoding="utf8") as f:
            for item in jsonlines.Reader(f):
                writer.write(item)