### WAVE 7 Data creation 

In [162]:
import pandas as pd

# Read the semicolon-delimited Wave 7 export into a DataFrame.
df = pd.read_csv(
    '/Users/jamieo/Documents/AfricaLLM/WVS_original_dataset/F00013153-WVS_Wave_7_Nigeria_Csv_v5.0.csv',
    delimiter=';',
)

# Keep only the first occurrence of each column label.
repeated_label_mask = df.columns.duplicated()
df = df.loc[:, ~repeated_label_mask]

# Source column -> friendly name for the metadata columns.
column_rename_map = {
    'LNGE_ISO': 'language',
    'B_COUNTRY': 'country',
    'B_COUNTRY_ALPHA': 'country_alpha',
    'Q260': 'gender',
    'Q262': 'age',
}

# Question numbers to keep (Q260 and Q262 are covered by the rename map instead).
q_list = [
    '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
    '11', '12', '13', '14', '15', '16', '17',
    '27', '28', '29', '30', '31', '32', '33', '34', '35',
    '37', '38', '39', '40', '41',
    '43', '44', '45',
    '122', '123', '124', '125', '126', '127', '128', '129',
    '132', '133', '134', '135', '136', '137', '138',
    '142', '143',
    '146', '147', '148',
    '152',
    '158', '159', '160', '161', '162',
    '169', '170',
    '224', '225', '226', '227', '228', '229', '230', '231', '232', '233', '234', '235',
]

# Metadata columns first, then whichever question columns the file actually contains.
q_columns = ['Q' + num for num in q_list]
present_q_columns = [col for col in q_columns if col in df.columns]
required_columns = [*column_rename_map, *present_q_columns]

# Independent copy restricted to those columns, with the friendly names applied.
df_filtered = df[required_columns].copy()
df_filtered = df_filtered.rename(columns=column_rename_map)
df_filtered = df_filtered.loc[:, ~df_filtered.columns.duplicated()]


# Spell out gender codes 1 and 2; every other value is left untouched.
gender_labels = {1: 'Male', 2: 'Female'}
df_filtered['gender'] = df_filtered['gender'].replace(gender_labels)
# Age brackets
def age_bracket(age):
    """Map a raw age value to a ten-year age-bracket label.

    Args:
        age: Any value ``int()`` accepts (int, float, integer-like string).

    Returns:
        One of '<20', '20-29', '30-39', '40-49', '50-59', '60-69' or '70+'.
        'Unknown' is returned when the value cannot be converted to an
        integer (None, NaN, infinity, non-numeric text) or is negative.
    """
    try:
        age = int(age)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "unknown"; anything else should surface.
        return 'Unknown'

    if age < 0:
        # A negative number cannot be a real age; presumably a missing-answer
        # code in the survey export (verify against the codebook).
        return 'Unknown'

    # (exclusive upper bound, label) pairs, checked in ascending order.
    brackets = (
        (20, '<20'),
        (30, '20-29'),
        (40, '30-39'),
        (50, '40-49'),
        (60, '50-59'),
        (70, '60-69'),
    )
    for upper_bound, label in brackets:
        if age < upper_bound:
            return label
    return '70+'

# Derive the age bracket for every row from the raw age column.
df_filtered['age_group'] = df_filtered['age'].apply(age_bracket)

# The raw age is not needed once the bracket exists.
df_filtered = df_filtered.drop(columns=['age'], errors='ignore')

# Average every numeric column per demographic group and round to the nearest int.
# NOTE(review): any negative answer codes still present in the question columns
# are averaged as-is here — confirm that is intended.
group_columns = ['country', 'country_alpha', 'language', 'gender', 'age_group']
aggregated_df = (
    df_filtered
    .groupby(group_columns)
    .mean(numeric_only=True)
    .round()
    .astype(int)
    .reset_index()
)

# Save to file. The frame was loaded from the Nigeria export, so the output is
# named for Nigeria (it was previously written to 'WVQ_Ethiopia_aggregated.csv').
aggregated_df.to_csv('WVQ_Nigeria_aggregated.csv', index=False)

In [166]:
import jsonlines
import re, random, os, csv
from datasets import load_dataset

# Question numbers (without the 'Q' prefix) whose answers are copied from the
# aggregated CSV; one line per contiguous run of numbers.
q_list = [
    '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
    '11', '12', '13', '14', '15', '16', '17',
    '27', '28', '29', '30', '31', '32', '33', '34', '35',
    '37', '38', '39', '40', '41',
    '43', '44', '45',
    '122', '123', '124', '125', '126', '127', '128', '129',
    '132', '133', '134', '135', '136', '137', '138',
    '142', '143',
    '146', '147', '148',
    '152',
    '158', '159', '160', '161', '162',
    '169', '170',
    '224', '225', '226', '227', '228', '229', '230', '231', '232', '233', '234', '235',
]

def getPrompt(item, t, hasContext=False):
    """Build the user prompt for one survey question.

    Args:
        item: Mapping with 'q_content' (the question or statement text) and
            'option' (the answer-option text, which contains the numeric
            choices).
        t: Currently unused (the passive-rewrite variant that keyed on it is
            disabled); kept so existing callers keep working.
        hasContext: Currently unused (the context-prefix variant is disabled);
            kept for the same reason.

    Returns:
        The prompt string. Text without a '?' is wrapped as
        "Do you agree with ...?".

    Raises:
        ValueError: If 'option' contains no digits, so no answer range exists.
    """
    content = item['q_content']
    option = item['option']
    nums = re.findall(r"\d+", option)
    if not nums:
        raise ValueError(f"No numeric choices found in option text: {option!r}")

    # The regex yields strings; compare them numerically, otherwise '9' would
    # rank above '10' and a 1-10 scale would be announced as "1 to 9".
    lowest = min(nums, key=int)
    highest = max(nums, key=int)

    if '?' in content:
        prompt = f"Give me the answer from {lowest} to {highest}: {content} {option}. You can only choose one option."
    else:
        prompt = f"Give me the answer from {lowest} to {highest}: Do you agree with {content}? {option}. You can only choose one option."

    return prompt

def generateFinetuneData(country):
    """Write chat-style fine-tuning records for one country.

    Reads ``data2/{country}/WVQ_{country}_aggregated.csv`` (answers) and
    ``data2/WVQ.jsonl`` (questions), then writes one record per CSV row and
    question to ``data2/{country}/Finetune/WVQ_{country}.jsonl``. Each record
    holds a system message, the prompt from ``getPrompt`` and the answer as
    the assistant message.

    Args:
        country: Name used both in the file paths and in the system message.

    Raises:
        FileNotFoundError: If either input file is missing.
        KeyError: If a CSV row lacks a column for an entry of ``q_list``, or a
            question's 'q_id' has no matching answer column.
        ValueError: If an answer cell is not numeric.
    """
    dir_path = f"data2/{country}/Finetune"
    os.makedirs(dir_path, exist_ok=True)

    # Load both inputs first, so a missing or malformed input cannot leave
    # behind a truncated output file.
    with open(f'data2/{country}/WVQ_{country}_aggregated.csv', encoding='utf-8-sig') as f:
        csv_reader = csv.DictReader(f, skipinitialspace=True)

        all_answers = []
        for row in csv_reader:
            respondent_answers = {
                'country': row['country'],
                'country_alpha': row['country_alpha'],
                'gender': row['gender'],
                'age_group': row['age_group'],
            }
            for q in q_list:
                k = 'Q' + q
                # int(float(...)) also accepts values written like "3.0".
                respondent_answers[k] = int(float(row[k]))
            all_answers.append(respondent_answers)

    with open("data2/WVQ.jsonl", "r", encoding="utf8") as question_file:
        questions = list(jsonlines.Reader(question_file))

    with jsonlines.open(f"{dir_path}/WVQ_{country}.jsonl", "w") as writer:
        for ans_item in all_answers:
            # Same for every question of this row, so build it once. The
            # separating space keeps the two sentences from running together.
            system_msg = (
                f"You are a chatbot from {country} who understands the people very well. "
                f"This specific respondent is a {ans_item['gender']} aged in the {ans_item['age_group']} group."
            )
            for t, item in enumerate(questions):
                prompt = getPrompt(item, t)
                # A negative value is written out as its magnitude.
                # NOTE(review): confirm this is the intended handling of negative codes.
                ans = abs(ans_item['Q' + item['q_id']])

                new_item = {
                    "messages": [
                        {"role": "system", "content": system_msg},
                        {"role": "user", "content": prompt},
                        {"role": "assistant", "content": str(ans)}
                    ]
                }
                writer.write(new_item)

    print('All individual data processed successfully!')

# Build data2/Zimbabwe/Finetune/WVQ_Zimbabwe.jsonl from data2/Zimbabwe/WVQ_Zimbabwe_aggregated.csv.
# NOTE(review): the aggregation cell in this section loads the Nigeria export —
# confirm 'Zimbabwe' is the intended country for this run.
generateFinetuneData('Zimbabwe')

All individual data processed successfully!


### WAVE 6 Data creation

In [155]:
import pandas as pd

# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)

# Read the semicolon-delimited Wave 6 export; index_col=False keeps the first
# column as data rather than as the index.
df = pd.read_csv(
    '/Users/jamieo/Documents/AfricaLLM/WVS_original_dataset/WV6_Data_South_Africa_Csv_v20221117.1.csv',
    delimiter=';',
    index_col=False,
)

# Variable numbers to keep (without the 'V' prefix).
v_list = [
    '4', '5', '6', '7', '8', '9',
    '24',
    '45', '46', '47', '48', '49', '50', '51', '52', '53', '54',
    '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79',
    '102', '103', '104', '105', '106', '107',
    '127', '128', '129', '130',
    '142', '143', '144',
    '171', '172', '173', '174', '175', '176', '177', '178',
    '181', '182', '183', '184', '185', '186', '187', '188', '189', '190', '191',
    '211', '212',
    '216',
    '228A', '228B', '228C', '228D', '228E', '228F', '228G', '228H', '228I', '228J', '228K',
]

# Keep only the first occurrence of each column label.
repeated_label_mask = df.columns.duplicated()
df = df.loc[:, ~repeated_label_mask]

# Source column -> friendly name for the metadata columns.
column_rename_map = {
    'C_COW_ALPHA': 'country_alpha',
    'V2': 'country',
    'V240': 'gender',
    'V242': 'age',
}


# Metadata columns first, then whichever variable columns the file actually contains.
v_columns = ['V' + num for num in v_list]
present_v_columns = [col for col in v_columns if col in df.columns]
required_columns = [*column_rename_map, *present_v_columns]

# Independent copy restricted to those columns, with the friendly names applied.
df_filtered = df[required_columns].copy()
df_filtered = df_filtered.rename(columns=column_rename_map)
df_filtered = df_filtered.loc[:, ~df_filtered.columns.duplicated()]

# Swap the -4 placeholder in country_alpha for the country code 'ZAF'.
df_filtered['country_alpha'] = df_filtered['country_alpha'].replace({-4: 'ZAF'})

# Spell out gender codes 1 and 2; every other value is left untouched.
gender_labels = {1: 'Male', 2: 'Female'}
df_filtered['gender'] = df_filtered['gender'].replace(gender_labels)

# Age brackets
def age_bracket(age):
    """Map a raw age value to a ten-year age-bracket label.

    Args:
        age: Any value ``int()`` accepts (int, float, integer-like string).

    Returns:
        One of '<20', '20-29', '30-39', '40-49', '50-59', '60-69' or '70+'.
        'Unknown' is returned when the value cannot be converted to an
        integer (None, NaN, infinity, non-numeric text) or is negative.
    """
    try:
        age = int(age)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "unknown"; anything else should surface.
        return 'Unknown'

    if age < 0:
        # A negative number cannot be a real age; presumably a missing-answer
        # code in the survey export (verify against the codebook).
        return 'Unknown'

    # (exclusive upper bound, label) pairs, checked in ascending order.
    brackets = (
        (20, '<20'),
        (30, '20-29'),
        (40, '30-39'),
        (50, '40-49'),
        (60, '50-59'),
        (70, '60-69'),
    )
    for upper_bound, label in brackets:
        if age < upper_bound:
            return label
    return '70+'

# Derive the age bracket for every row from the raw age column.
df_filtered['age_group'] = df_filtered['age'].apply(age_bracket)

# The raw age is not needed once the bracket exists.
df_filtered = df_filtered.drop(columns=['age'], errors='ignore')

# Average every numeric column per demographic group and round to the nearest int.
group_columns = ['country', 'country_alpha', 'gender', 'age_group']
group_means = df_filtered.groupby(group_columns).mean(numeric_only=True)
aggregated_df = group_means.round().astype(int).reset_index()

# Write the aggregated table without the index column.
aggregated_df.to_csv('WVQ_South_Africa_aggregated.csv', index=False)

In [156]:
import jsonlines
import re, random, os, csv
from datasets import load_dataset

# Variable numbers (without the 'V' prefix) whose answers are copied from the
# aggregated CSV; one line per contiguous run of numbers.
# NOTE(review): narrower than v_list in the aggregation cell (no 142, 187-191,
# 211, 212, 216) — confirm that is intentional.
q_list = [
    '4', '5', '6', '7', '8', '9',
    '24',
    '45', '46', '47', '48', '49', '50', '51', '52', '53', '54',
    '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79',
    '102', '103', '104', '105', '106', '107',
    '127', '128', '129', '130',
    '143', '144',
    '171', '172', '173', '174', '175', '176', '177', '178',
    '181', '182', '183', '184', '185', '186',
    '228A', '228B', '228C', '228D', '228E', '228F', '228G', '228H', '228I', '228J', '228K',
]


def getPrompt(item, t, hasContext=False):
    """Build the user prompt for one survey question.

    Args:
        item: Mapping with 'q_content' (the question or statement text) and
            'option' (the answer-option text, which contains the numeric
            choices).
        t: Currently unused (the passive-rewrite variant that keyed on it is
            disabled); kept so existing callers keep working.
        hasContext: Currently unused (the context-prefix variant is disabled);
            kept for the same reason.

    Returns:
        The prompt string. Text without a '?' is wrapped as
        "Do you agree with ...?".

    Raises:
        ValueError: If 'option' contains no digits, so no answer range exists.
    """
    content = item['q_content']
    option = item['option']
    nums = re.findall(r"\d+", option)
    if not nums:
        raise ValueError(f"No numeric choices found in option text: {option!r}")

    # The regex yields strings; compare them numerically, otherwise '9' would
    # rank above '10' and a 1-10 scale would be announced as "1 to 9".
    lowest = min(nums, key=int)
    highest = max(nums, key=int)

    if '?' in content:
        prompt = f"Give me the answer from {lowest} to {highest}: {content} {option}. You can only choose one option."
    else:
        prompt = f"Give me the answer from {lowest} to {highest}: Do you agree with {content}? {option}. You can only choose one option."

    return prompt

def generateFinetuneData(country):
    """Write chat-style fine-tuning records for one country (Wave 6 variables).

    Reads ``data2/{country}/WVQ_{country}_aggregated.csv`` (answers) and
    ``data2/WVQ6.jsonl`` (questions), then writes one record per CSV row and
    question to ``data2/{country}/Finetune/WVQ_{country}.jsonl``. Each record
    holds a system message, the prompt from ``getPrompt`` and the answer as
    the assistant message.

    Args:
        country: Name used both in the file paths and in the system message.

    Raises:
        FileNotFoundError: If either input file is missing.
        KeyError: If a CSV row lacks a column for an entry of ``q_list``, or a
            question's 'q_id' has no matching answer column.
        ValueError: If an answer cell is not numeric.
    """
    dir_path = f"data2/{country}/Finetune"
    os.makedirs(dir_path, exist_ok=True)

    # Load both inputs first, so a missing or malformed input cannot leave
    # behind a truncated output file.
    with open(f'data2/{country}/WVQ_{country}_aggregated.csv', encoding='utf-8-sig') as f:
        csv_reader = csv.DictReader(f, skipinitialspace=True)

        all_answers = []
        for row in csv_reader:
            respondent_answers = {
                'country': row['country'],
                'country_alpha': row['country_alpha'],
                'gender': row['gender'],
                'age_group': row['age_group'],
            }
            for q in q_list:
                k = 'V' + q
                # int(float(...)) also accepts values written like "3.0".
                respondent_answers[k] = int(float(row[k]))
            all_answers.append(respondent_answers)

    with open("data2/WVQ6.jsonl", "r", encoding="utf8") as question_file:
        questions = list(jsonlines.Reader(question_file))

    with jsonlines.open(f"{dir_path}/WVQ_{country}.jsonl", "w") as writer:
        for ans_item in all_answers:
            # Same for every question of this row, so build it once. The
            # separating space keeps the two sentences from running together.
            system_msg = (
                f"You are a chatbot from {country} who understands the people very well. "
                f"This specific respondent is a {ans_item['gender']} aged in the {ans_item['age_group']} group."
            )
            for t, item in enumerate(questions):
                prompt = getPrompt(item, t)
                # A negative value is written out as its magnitude.
                # NOTE(review): confirm this is the intended handling of negative codes.
                ans = abs(ans_item['V' + item['q_id']])

                new_item = {
                    "messages": [
                        {"role": "system", "content": system_msg},
                        {"role": "user", "content": prompt},
                        {"role": "assistant", "content": str(ans)}
                    ]
                }
                writer.write(new_item)

    print('All individual data processed successfully!')

# Build data2/South_Africa/Finetune/WVQ_South_Africa.jsonl from
# data2/South_Africa/WVQ_South_Africa_aggregated.csv.
generateFinetuneData('South_Africa')

All individual data processed successfully!


## Merge data

In [170]:
# Parallel lists: lan_list[i] names the country directory and model_list[i]
# the file stem inside it (currently identical).
lan_list = ['Ghana', 'Rwanda', 'South_Africa', 'Ethiopia', 'Kenya', 'Nigeria', 'Zimbabwe']
model_list = ['Ghana', 'Rwanda', 'South_Africa', 'Ethiopia', 'Kenya', 'Nigeria', 'Zimbabwe']

# Create Finetune directory if it doesn't exist
finetune_dir = "data2/Finetune"
os.makedirs(finetune_dir, exist_ok=True)

# Open with "w" rather than "a": the merged file is rebuilt from the
# per-country files on every run, so re-running this cell no longer appends a
# second copy of every record.
with jsonlines.open(f"{finetune_dir}/WVQ_all.jsonl", "w") as writer:
    for lan, data in zip(lan_list, model_list):
        file_path = f'data2/{lan}/Finetune/WVQ_{data}.jsonl'
        # The per-country files are only read, so plain "r" is enough.
        with open(file_path, "r", encoding="utf8") as f:
            for item in jsonlines.Reader(f):
                writer.write(item)