In [None]:
import os, sys
# Expose only the GPU with index 1 to this process. The assignment is placed
# ahead of the `import torch` further down in this cell.
# NOTE(review): presumably so the setting is in effect before CUDA is
# initialised - confirm, and confirm that index 1 exists on the target host.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
# Add the trainer directory to the import path. The path is relative, so it is
# resolved against the kernel's working directory.
# NOTE(review): the `train_utils` package imported below presumably lives
# there - verify.
sys.path.append('../valuebert_trainer')

import glob
import numpy as np
import pandas as pd
import random

import torch
from sentence_transformers import SentenceTransformer
import umap
import matplotlib.pyplot as plt
import seaborn as sns

from train_utils.utils import init_random_seed, get_params_count, encode_value_dataset

In [None]:
# Notebook configuration.
# NOTE(review): later cells also read `base_data_path`, `seen_value_list` and
# `batch_size`, none of which is assigned in the cells visible here. This cell
# is the natural place to define them so a fresh-kernel "Run All" does not
# stop on a NameError.
model_path: str = 'CAiRE/UniVaR-lambda-5'  # checkpoint handed to SentenceTransformer
random_seed: int = 0  # handed to init_random_seed

In [None]:
# Initialise the random seed from the `random_seed` setting before the model
# and data cells run.
# NOTE(review): `init_random_seed` is imported from train_utils.utils; which
# libraries it seeds (random / numpy / torch) is not visible here - confirm.
init_random_seed(random_seed)

In [None]:
# Load the sentence-embedding checkpoint named by `model_path`.
# Use the GPU when one is usable and fall back to the CPU otherwise, instead
# of failing on a hard-coded 'cuda' (e.g. when CUDA_VISIBLE_DEVICES points at
# a GPU index that does not exist on this host).
model_device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer(model_path, device=model_device)

In [None]:
# Load the translated QA CSVs of every dataset (ValuePrism, PVQRR, GLOBE, WVS,
# LIMA) and combine each dataset into a single DataFrame.


def _model_name_from_csv(csv_path):
    """Return the file name of `csv_path` with its last 4 characters (`.csv`) removed."""
    return os.path.basename(csv_path)[:-4]


def _model_name_from_lima_csv(csv_path):
    """Return the part of the file name that precedes `_directly_answer`."""
    return os.path.basename(csv_path).split('_directly_answer')[0]


def load_qa_frames(dataset_dir, model_name_fn=_model_name_from_csv):
    """Read every `qa_translated/*.csv` file of one dataset.

    Reads `base_data_path` and `seen_value_list` from the notebook namespace.

    Args:
        dataset_dir: Dataset folder name under `base_data_path`.
        model_name_fn: Callable mapping a CSV path to the model name stored in
            the `model` column.

    Returns:
        A list with one DataFrame per CSV file. Missing values are replaced by
        '' and four columns are added:
            model - name derived from the file name,
            value - "<model>$<lang>",
            split - 'seen' if `value` is in `seen_value_list`, else 'unseen',
            qa    - "Q: <question> A: <answer>" with both parts stripped.

    Raises:
        FileNotFoundError: If no CSV file matches the dataset's glob pattern.
    """
    pattern = f'{base_data_path}/{dataset_dir}/qa_translated/*.csv'
    # glob returns paths in arbitrary order; sorting keeps the row order of
    # the combined frames (and the embeddings built from them) reproducible.
    csv_paths = sorted(glob.glob(pattern))
    if not csv_paths:
        raise FileNotFoundError(f'No CSV files match {pattern!r}')

    frames = []
    for csv_path in csv_paths:
        frame = pd.read_csv(csv_path).fillna('')
        frame['model'] = model_name_fn(csv_path)
        # Column-wise string operations instead of a row-wise apply: faster,
        # and they still yield a Series when the CSV has a header but no rows.
        frame['value'] = frame['model'] + '$' + frame['lang'].astype(str)
        frame['split'] = frame['value'].apply(
            lambda value: 'seen' if value in seen_value_list else 'unseen'
        )
        frame['qa'] = (
            'Q: ' + frame['question'].str.strip()
            + ' A: ' + frame['answer'].str.strip()
        )
        frames.append(frame)
    return frames


# Per-file frames of each dataset.
valueprism_dfs = load_qa_frames('value_prism')
pvqrr_dfs = load_qa_frames('pvqrr')
globe_dfs = load_qa_frames('globe')
wvs_dfs = load_qa_frames('wvs')
# LIMA file names carry a `_directly_answer...` suffix after the model name.
lima_dfs = load_qa_frames('lima', model_name_fn=_model_name_from_lima_csv)

# NOTE(review): `data` is not used by any cell visible here; it is kept only
# in case a later cell relies on it. Remove it if nothing does.
data = {}

# Combine the data
valueprism_df = pd.concat(valueprism_dfs).reset_index(drop=True)
pvqrr_df = pd.concat(pvqrr_dfs).reset_index(drop=True)
globe_df = pd.concat(globe_dfs).reset_index(drop=True)
wvs_df = pd.concat(wvs_dfs).reset_index(drop=True)
lima_df = pd.concat(lima_dfs).reset_index(drop=True)

In [None]:
# Generate Value Representation
def _embed_qa(frame):
    """Encode the `qa` column of `frame` with the loaded model.

    Uses the notebook-level `model` and `batch_size`, requests NumPy output
    and shows a progress bar.
    """
    qa_texts = frame['qa'].tolist()
    return model.encode(
        qa_texts,
        convert_to_numpy=True,
        batch_size=batch_size,
        show_progress_bar=True,
    )


# One embedding matrix per dataset, computed in the same order as before.
valueprism_reps = _embed_qa(valueprism_df)
pvqrr_reps = _embed_qa(pvqrr_df)
globe_reps = _embed_qa(globe_df)
wvs_reps = _embed_qa(wvs_df)
lima_reps = _embed_qa(lima_df)