In [1]:
# Associate words with archetypes/character traits as intermediate layer
# and with influencer as the "last" layer

# Dependencies
import pandas as pd
import numpy as np
from tqdm import tqdm
import copy
import os
import toml
import re
import itertools
from text_cleaner import *
import operator
from collections import Counter
import pickle

def extract_hashtags(post_text):
    """Return every hashtag found in ``post_text``, in order of appearance.

    A hashtag is a ``#`` immediately followed by one or more word characters;
    the leading ``#`` is kept in each returned string. An input without
    hashtags yields an empty list.
    """
    return [match.group(0) for match in re.finditer(r"\#\w+", post_text)]

In [3]:
# Load the .csv with archetypes; the first CSV column is used as the row index
# (the displayed head shows profile names there)
arch_df = pd.read_csv('archetypes_pl_new.csv', index_col=0)

# Save the order of columns - each column is one archetype/trait
trait_list = arch_df.columns.tolist()

# Show the table header and column list
print(trait_list)
arch_df.head()

['innocent', 'sage', 'explorer', 'outlaw', 'magician', 'hero', 'lover', 'jester', 'everyman', 'caregiver', 'ruler', 'creator', 'dominant', 'submissive', 'maximalist', 'minimalist', 'inspiring', 'systematic', 'discovering', 'conservative', 'verifying', 'overlooking', 'sharpening', 'harmonic', 'empathic', 'matter_of_fact', 'brave', 'protective', 'generous', 'thrifty', 'favourable', 'balanced', 'sensuality', 'intelligent', 'believe', 'egocentric', 'allocentric']


Unnamed: 0_level_0,innocent,sage,explorer,outlaw,magician,hero,lover,jester,everyman,caregiver,...,protective,generous,thrifty,favourable,balanced,sensuality,intelligent,believe,egocentric,allocentric
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
marek_grodzki,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,2.0,3.0,4.0,4.0,3.0,4.0,4.0,3.0,0.0,0.0
vege_style_life,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,4.0,4.0,4.0,3.0,3.0,3.0,2.0,0.0,3.0
oliwka__2007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,2.0,2.0,0.0,3.0,1.0,2.0,4.0,1.0,0.0,3.0
z_przestrzeni_serca,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,3.0,0.0,4.0,4.0,3.0,4.0,4.0,0.0,1.0
zaradne_warsztaty,3.0,0.0,0.0,0.0,3.0,0.0,0.0,2.0,3.0,4.0,...,3.0,4.0,0.0,2.0,2.0,4.0,2.0,3.0,1.0,3.0


In [4]:
# Table preprocessing - replace all NaN with 2 (Unrelated/Don't know class).
# NOTE(review): an earlier version of this comment also announced a rescaling of the
# annotation values to the range -1.0 - 1.0; no rescaling is performed in this cell.
arch_df = arch_df.fillna(2)

# Remove duplicated annotations, to exclude conflicting entries
# (for a repeated index label only the first row is kept)
arch_df = arch_df[~arch_df.index.duplicated(keep='first')]

# Print the head of the dataset after modification
arch_df.head()

Unnamed: 0_level_0,innocent,sage,explorer,outlaw,magician,hero,lover,jester,everyman,caregiver,...,protective,generous,thrifty,favourable,balanced,sensuality,intelligent,believe,egocentric,allocentric
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
marek_grodzki,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,2.0,3.0,4.0,4.0,3.0,4.0,4.0,3.0,0.0,0.0
vege_style_life,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,4.0,4.0,4.0,3.0,3.0,3.0,2.0,0.0,3.0
oliwka__2007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,2.0,2.0,0.0,3.0,1.0,2.0,4.0,1.0,0.0,3.0
z_przestrzeni_serca,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,3.0,0.0,4.0,4.0,3.0,4.0,4.0,0.0,1.0
zaradne_warsztaty,3.0,0.0,0.0,0.0,3.0,0.0,0.0,2.0,3.0,4.0,...,3.0,4.0,0.0,2.0,2.0,4.0,2.0,3.0,1.0,3.0


In [5]:
# Collect the cleaned posts and the hashtags of every annotated profile.
# NOTE(review): the original intent was to delete users without a non-empty directory,
# but nothing is filtered here - available_arch_df stays an unmodified copy of arch_df,
# and os.listdir raises if a profile directory does not exist.
available_arch_df = copy.deepcopy(arch_df)
# One entry per profile, appended in arch_df row order
posts = []

BASE_DIR = "instagram_cleared"

# Iterate over whole DataFrame (the index label `i` is used as the profile directory name)
for i, row in tqdm(arch_df.iterrows()):
    profile_posts = []
    profile_hashtags = []

    # Get all posts per profile - every file that is not a .toml file is read as a post
    profile_path = os.path.join(BASE_DIR, i)
    for file in os.listdir(profile_path):
        if not file.endswith(".toml"):
            # NOTE(review): no explicit encoding is passed, so the platform default is used -
            # confirm it matches the encoding of the post files
            with open(os.path.join(profile_path, file), "r") as post_f:
                read_text = post_f.read()
                # presumably remove_stopwords returns a list of tokens (text_cleaner is not visible here)
                profile_posts.append(remove_stopwords(clean_up_text(read_text)))
                profile_hashtags.append(extract_hashtags(read_text))

    # Merge lists - a single list for a single influencer: the cleaned result of each post,
    # followed by one extra element holding all hashtags of the profile
    profile_hashtags = list(itertools.chain.from_iterable(profile_hashtags))
    posts.append(list(itertools.chain.from_iterable([profile_posts, [profile_hashtags]])))

685it [00:20, 33.00it/s]


In [6]:
# Map usernames to indices (the row position of each user, which is also the position
# of that user's entry in `posts`, because both follow the same row order)
users = list(available_arch_df.index.values)
# enumerate yields every position in one pass instead of a linear users.index() scan per
# user; the mapping is the same because duplicated usernames were dropped from the index
user_indices = {name: position for position, name in enumerate(users)}

In [7]:
# Get the word count and create a dataframe, where rows are archetypes/traits, and columns are single words
# (each trait is added as one row keyed by the trait name; the column labels later become word_map)
# Initialize a word DataFrame
word_df = pd.DataFrame()

def merge_dicts(dict_a, dict_b) -> dict:
    """Add the values of ``dict_b`` onto ``dict_a`` key by key and return ``dict_a``.

    Keys missing from ``dict_a`` are inserted with the value from ``dict_b``;
    keys present in both are combined with ``+=``. ``dict_a`` is updated in
    place - the returned object is ``dict_a`` itself, not a copy.
    """
    merged = dict_a
    for key in dict_b:
        if key not in merged:
            merged[key] = dict_b[key]
        else:
            merged[key] += dict_b[key]
    return merged

# Iterate over all of the traits/archetypes and build one row of weighted word
# frequencies per trait
trait_frames = []
for trait in tqdm(trait_list):
    # Annotation value of every influencer for this trait; used below as a per-influencer weight
    subset_df = arch_df[trait]
    subset_indices = [user_indices[idx] for idx in subset_df.index.values]
    trait_weights = subset_df.tolist()

    # Get all posts for the list of influencers. Plain indexing is used instead of
    # operator.itemgetter(*indices), which returns a bare item (not a tuple) for a
    # single index and raises TypeError for an empty index list.
    sublist = [posts[user_pos] for user_pos in subset_indices]

    # Word counts weighted by the influencer's annotation value, normalised by the
    # unweighted number of counted words
    trait_total = 0
    out_dict = {}
    for weight, post_set in zip(trait_weights, sublist):
        trait_ctr = Counter(itertools.chain.from_iterable(post_set))
        trait_total += sum(trait_ctr.values())
        weighted_counts = {word: count * weight for word, count in trait_ctr.items()}
        out_dict = merge_dicts(out_dict, weighted_counts)
    out_dict = {k: float(v / trait_total) for k, v in out_dict.items()}

    # One single-row frame per trait (row label = trait, columns = words)
    trait_frames.append(pd.DataFrame.from_dict({trait: out_dict}, orient="index"))

# DataFrame.append was removed in pandas 2.0 and copied the whole, ever wider frame on
# each call; concatenate all rows once instead. word_df is kept as the first element so
# the result matches appending the rows to the existing frame.
word_df = pd.concat([word_df, *trait_frames])

100%|██████████| 37/37 [05:40<00:00,  9.19s/it]


In [8]:
# AGDS - discrete approach, classification-like
# Try to associate word with trait association class, not the trait itself

def merge_dicts(dict_a, dict_b) -> dict:
    """Accumulate ``dict_b`` into ``dict_a`` and return ``dict_a``.

    For every key of ``dict_b`` the value is added (``+=``) to the entry of
    ``dict_a`` when the key already exists there, otherwise it is stored as a
    new entry. The first argument is modified in place and returned.

    NOTE(review): this cell redefines ``merge_dicts`` with the same behaviour as
    the definition in the earlier cell of this notebook.
    """
    for key, increment in dict_b.items():
        if key in dict_a:
            dict_a[key] += increment
            continue
        dict_a[key] = increment
    return dict_a

# Iterate over all of the traits/archetypes and, per trait, build one row of relative
# word frequencies for every annotation class
trait_classes = range(5)
word_set = set()
trait_df_list = []
for trait in tqdm(trait_list):
    # Annotation column of the given archetype/trait
    subset_df = arch_df[trait]
    class_frames = []
    for trait_class in trait_classes:
        # Influencers annotated with exactly this class for the current trait
        class_df = subset_df.loc[subset_df == trait_class]
        subset_indices = [user_indices[idx] for idx in class_df.index.values]

        # Get all posts for the list of influencers. operator.itemgetter(*indices) is
        # avoided on purpose: for a single index it returns the bare item instead of a
        # 1-tuple (the words of that influencer would then be counted character by
        # character) and for no index at all it raises TypeError.
        sublist = [posts[user_pos] for user_pos in subset_indices]

        # Counter to calculate each word occurrences
        trait_total = 0
        out_dict = {}
        for post_set in sublist:
            trait_ctr = Counter(itertools.chain.from_iterable(post_set))
            trait_total += sum(trait_ctr.values())
            out_dict = merge_dicts(out_dict, trait_ctr)
        # Relative frequency of each word within the class (not evaluated for an empty class)
        out_dict = {k: float(v / trait_total) for k, v in out_dict.items()}
        word_set.update(out_dict.keys())
        class_frames.append(pd.DataFrame.from_dict({trait_class: out_dict}, orient="index"))

    # DataFrame.append was removed in pandas 2.0; concatenate the class rows once.
    # reindex guarantees one row per class even when a class has no influencer, so the
    # row position inside a trait always equals the class label (missing rows are NaN).
    trait_subframe = pd.concat(class_frames).reindex(trait_classes)
    trait_df_list.append(trait_subframe)

# Rows are indexed by (trait, class); columns are words
softmax_word_df = pd.concat(trait_df_list, keys=trait_list)

100%|██████████| 37/37 [20:25<00:00, 33.13s/it]


In [9]:
# Check the calculation results (rows: (trait, class) pairs, columns: words;
# NaN marks a word that was not counted for that row)
softmax_word_df

Unnamed: 0,Unnamed: 1,zainspirowany,wczorajszym,wywiadem,odnośnie,relacji,chciałem,przekazać,okres,kwarantanny,świetny,...,#piekewdomu,#dish,#kremzcukinii,#zucchina,#pranzoitaliano,#tradycyjnejedzenie,#tatar,#meatlover,#zachcianki,#pierogizkapusta
innocent,0,0.000021,0.000043,2.125173e-05,0.000043,0.000149,0.000106,0.000064,0.000149,0.000149,0.000128,...,,,,,,,,,,
innocent,1,,0.000013,,0.000013,0.000243,0.000013,0.000013,0.000040,0.000040,0.000013,...,,,,,,,,,,
innocent,2,0.000002,0.000019,,0.000045,0.000112,0.000074,0.000045,0.000100,0.000045,0.000095,...,,,,,,,,,,
innocent,3,0.000006,0.000034,9.158188e-07,0.000037,0.000173,0.000026,0.000030,0.000132,0.000039,0.000071,...,,,,,,,,,,
innocent,4,,0.000016,,0.000055,0.000079,0.000008,0.000040,0.000079,0.000047,0.000040,...,0.000040,0.000016,0.000008,0.000016,0.000008,0.000008,0.000008,0.000008,0.000008,0.000008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
allocentric,0,0.000016,0.000033,3.259984e-05,0.000049,0.000228,0.000081,0.000081,0.000081,0.000065,0.000081,...,,,,,,,,,,
allocentric,1,0.000009,0.000033,,0.000033,0.000164,0.000021,0.000021,0.000270,0.000048,0.000061,...,,,,,,,,,,
allocentric,2,,0.000021,,0.000036,0.000088,0.000047,0.000031,0.000109,0.000026,0.000109,...,,,,,,,,,,
allocentric,3,0.000005,0.000027,,0.000046,0.000145,0.000046,0.000028,0.000085,0.000042,0.000076,...,0.000006,0.000002,0.000001,0.000002,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001


In [10]:
# Fill NaN values with 0 (a word that was not counted for a row gets frequency 0)
softmax_word_df = softmax_word_df.fillna(0)

# NOTE(review): pickle is already imported in the first cell; this import is redundant
import pickle

# Persist the username -> row position mapping used while building the table
with open("softmax_full_influencer_index_map.pickle", "wb") as f:
    pickle.dump(user_indices, f)

# Persist the (trait, class) x word frequency table
softmax_word_df.to_pickle("softmax_full_word_trait_array.pickle")

In [11]:
# Fill NaN values with 0 (a word that was not counted for a trait gets weight 0)
word_df = word_df.fillna(0)

In [12]:
# Save the result to a pickle
# NOTE(review): pickle is already imported in the first cell; this import is redundant
import pickle

# Persist the username -> row position mapping
with open("full_influencer_index_map.pickle", "wb") as f:
    pickle.dump(user_indices, f)

# Persist the trait x word weight table
word_df.to_pickle("full_word_trait_array.pickle")

In [13]:
def get_trait_dot_product(post_text: str, word_map: list, word_dataframe: pd.DataFrame) -> pd.Series:
    """Score a text against every row of a word-frequency table.

    The text is cleaned with ``clean_up_text`` / ``remove_stopwords``, its hashtags
    are added, and the occurrences of every word listed in ``word_map`` are
    counted. The resulting count vector is multiplied with ``word_dataframe``.

    Args:
        post_text: Raw text to score.
        word_map: Words in the same order as the columns of ``word_dataframe``.
        word_dataframe: Table whose columns correspond to ``word_map``.

    Returns:
        The result of ``word_dataframe.dot(count_vector)`` - one value per row
        of ``word_dataframe``.
    """
    # Filter out the text
    filtered_post = remove_stopwords(clean_up_text(post_text))
    filtered_post += extract_hashtags(post_text)

    # Look-up table word -> position, built once per call. Testing `word in word_map`
    # and calling `word_map.index(word)` for every word scans the whole list each time.
    # setdefault keeps the first position of a repeated word, exactly like list.index.
    word_positions = {}
    for position, word in enumerate(word_map):
        word_positions.setdefault(word, position)

    # Create a vector for dot product: occurrences of each known word, 0 elsewhere
    post_vector = [0] * len(word_map)
    for word, freq in Counter(filtered_post).items():
        position = word_positions.get(word)
        if position is not None:
            post_vector[position] = freq

    # Calculate dot product for a given text
    return word_dataframe.dot(post_vector)

# Replace NaN with 0 in word_frequency_table
# NOTE(review): word_df was already filled with 0 in an earlier cell, so this is a no-op
# unless NaN values were introduced in between
word_df = word_df.fillna(0)

# Method for calculating the dot product of trait <-> influencer relation
def get_influencer_dot_product(trait_output: list, influencer_dataframe: pd.DataFrame) -> pd.DataFrame:
    """Multiply the influencer table with a vector of trait scores.

    Args:
        trait_output: One score per column of ``influencer_dataframe``.
        influencer_dataframe: Table with one row per influencer.

    Returns:
        The result of ``influencer_dataframe.dot(trait_output)`` - one value
        per row of ``influencer_dataframe``.
    """
    influencer_scores = influencer_dataframe.dot(trait_output)
    return influencer_scores

# Method for calculating the similarity
def calculate_similarity(post_text: str, 
                         word_map: list, 
                         word_dataframe: pd.DataFrame,
                         influencer_dataframe: pd.DataFrame) -> pd.DataFrame:
    """Score a text against every influencer.

    The word-trait dot product of the text is computed first, then the
    trait-influencer dot product, and the latter is divided by the sum of each
    influencer's trait values (row sums of ``influencer_dataframe``).

    Args:
        post_text: Raw text to score.
        word_map: Words in the column order of ``word_dataframe``.
        word_dataframe: Word-frequency table.
        influencer_dataframe: Table with one row per influencer.

    Returns:
        The normalised dot product, one value per influencer.
    """
    # Word-trait scores of the text
    trait_scores = get_trait_dot_product(post_text, word_map, word_dataframe)

    # Trait-influencer dot product, divided by the row sums of the influencer traits
    return get_influencer_dot_product(trait_scores, influencer_dataframe).divide(
        influencer_dataframe.sum(axis=1)
    )

# Trait accuracy - round the results
def natural_round(x: float) -> int:
    """Round ``x`` to the nearest integer, with halves rounded upwards.

    The fractional part is measured from the floor of ``x``, so 2.5 becomes 3
    and -0.5 becomes 0.
    """
    floor_part = int(x // 1)
    if (x - floor_part) >= 0.5:
        return floor_part + 1
    return floor_part

def accuracy_per_trait(input_vector: pd.Series, annotated_vector: pd.Series) -> np.ndarray:
    """Mark, trait by trait, whether the rounded prediction equals the annotation.

    Values are compared by position. Each predicted value is rounded with
    ``natural_round`` before the comparison.

    Args:
        input_vector: Predicted value per trait.
        annotated_vector: Annotated value per trait, in the same order.

    Returns:
        Integer array with 1 where the rounded prediction matches the
        annotation and 0 elsewhere; its length follows the inputs instead of
        being fixed to 37.
    """
    # Plain arrays give positional access; integer `[]` on a label-indexed Series
    # relies on a deprecated positional fallback
    predicted = np.asarray(input_vector)
    expected = np.asarray(annotated_vector)
    hits = [int(natural_round(pred) == truth) for pred, truth in zip(predicted, expected)]
    # The builtin int replaces the np.int alias, which was removed in NumPy 1.24
    return np.array(hits, dtype=int)

In [15]:
# Create word map for both structures: the column labels (words) of each table, in column
# order, so that a position in the list identifies a column of the matching DataFrame
word_map = word_df.columns.tolist()
softmax_word_map = softmax_word_df.columns.tolist()

In [16]:
# Out accuracy vector: number of users whose prediction was correct, per trait.
# The builtin int replaces the np.int alias, which was removed in NumPy 1.24, and the
# length follows trait_list instead of a hard-coded 37.
total_accuracy = np.zeros(len(trait_list), dtype=int)

pbar = tqdm(arch_df.iterrows(), total=len(arch_df))
for n_seen, (idx, row) in enumerate(pbar, start=1):
    # Rebuild one text per user from the stored words and hashtags; user_indices gives
    # the position in `posts` without a linear users.index() scan
    user_text = " ".join(itertools.chain.from_iterable(posts[user_indices[idx]]))
    sim_output = get_trait_dot_product(user_text, word_map, word_df)
    user_accuracy = accuracy_per_trait(sim_output, row)
    total_accuracy += user_accuracy
    # Running mean over the users processed so far
    pbar.set_description(f"Average accuracy: {round(np.mean(np.divide(total_accuracy, n_seen))*100, 2)}")

Average accuracy: 15.94: : 685it [40:51,  3.58s/it]


# Softmax version

In [17]:
from scipy.special import softmax

def get_trait_dot_product(post_text: str, word_map: list, word_dataframe: pd.DataFrame) -> pd.Series:
    """Predict one annotation class per trait for a text (softmax variant).

    The text is cleaned with ``clean_up_text`` / ``remove_stopwords``, its hashtags
    are added, and the occurrences of every word listed in ``word_map`` are
    counted. The count vector is multiplied with ``word_dataframe``; for every
    trait in the notebook-level ``trait_list`` the position of the highest softmax
    score among that trait's rows is taken as the prediction.

    Args:
        post_text: Raw text to classify.
        word_map: Words in the same order as the columns of ``word_dataframe``.
        word_dataframe: Table whose first index level is the trait name.

    Returns:
        Series indexed by trait name holding, per trait, the row position of the
        best-scoring class. NOTE(review): the position equals the class label only
        when the rows of each trait are the classes 0..n in order - confirm.
    """
    # Filter out the text
    filtered_post = remove_stopwords(clean_up_text(post_text))
    filtered_post += extract_hashtags(post_text)

    # Look-up table word -> position, built once per call. Testing `word in word_map`
    # and calling `word_map.index(word)` for every word scans the whole list each time.
    # setdefault keeps the first position of a repeated word, exactly like list.index.
    word_positions = {}
    for position, word in enumerate(word_map):
        word_positions.setdefault(word, position)

    # Create a vector for dot product: occurrences of each known word, 0 elsewhere
    post_vector = [0] * len(word_map)
    for word, freq in Counter(filtered_post).items():
        position = word_positions.get(word)
        if position is not None:
            post_vector[position] = freq

    # Calculate dot product for a given text
    word_dot = word_dataframe.dot(post_vector)

    # Build the result in one go: an empty pd.Series() without dtype is deprecated and
    # Series.append was removed in pandas 2.0
    predicted_classes = [np.argmax(softmax(word_dot.loc[trait])) for trait in trait_list]
    return pd.Series(predicted_classes, index=trait_list, dtype="int64")

# Trait accuracy - round the results
def natural_round(x: float) -> int:
    """Round ``x`` to the nearest integer, with halves rounded upwards.

    The fractional part is measured from the floor of ``x``, so 2.5 becomes 3
    and -0.5 becomes 0.
    """
    floor_part = int(x // 1)
    if (x - floor_part) >= 0.5:
        return floor_part + 1
    return floor_part

def accuracy_per_trait(input_vector: pd.Series, annotated_vector: pd.Series) -> np.ndarray:
    """Mark, trait by trait, whether the predicted class equals the annotation.

    Values are compared by position and without rounding (the softmax variant
    already predicts class indices).

    Args:
        input_vector: Predicted class per trait.
        annotated_vector: Annotated class per trait, in the same order.

    Returns:
        Integer array with 1 where prediction and annotation are equal and 0
        elsewhere; its length follows the inputs instead of being fixed to 37.
    """
    # Plain arrays give positional access; integer `[]` on a label-indexed Series
    # relies on a deprecated positional fallback
    predicted = np.asarray(input_vector)
    expected = np.asarray(annotated_vector)
    hits = [int(pred == truth) for pred, truth in zip(predicted, expected)]
    # The builtin int replaces the np.int alias, which was removed in NumPy 1.24
    return np.array(hits, dtype=int)

In [18]:
# NOTE(review): `accuracy` is not used anywhere in the visible notebook; kept in case
# another cell reads it
accuracy = 0

# Out accuracy vector: number of users whose prediction was correct, per trait.
# The builtin int replaces the np.int alias, which was removed in NumPy 1.24, and the
# length follows trait_list instead of a hard-coded 37.
total_accuracy = np.zeros(len(trait_list), dtype=int)

pbar = tqdm(arch_df.iterrows(), total=len(arch_df))
for n_seen, (idx, row) in enumerate(pbar, start=1):
    # Rebuild one text per user from the stored words and hashtags; user_indices gives
    # the position in `posts` without a linear users.index() scan
    user_text = " ".join(itertools.chain.from_iterable(posts[user_indices[idx]]))
    sim_output = get_trait_dot_product(user_text, softmax_word_map, softmax_word_df)
    user_accuracy = accuracy_per_trait(sim_output, row)
    total_accuracy += user_accuracy
    # Running mean over the users processed so far
    pbar.set_description(f"Average accuracy: {round(np.mean(np.divide(total_accuracy, n_seen))*100, 2)}")

  out_vec = pd.Series()
Average accuracy: 44.16: : 685it [42:29,  3.72s/it]


In [19]:
# Test dataset
# NOTE(review): this cell rebinds arch_df, trait_list, available_arch_df, posts, users and
# user_indices to the test set; the training versions of these names are no longer
# reachable afterwards.

# Load the .csv with archetypes
arch_df = pd.read_csv('test_archetypes_pl.csv', index_col=0)

# Save the order of columns
trait_list = arch_df.columns.tolist()

# Show the table header and column list
# NOTE(review): only the print produces output here - a bare arch_df.head() in the middle
# of a cell is evaluated but not displayed
print(trait_list)
arch_df.head()

# Table preprocessing - replace all NaN with 2 (Unrelated/Don't know class).
# NOTE(review): an earlier version of this comment also announced a rescaling of the
# annotation values to the range -1.0 - 1.0; no rescaling is performed here.
arch_df = arch_df.fillna(2)

# Remove duplicated annotations, to exclude conflicting entries
arch_df = arch_df[~arch_df.index.duplicated(keep='first')]

# Print the head of the dataset after modification (not displayed, see the note above)
arch_df.head()

# Collect the cleaned posts and the hashtags of every annotated test profile.
# NOTE(review): despite the original intent, no user is removed here - available_arch_df
# stays an unmodified copy of arch_df, and os.listdir raises if a profile directory does
# not exist.
available_arch_df = copy.deepcopy(arch_df)
posts = []

BASE_DIR = "instagram_cleared"

# Iterate over whole DataFrame (the index label `i` is used as the profile directory name)
for i, row in tqdm(arch_df.iterrows()):
    profile_posts = []
    profile_hashtags = []

    # Get all posts per profile - every file that is not a .toml file is read as a post
    profile_path = os.path.join(BASE_DIR, i)
    for file in os.listdir(profile_path):
        if not file.endswith(".toml"):
            with open(os.path.join(profile_path, file), "r") as post_f:
                read_text = post_f.read()
                profile_posts.append(remove_stopwords(clean_up_text(read_text)))
                profile_hashtags.append(extract_hashtags(read_text))

    # Merge lists - a single list for a single influencer: the cleaned result of each post,
    # followed by one extra element holding all hashtags of the profile
    profile_hashtags = list(itertools.chain.from_iterable(profile_hashtags))
    posts.append(list(itertools.chain.from_iterable([profile_posts, [profile_hashtags]])))

# Map usernames to indices (row position of every test user)
users = list(arch_df.index.values)
user_indices = {k: users.index(k) for k in users}



4it [00:00, 33.14it/s]

['innocent', 'sage', 'explorer', 'outlaw', 'magician', 'hero', 'lover', 'jester', 'everyman', 'caregiver', 'ruler', 'creator', 'dominant', 'submissive', 'maximalist', 'minimalist', 'inspiring', 'systematic', 'discovering', 'conservative', 'verifying', 'overlooking', 'sharpening', 'harmonic', 'empathic', 'matter_of_fact', 'brave', 'protective', 'generous', 'thrifty', 'favourable', 'balanced', 'sensuality', 'intelligent', 'believe', 'egocentric', 'allocentric']


177it [00:04, 40.95it/s]


In [20]:
# Out accuracy vector: number of test users whose prediction was correct, per trait.
# The builtin int replaces the np.int alias, which was removed in NumPy 1.24, and the
# length follows trait_list instead of a hard-coded 37.
test_total_accuracy = np.zeros(len(trait_list), dtype=int)

pbar = tqdm(arch_df.iterrows(), total=len(arch_df))
for n_seen, (idx, row) in enumerate(pbar, start=1):
    # Read every post file of the profile (everything that is not a .toml file)
    profile_path = os.path.join(BASE_DIR, idx)
    post_texts = []
    for file in os.listdir(profile_path):
        if not file.endswith(".toml"):
            with open(os.path.join(profile_path, file), "r") as post_f:
                post_texts.append(post_f.read())
    # Join with a newline: plain concatenation glues the last word of one post to the
    # first word of the next whenever a file does not end with whitespace
    user_text = "\n".join(post_texts)
    sim_output = get_trait_dot_product(user_text, softmax_word_map, softmax_word_df)
    user_accuracy = accuracy_per_trait(sim_output, row)
    test_total_accuracy += user_accuracy
    # Running mean over the users processed so far (n_seen replaces a users.index() scan)
    pbar.set_description(f"Average test dataset accuracy: {round(np.mean(np.divide(test_total_accuracy, n_seen))*100, 2)}")

# Show total accuracy: share of correctly predicted users per trait, and its mean
scaled_test_accuracy = np.divide(test_total_accuracy, len(arch_df))
avg_test_accuracy = np.mean(scaled_test_accuracy)

print("--- ACCURACY ON TESTING DATASET ---")

print(f"Average test dataset accuracy: {round(avg_test_accuracy*100, 2)}%")
print("Accuracy per trait:")
for trait_name, trait_accuracy in zip(trait_list, scaled_test_accuracy):
    print(f"{trait_name}: {round(trait_accuracy * 100, 2)}%")

  out_vec = pd.Series()
Average test dataset accuracy: 37.64: : 177it [15:25,  5.23s/it]

--- ACCURACY ON TESTING DATASET ---
Average test dataset accuracy: 37.64%
Accuracy per trait:
innocent: 33.9%
sage: 29.94%
explorer: 30.51%
outlaw: 37.29%
magician: 36.72%
hero: 55.37%
lover: 45.76%
jester: 53.67%
everyman: 24.86%
caregiver: 36.16%
ruler: 48.02%
creator: 29.38%
dominant: 22.03%
submissive: 33.9%
maximalist: 19.77%
minimalist: 18.64%
inspiring: 31.07%
systematic: 33.9%
discovering: 48.02%
conservative: 42.94%
verifying: 19.21%
overlooking: 11.86%
sharpening: 54.24%
harmonic: 36.16%
empathic: 41.24%
matter_of_fact: 43.5%
brave: 64.97%
protective: 43.5%
generous: 25.42%
thrifty: 54.24%
favourable: 59.89%
balanced: 28.81%
sensuality: 42.94%
intelligent: 18.64%
believe: 49.72%
egocentric: 51.41%
allocentric: 35.03%





## Regression model - testing dataset

In [21]:
# Methods
def get_trait_dot_product(post_text: str, word_map: list, word_dataframe: pd.DataFrame) -> pd.Series:
    """Score a text against every row of a word-frequency table.

    The text is cleaned with ``clean_up_text`` / ``remove_stopwords``, its hashtags
    are added, and the occurrences of every word listed in ``word_map`` are
    counted. The resulting count vector is multiplied with ``word_dataframe``.

    NOTE(review): this redefinition replaces the softmax variant defined earlier in
    the notebook with the plain dot-product variant.

    Args:
        post_text: Raw text to score.
        word_map: Words in the same order as the columns of ``word_dataframe``.
        word_dataframe: Table whose columns correspond to ``word_map``.

    Returns:
        The result of ``word_dataframe.dot(count_vector)`` - one value per row
        of ``word_dataframe``.
    """
    # Filter out the text
    filtered_post = remove_stopwords(clean_up_text(post_text))
    filtered_post += extract_hashtags(post_text)

    # Look-up table word -> position, built once per call. Testing `word in word_map`
    # and calling `word_map.index(word)` for every word scans the whole list each time.
    # setdefault keeps the first position of a repeated word, exactly like list.index.
    word_positions = {}
    for position, word in enumerate(word_map):
        word_positions.setdefault(word, position)

    # Create a vector for dot product: occurrences of each known word, 0 elsewhere
    post_vector = [0] * len(word_map)
    for word, freq in Counter(filtered_post).items():
        position = word_positions.get(word)
        if position is not None:
            post_vector[position] = freq

    # Calculate dot product for a given text
    return word_dataframe.dot(post_vector)

# Replace NaN with 0 in word_frequency_table
# NOTE(review): word_df was already filled with 0 in earlier cells, so this is a no-op
# unless NaN values were introduced in between
word_df = word_df.fillna(0)

# Method for calculating the dot product of trait <-> influencer relation
def get_influencer_dot_product(trait_output: list, influencer_dataframe: pd.DataFrame) -> pd.DataFrame:
    """Multiply the influencer table with a vector of trait scores.

    Args:
        trait_output: One score per column of ``influencer_dataframe``.
        influencer_dataframe: Table with one row per influencer.

    Returns:
        The result of ``influencer_dataframe.dot(trait_output)`` - one value
        per row of ``influencer_dataframe``.
    """
    influencer_scores = influencer_dataframe.dot(trait_output)
    return influencer_scores

# Method for calculating the similarity
def calculate_similarity(post_text: str, 
                         word_map: list, 
                         word_dataframe: pd.DataFrame,
                         influencer_dataframe: pd.DataFrame) -> pd.DataFrame:
    """Score a text against every influencer.

    The word-trait dot product of the text is computed first, then the
    trait-influencer dot product, and the latter is divided by the sum of each
    influencer's trait values (row sums of ``influencer_dataframe``).

    Args:
        post_text: Raw text to score.
        word_map: Words in the column order of ``word_dataframe``.
        word_dataframe: Word-frequency table.
        influencer_dataframe: Table with one row per influencer.

    Returns:
        The normalised dot product, one value per influencer.
    """
    # Word-trait scores of the text
    trait_scores = get_trait_dot_product(post_text, word_map, word_dataframe)

    # Trait-influencer dot product, divided by the row sums of the influencer traits
    return get_influencer_dot_product(trait_scores, influencer_dataframe).divide(
        influencer_dataframe.sum(axis=1)
    )

# Trait accuracy - round the results
def natural_round(x: float) -> int:
    """Round ``x`` to the nearest integer, with halves rounded upwards.

    The fractional part is measured from the floor of ``x``, so 2.5 becomes 3
    and -0.5 becomes 0.
    """
    floor_part = int(x // 1)
    if (x - floor_part) >= 0.5:
        return floor_part + 1
    return floor_part

def accuracy_per_trait(input_vector: pd.Series, annotated_vector: pd.Series) -> np.ndarray:
    """Mark, trait by trait, whether the rounded prediction equals the annotation.

    Values are compared by position. Each predicted value is rounded with
    ``natural_round`` before the comparison.

    Args:
        input_vector: Predicted value per trait.
        annotated_vector: Annotated value per trait, in the same order.

    Returns:
        Integer array with 1 where the rounded prediction matches the
        annotation and 0 elsewhere; its length follows the inputs instead of
        being fixed to 37.
    """
    # Plain arrays give positional access; integer `[]` on a label-indexed Series
    # relies on a deprecated positional fallback
    predicted = np.asarray(input_vector)
    expected = np.asarray(annotated_vector)
    hits = [int(natural_round(pred) == truth) for pred, truth in zip(predicted, expected)]
    # The builtin int replaces the np.int alias, which was removed in NumPy 1.24
    return np.array(hits, dtype=int)

In [22]:
# Out accuracy vector: number of test users whose prediction was correct, per trait.
# The builtin int replaces the np.int alias, which was removed in NumPy 1.24, and the
# length follows trait_list instead of a hard-coded 37.
test_reg_total_accuracy = np.zeros(len(trait_list), dtype=int)

pbar = tqdm(arch_df.iterrows(), total=len(arch_df))
for n_seen, (idx, row) in enumerate(pbar, start=1):
    # Read every post file of the profile (everything that is not a .toml file)
    profile_path = os.path.join(BASE_DIR, idx)
    post_texts = []
    for file in os.listdir(profile_path):
        if not file.endswith(".toml"):
            with open(os.path.join(profile_path, file), "r") as post_f:
                post_texts.append(post_f.read())
    # Join with a newline: plain concatenation glues the last word of one post to the
    # first word of the next whenever a file does not end with whitespace
    user_text = "\n".join(post_texts)
    sim_output = get_trait_dot_product(user_text, word_map, word_df)
    user_accuracy = accuracy_per_trait(sim_output, row)
    test_reg_total_accuracy += user_accuracy
    # Running mean over the users processed so far (n_seen replaces a users.index() scan)
    pbar.set_description(f"Average test dataset accuracy: {round(np.mean(np.divide(test_reg_total_accuracy, n_seen))*100, 2)}")

# Show total accuracy: share of correctly predicted users per trait, and its mean
scaled_reg_test_accuracy = np.divide(test_reg_total_accuracy, len(arch_df))
avg_reg_test_accuracy = np.mean(scaled_reg_test_accuracy)

print("--- ACCURACY ON TESTING DATASET ---")

print(f"Average test dataset accuracy: {round(avg_reg_test_accuracy*100, 2)}%")
print("Accuracy per trait:")
for trait_name, trait_accuracy in zip(trait_list, scaled_reg_test_accuracy):
    print(f"{trait_name}: {round(trait_accuracy * 100, 2)}%")

Average test dataset accuracy: 18.08: : 177it [14:44,  5.00s/it]

--- ACCURACY ON TESTING DATASET ---
Average test dataset accuracy: 18.08%
Accuracy per trait:
innocent: 3.95%
sage: 10.17%
explorer: 33.33%
outlaw: 28.25%
magician: 20.9%
hero: 24.29%
lover: 30.51%
jester: 31.07%
everyman: 6.78%
caregiver: 31.07%
ruler: 8.47%
creator: 7.91%
dominant: 3.95%
submissive: 35.03%
maximalist: 2.82%
minimalist: 36.72%
inspiring: 8.47%
systematic: 7.34%
discovering: 16.38%
conservative: 22.03%
verifying: 3.39%
overlooking: 30.51%
sharpening: 31.07%
harmonic: 6.21%
empathic: 16.95%
matter_of_fact: 12.99%
brave: 31.07%
protective: 35.03%
generous: 4.52%
thrifty: 27.12%
favourable: 25.99%
balanced: 22.6%
sensuality: 2.26%
intelligent: 4.52%
believe: 7.34%
egocentric: 19.21%
allocentric: 18.64%



