## Modify the softmax structure - weight scaling
Try to differentiate the similar weights, TF-IDF-like approach.
If the word is similarly occuring in all classes, it has to have the weight lowered.

Otherwise, if the probability for one class is significantly higher than for the other ones,
the weight is increased by a significant margin

In [1]:
# Load the structure
import pickle
import pandas as pd
import dask.dataframe as dd
from tqdm import tqdm
import numpy as np
import copy
import os
import toml
import re
import itertools
from text_cleaner import *
import operator
from collections import Counter

def extract_hashtags(post_text):
    HASH_RE = re.compile(r"\#\w+")
    out_list = re.findall(HASH_RE, post_text)
    return out_list

with open("softmax_full_word_trait_array.pickle", "rb") as f:
    word_df = pickle.load(f)

In [2]:
# Show the loaded DataFrame
word_df

Unnamed: 0,Unnamed: 1,zainspirowany,wczorajszym,wywiadem,odnośnie,relacji,chciałem,przekazać,okres,kwarantanny,świetny,...,#piekewdomu,#dish,#kremzcukinii,#zucchina,#pranzoitaliano,#tradycyjnejedzenie,#tatar,#meatlover,#zachcianki,#pierogizkapusta
innocent,0,0.000021,0.000043,2.125173e-05,0.000043,0.000149,0.000106,0.000064,0.000149,0.000149,0.000128,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
innocent,1,0.000000,0.000013,0.000000e+00,0.000013,0.000243,0.000013,0.000013,0.000040,0.000040,0.000013,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
innocent,2,0.000002,0.000019,0.000000e+00,0.000045,0.000112,0.000074,0.000045,0.000100,0.000045,0.000095,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
innocent,3,0.000006,0.000034,9.158188e-07,0.000037,0.000173,0.000026,0.000030,0.000132,0.000039,0.000071,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
innocent,4,0.000000,0.000016,0.000000e+00,0.000055,0.000079,0.000008,0.000040,0.000079,0.000047,0.000040,...,0.000040,0.000016,0.000008,0.000016,0.000008,0.000008,0.000008,0.000008,0.000008,0.000008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
allocentric,0,0.000016,0.000033,3.259984e-05,0.000049,0.000228,0.000081,0.000081,0.000081,0.000065,0.000081,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
allocentric,1,0.000009,0.000033,0.000000e+00,0.000033,0.000164,0.000021,0.000021,0.000270,0.000048,0.000061,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
allocentric,2,0.000000,0.000021,0.000000e+00,0.000036,0.000088,0.000047,0.000031,0.000109,0.000026,0.000109,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
allocentric,3,0.000005,0.000027,0.000000e+00,0.000046,0.000145,0.000046,0.000028,0.000085,0.000042,0.000076,...,0.000006,0.000002,0.000001,0.000002,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001


In [3]:
# Get all traits/archetypes
trait_list = word_df.index.values
trait_list = list(dict.fromkeys([x[0] for x in trait_list]))
print(trait_list)

['innocent', 'sage', 'explorer', 'outlaw', 'magician', 'hero', 'lover', 'jester', 'everyman', 'caregiver', 'ruler', 'creator', 'dominant', 'submissive', 'maximalist', 'minimalist', 'inspiring', 'systematic', 'discovering', 'conservative', 'verifying', 'overlooking', 'sharpening', 'harmonic', 'empathic', 'matter_of_fact', 'brave', 'protective', 'generous', 'thrifty', 'favourable', 'balanced', 'sensuality', 'intelligent', 'believe', 'egocentric', 'allocentric']


In [5]:
# Normalization method
def min_max_normalize(df_to_normalize):
    normalized_df = df_to_normalize
    cols = normalized_df.columns.tolist()
    for col in tqdm(cols):
        normalized_df[col] = (normalized_df[col] - normalized_df[col].min()) / (normalized_df[col].max() - normalized_df[col].min())
    return normalized_df

norm_list = []
for trait in trait_list:
    print(f"Trait: {trait}")
    normalized_trait = min_max_normalize(copy.deepcopy(word_df.loc[trait]))
    tmp_trait = normalized_trait
    norm_list.append(tmp_trait)
    
norm_word_df = pd.concat(norm_list, keys=trait_list)

  0%|          | 347/225423 [00:00<02:04, 1800.70it/s]

Trait: innocent


100%|██████████| 225423/225423 [01:44<00:00, 2164.71it/s]
  0%|          | 218/225423 [00:00<01:43, 2172.76it/s]

Trait: sage


100%|██████████| 225423/225423 [01:43<00:00, 2175.86it/s]
  0%|          | 412/225423 [00:00<01:48, 2068.53it/s]

Trait: explorer


100%|██████████| 225423/225423 [01:45<00:00, 2146.04it/s]
  0%|          | 450/225423 [00:00<01:44, 2153.69it/s]

Trait: outlaw


100%|██████████| 225423/225423 [01:44<00:00, 2158.54it/s]
  0%|          | 440/225423 [00:00<01:44, 2159.07it/s]

Trait: magician


100%|██████████| 225423/225423 [01:43<00:00, 2174.18it/s]
  0%|          | 448/225423 [00:00<01:42, 2191.73it/s]

Trait: hero


100%|██████████| 225423/225423 [01:42<00:00, 2198.41it/s]
  0%|          | 227/225423 [00:00<01:39, 2266.10it/s]

Trait: lover


100%|██████████| 225423/225423 [01:41<00:00, 2221.12it/s]
  0%|          | 444/225423 [00:00<01:41, 2214.89it/s]

Trait: jester


100%|██████████| 225423/225423 [01:40<00:00, 2245.74it/s]
  0%|          | 440/225423 [00:00<01:43, 2175.43it/s]

Trait: everyman


100%|██████████| 225423/225423 [01:40<00:00, 2244.27it/s]
  0%|          | 221/225423 [00:00<01:42, 2206.15it/s]

Trait: caregiver


100%|██████████| 225423/225423 [01:40<00:00, 2251.70it/s]
  0%|          | 460/225423 [00:00<01:39, 2259.26it/s]

Trait: ruler


100%|██████████| 225423/225423 [01:39<00:00, 2255.59it/s]
  0%|          | 235/225423 [00:00<01:36, 2345.29it/s]

Trait: creator


100%|██████████| 225423/225423 [01:39<00:00, 2255.05it/s]
  0%|          | 228/225423 [00:00<01:38, 2279.49it/s]

Trait: dominant


100%|██████████| 225423/225423 [01:39<00:00, 2255.58it/s]
  0%|          | 468/225423 [00:00<01:39, 2264.91it/s]

Trait: submissive


100%|██████████| 225423/225423 [01:40<00:00, 2252.16it/s]
  0%|          | 226/225423 [00:00<01:39, 2253.96it/s]

Trait: maximalist


100%|██████████| 225423/225423 [01:40<00:00, 2252.16it/s]
  0%|          | 230/225423 [00:00<01:38, 2296.75it/s]

Trait: minimalist


100%|██████████| 225423/225423 [01:40<00:00, 2251.28it/s]
  0%|          | 445/225423 [00:00<01:40, 2232.69it/s]

Trait: inspiring


100%|██████████| 225423/225423 [01:40<00:00, 2239.03it/s]
  0%|          | 464/225423 [00:00<01:40, 2239.83it/s]

Trait: systematic


100%|██████████| 225423/225423 [01:43<00:00, 2172.21it/s]
  0%|          | 415/225423 [00:00<01:48, 2071.33it/s]

Trait: discovering


100%|██████████| 225423/225423 [01:46<00:00, 2111.10it/s]
  0%|          | 225/225423 [00:00<01:40, 2241.89it/s]

Trait: conservative


100%|██████████| 225423/225423 [01:42<00:00, 2199.86it/s]
  0%|          | 217/225423 [00:00<01:43, 2168.46it/s]

Trait: verifying


100%|██████████| 225423/225423 [01:45<00:00, 2129.92it/s]
  0%|          | 444/225423 [00:00<01:41, 2207.01it/s]

Trait: overlooking


100%|██████████| 225423/225423 [01:42<00:00, 2199.60it/s]
  0%|          | 436/225423 [00:00<01:43, 2169.13it/s]

Trait: sharpening


100%|██████████| 225423/225423 [01:43<00:00, 2183.88it/s]
  0%|          | 234/225423 [00:00<01:36, 2332.13it/s]

Trait: harmonic


100%|██████████| 225423/225423 [01:48<00:00, 2076.15it/s]
  0%|          | 206/225423 [00:00<01:50, 2032.86it/s]

Trait: empathic


100%|██████████| 225423/225423 [01:49<00:00, 2066.27it/s]
  0%|          | 207/225423 [00:00<01:48, 2067.99it/s]

Trait: matter_of_fact


100%|██████████| 225423/225423 [01:53<00:00, 1984.11it/s]
  0%|          | 414/225423 [00:00<01:48, 2064.41it/s]

Trait: brave


100%|██████████| 225423/225423 [01:51<00:00, 2016.89it/s]
  0%|          | 204/225423 [00:00<01:50, 2035.29it/s]

Trait: protective


100%|██████████| 225423/225423 [01:51<00:00, 2027.52it/s]
  0%|          | 208/225423 [00:00<01:48, 2073.48it/s]

Trait: generous


100%|██████████| 225423/225423 [01:51<00:00, 2023.34it/s]
  0%|          | 419/225423 [00:00<01:49, 2058.42it/s]

Trait: thrifty


100%|██████████| 225423/225423 [01:54<00:00, 1968.28it/s]
  0%|          | 200/225423 [00:00<01:53, 1992.54it/s]

Trait: favourable


100%|██████████| 225423/225423 [01:55<00:00, 1957.87it/s]
  0%|          | 50/225423 [00:00<07:32, 498.14it/s]

Trait: balanced


100%|██████████| 225423/225423 [01:56<00:00, 1934.53it/s]
  0%|          | 412/225423 [00:00<01:50, 2030.89it/s]

Trait: sensuality


100%|██████████| 225423/225423 [01:53<00:00, 1989.69it/s]
  0%|          | 390/225423 [00:00<01:55, 1946.31it/s]

Trait: intelligent


100%|██████████| 225423/225423 [01:54<00:00, 1967.44it/s]
  0%|          | 210/225423 [00:00<01:47, 2094.40it/s]

Trait: believe


100%|██████████| 225423/225423 [01:51<00:00, 2016.15it/s]
  0%|          | 200/225423 [00:00<01:52, 1995.51it/s]

Trait: egocentric


100%|██████████| 225423/225423 [01:48<00:00, 2083.02it/s]
  0%|          | 231/225423 [00:00<01:37, 2305.18it/s]

Trait: allocentric


100%|██████████| 225423/225423 [01:45<00:00, 2130.09it/s]


In [6]:
norm_word_df

Unnamed: 0,Unnamed: 1,zainspirowany,wczorajszym,wywiadem,odnośnie,relacji,chciałem,przekazać,okres,kwarantanny,świetny,...,#piekewdomu,#dish,#kremzcukinii,#zucchina,#pranzoitaliano,#tradycyjnejedzenie,#tatar,#meatlover,#zachcianki,#pierogizkapusta
innocent,0,1.000000,1.000000,1.000000,0.693486,0.426116,1.000000,1.000000,1.000000,1.000000,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
innocent,1,0.000000,0.000000,0.000000,0.000000,1.000000,0.056698,0.000000,0.000000,0.009714,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
innocent,2,0.111937,0.191227,0.000000,0.757875,0.200239,0.669417,0.630885,0.549016,0.053189,0.716246,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
innocent,3,0.301657,0.703053,0.043094,0.553207,0.574807,0.180353,0.332996,0.844126,0.000000,0.508228,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
innocent,4,0.000000,0.080213,0.000000,1.000000,0.000000,0.000000,0.517986,0.356370,0.073563,0.228374,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
allocentric,0,1.000000,0.942079,1.000000,1.000000,1.000000,1.000000,1.000000,0.123193,1.000000,0.459871,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
allocentric,1,0.557682,1.000000,0.000000,0.322751,0.539284,0.076809,0.000000,1.000000,0.574663,0.045621,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
allocentric,2,0.000000,0.000000,0.000000,0.449722,0.000000,0.465693,0.163556,0.250160,0.000000,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
allocentric,3,0.283110,0.461321,0.000000,0.880241,0.409001,0.458651,0.107442,0.141233,0.397878,0.353671,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
from scipy.special import softmax

def get_trait_dot_product(post_text: str, word_map: list, word_dataframe: pd.DataFrame) -> list:
    # Filter out the text
    filtered_post = remove_stopwords(clean_up_text(post_text))
    filtered_post += extract_hashtags(post_text)
    
    # Create a vector for dot product vector
    post_vector = [0] * len(word_map)
    
    # Calculate word occurrences
    word_ctr = Counter(filtered_post)
    
    for word, freq in word_ctr.items():
        if word in word_map:
            post_vector[word_map.index(word)] = freq
    
    # Calculate dot product for a given text
    word_dot = word_dataframe.dot(post_vector)
    
    out_vec = pd.Series()
    for trait in trait_list:
        out_vec = out_vec.append(pd.Series([np.argmax(softmax(word_dot.loc[trait]))], index=[trait]))
    
    return out_vec

# Trait accuracy - round the results
def natural_round(x: float) -> int:
    out = int(x // 1)
    return out + 1 if (x - out) >= 0.5 else out

def accuracy_per_trait(input_vector: pd.Series, annotated_vector: pd.Series) -> np.array:
    out_array = np.array([0] * 37, dtype=np.int)
    for i in range(len(out_array)):
        if input_vector[i] == annotated_vector[i]:
            out_array[i] = 1
    return out_array

In [8]:
# Load the .csv with archetypes
arch_df = pd.read_csv('archetypes_pl_new.csv', index_col=0)

# Save the order of columns
trait_list = arch_df.columns.tolist()

# Show the table header and column list
print(trait_list)
arch_df.head()

['innocent', 'sage', 'explorer', 'outlaw', 'magician', 'hero', 'lover', 'jester', 'everyman', 'caregiver', 'ruler', 'creator', 'dominant', 'submissive', 'maximalist', 'minimalist', 'inspiring', 'systematic', 'discovering', 'conservative', 'verifying', 'overlooking', 'sharpening', 'harmonic', 'empathic', 'matter_of_fact', 'brave', 'protective', 'generous', 'thrifty', 'favourable', 'balanced', 'sensuality', 'intelligent', 'believe', 'egocentric', 'allocentric']


Unnamed: 0_level_0,innocent,sage,explorer,outlaw,magician,hero,lover,jester,everyman,caregiver,...,protective,generous,thrifty,favourable,balanced,sensuality,intelligent,believe,egocentric,allocentric
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
marek_grodzki,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,2.0,3.0,4.0,4.0,3.0,4.0,4.0,3.0,0.0,0.0
vege_style_life,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,4.0,4.0,4.0,3.0,3.0,3.0,2.0,0.0,3.0
oliwka__2007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,2.0,2.0,0.0,3.0,1.0,2.0,4.0,1.0,0.0,3.0
z_przestrzeni_serca,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,3.0,0.0,4.0,4.0,3.0,4.0,4.0,0.0,1.0
zaradne_warsztaty,3.0,0.0,0.0,0.0,3.0,0.0,0.0,2.0,3.0,4.0,...,3.0,4.0,0.0,2.0,2.0,4.0,2.0,3.0,1.0,3.0


In [9]:
# Table preprocessing - replace all NaN with 2 (Unrelated/Don't know class), replace 0-5 values with the ones in range -1.0 - 1.0
arch_df = arch_df.fillna(2)

# Remove duplicated annotations, to exclude conflicting entries
arch_df = arch_df[~arch_df.index.duplicated(keep='first')]

# Print the head of the dataset after modification
arch_df.head()

Unnamed: 0_level_0,innocent,sage,explorer,outlaw,magician,hero,lover,jester,everyman,caregiver,...,protective,generous,thrifty,favourable,balanced,sensuality,intelligent,believe,egocentric,allocentric
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
marek_grodzki,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,2.0,3.0,4.0,4.0,3.0,4.0,4.0,3.0,0.0,0.0
vege_style_life,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,4.0,4.0,4.0,3.0,3.0,3.0,2.0,0.0,3.0
oliwka__2007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,2.0,2.0,0.0,3.0,1.0,2.0,4.0,1.0,0.0,3.0
z_przestrzeni_serca,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,3.0,0.0,4.0,4.0,3.0,4.0,4.0,0.0,1.0
zaradne_warsztaty,3.0,0.0,0.0,0.0,3.0,0.0,0.0,2.0,3.0,4.0,...,3.0,4.0,0.0,2.0,2.0,4.0,2.0,3.0,1.0,3.0


In [10]:
# Check if a user has a non-empty directory in the dataset, otherwise delete the user from the list
available_arch_df = copy.deepcopy(arch_df)
posts = []

BASE_DIR = "instagram_cleared"

# Iterate over whole DataFrame
for i, row in tqdm(arch_df.iterrows()):
    profile_posts = []
    profile_hashtags = []
    
    # Get all posts per profile
    profile_path = os.path.join(BASE_DIR, i)
    for file in os.listdir(profile_path):
        if not file.endswith(".toml"):
            with open(os.path.join(profile_path, file), "r") as post_f:
                read_text = post_f.read()
                profile_posts.append(remove_stopwords(clean_up_text(read_text)))
                profile_hashtags.append(extract_hashtags(read_text))

    # Merge lists - a single list for a single influencer
    profile_hashtags = list(itertools.chain.from_iterable(profile_hashtags))
    posts.append(list(itertools.chain.from_iterable([profile_posts, [profile_hashtags]])))

685it [00:20, 34.13it/s]


In [11]:
# Map usernames to indices
users = list(available_arch_df.index.values)
user_indices = {k: users.index(k) for k in users}

In [12]:
# Create word map
norm_word_map = norm_word_df.columns.tolist()

In [13]:
pbar = tqdm(arch_df.iterrows())
accuracy = 0

# Out accuracy vector
total_accuracy = np.array([0] * 37, dtype=np.int)

for idx, row in pbar:
    user_text = list(itertools.chain.from_iterable(posts[users.index(idx)]))
    user_text = " ".join(user_text)
    sim_output = get_trait_dot_product(user_text, norm_word_map, norm_word_df)
    user_accuracy = accuracy_per_trait(sim_output, row)
    total_accuracy += user_accuracy
    pbar.set_description(f"Average accuracy: {round(np.mean(np.divide(total_accuracy, users.index(idx)+1))*100, 2)}")

  out_vec = pd.Series()
Average accuracy: 97.8: : 685it [20:23,  1.79s/it] 


In [14]:
# Show total accuracy
scaled_accuracy = np.divide(total_accuracy, len(arch_df))
avg_accuracy = np.mean(scaled_accuracy)

print("--- ACCURACY ON WHOLE DATASET ---")

print(f"Average dataset accuracy: {round(avg_accuracy*100, 2)}%")
print("Accuracy per trait:")
for i in range(len(trait_list)):
    print(f"{trait_list[i]}: {round(scaled_accuracy[i] * 100, 2)}%")

--- ACCURACY ON WHOLE DATASET ---
Average dataset accuracy: 97.8%
Accuracy per trait:
innocent: 98.54%
sage: 97.52%
explorer: 97.66%
outlaw: 99.56%
magician: 99.56%
hero: 98.98%
lover: 98.69%
jester: 97.96%
everyman: 96.06%
caregiver: 98.39%
ruler: 96.64%
creator: 97.96%
dominant: 94.6%
submissive: 98.25%
maximalist: 97.37%
minimalist: 97.52%
inspiring: 99.27%
systematic: 95.47%
discovering: 97.37%
conservative: 99.85%
verifying: 95.18%
overlooking: 96.35%
sharpening: 98.54%
harmonic: 98.83%
empathic: 99.27%
matter_of_fact: 94.01%
brave: 98.25%
protective: 98.1%
generous: 99.27%
thrifty: 99.42%
favourable: 98.25%
balanced: 98.98%
sensuality: 98.1%
intelligent: 95.18%
believe: 97.08%
egocentric: 98.54%
allocentric: 98.1%


In [16]:
# Save AGDS to file
norm_word_df.to_pickle("softmax_norm_full_word_trait_array.pickle")