In [1]:
# Associate words with archetypes/character traits as intermediate layer
# and with influencer as the "last" layer

# Dependencies
import pandas as pd
import dask.dataframe as dd
import numpy as np
from tqdm.notebook import trange, tqdm
import copy
import os
import toml
import re
import itertools
from text_cleaner import *
import operator
from collections import Counter
import pickle

def extract_hashtags(post_text):
    """Return every hashtag found in ``post_text``.

    A hashtag is a ``#`` immediately followed by one or more word characters.
    Matches are returned as a list of strings (including the leading ``#``),
    in order of appearance; an empty list is returned when there are none.
    """
    hashtag_pattern = r"\#\w+"
    return re.findall(hashtag_pattern, post_text)

In [2]:
# Read the archetype/trait annotations; the first CSV column is used as the index
arch_df = pd.read_csv('archetypes_pl_new.csv', index_col=0)

# Remember the column order so later per-trait results can be matched by position
trait_list = list(arch_df.columns)

# Print the trait names, then preview the first rows of the table
print(trait_list)
arch_df.head()

['innocent', 'sage', 'explorer', 'outlaw', 'magician', 'hero', 'lover', 'jester', 'everyman', 'caregiver', 'ruler', 'creator', 'dominant', 'submissive', 'maximalist', 'minimalist', 'inspiring', 'systematic', 'discovering', 'conservative', 'verifying', 'overlooking', 'sharpening', 'harmonic', 'empathic', 'matter_of_fact', 'brave', 'protective', 'generous', 'thrifty', 'favourable', 'balanced', 'sensuality', 'intelligent', 'believe', 'egocentric', 'allocentric']


Unnamed: 0_level_0,innocent,sage,explorer,outlaw,magician,hero,lover,jester,everyman,caregiver,...,protective,generous,thrifty,favourable,balanced,sensuality,intelligent,believe,egocentric,allocentric
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
marek_grodzki,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,2.0,3.0,4.0,4.0,3.0,4.0,4.0,3.0,0.0,0.0
vege_style_life,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,4.0,4.0,4.0,3.0,3.0,3.0,2.0,0.0,3.0
oliwka__2007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,2.0,2.0,0.0,3.0,1.0,2.0,4.0,1.0,0.0,3.0
z_przestrzeni_serca,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,3.0,0.0,4.0,4.0,3.0,4.0,4.0,0.0,1.0
zaradne_warsztaty,3.0,0.0,0.0,0.0,3.0,0.0,0.0,2.0,3.0,4.0,...,3.0,4.0,0.0,2.0,2.0,4.0,2.0,3.0,1.0,3.0


In [3]:
# Table preprocessing - missing annotations become 2 (Unrelated/Don't know class).
# NOTE(review): no rescaling happens here; the annotation values keep their original scale.
filled_df = arch_df.fillna(2)

# Keep only the first annotation of every profile, to exclude conflicting entries
is_repeated_profile = filled_df.index.duplicated(keep='first')
arch_df = filled_df[~is_repeated_profile]

# Preview the table after the clean-up
arch_df.head()

Unnamed: 0_level_0,innocent,sage,explorer,outlaw,magician,hero,lover,jester,everyman,caregiver,...,protective,generous,thrifty,favourable,balanced,sensuality,intelligent,believe,egocentric,allocentric
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
marek_grodzki,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,2.0,3.0,4.0,4.0,3.0,4.0,4.0,3.0,0.0,0.0
vege_style_life,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,4.0,4.0,4.0,3.0,3.0,3.0,2.0,0.0,3.0
oliwka__2007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,2.0,2.0,0.0,3.0,1.0,2.0,4.0,1.0,0.0,3.0
z_przestrzeni_serca,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,3.0,0.0,4.0,4.0,3.0,4.0,4.0,0.0,1.0
zaradne_warsztaty,3.0,0.0,0.0,0.0,3.0,0.0,0.0,2.0,3.0,4.0,...,3.0,4.0,0.0,2.0,2.0,4.0,2.0,3.0,1.0,3.0


In [4]:
# Split dataset into train/test, in 90-10% proportion
train_df = arch_df.sample(frac=0.9, random_state=42)
# Shuffle the held-out rows with a fixed seed as well, so that the order of the
# test set (and everything derived from it) is reproducible between runs.
test_df = arch_df.drop(train_df.index).sample(frac=1.0, random_state=42)

# Directory that holds one sub-directory of post files per profile
BASE_DIR = "instagram_cleared"

def generate_dataset(dataset_frame, base_dir=None, encoding="utf-8"):
    """Load and tokenise all posts of every profile listed in ``dataset_frame``.

    Parameters
    ----------
    dataset_frame : pd.DataFrame
        Frame whose index values are profile names; each name must be a
        sub-directory of ``base_dir``.
    base_dir : str, optional
        Root directory with the profile sub-directories. Defaults to the
        module-level ``BASE_DIR``.
    encoding : str, optional
        Text encoding used to read the post files. It is given explicitly so
        the result does not depend on the platform default encoding.

    Returns
    -------
    list
        One entry per profile, in the row order of ``dataset_frame``. Each
        entry is a list holding the cleaned output of every post followed by
        one final list with all hashtags of that profile.

    Raises
    ------
    OSError
        If a profile directory or a post file cannot be read.
    """
    root_dir = BASE_DIR if base_dir is None else base_dir
    posts = []

    # Only the index (profile name) is needed; iterating it also lets tqdm show a total
    for profile_name in tqdm(dataset_frame.index):
        profile_posts = []
        profile_hashtags = []

        profile_path = os.path.join(root_dir, profile_name)
        # os.listdir returns entries in arbitrary order; sort for a deterministic result
        for file_name in sorted(os.listdir(profile_path)):
            # .toml files are skipped; every other file is read as a post
            if file_name.endswith(".toml"):
                continue
            with open(os.path.join(profile_path, file_name), "r", encoding=encoding) as post_f:
                read_text = post_f.read()
            profile_posts.append(remove_stopwords(clean_up_text(read_text)))
            profile_hashtags.extend(extract_hashtags(read_text))

        # A single list per influencer: all posts, then one list with all hashtags
        posts.append(profile_posts + [profile_hashtags])
    return posts

# Build the post lists for both splits (train first, then test); each list is
# ordered like the rows of the frame it was generated from.
train_posts, test_posts = map(generate_dataset, (train_df, test_df))

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [5]:
# Map training usernames to indices
users = list(train_df.index.values)
# One pass with enumerate instead of a linear users.index(k) search per user.
# Index labels are unique (duplicates are dropped during preprocessing), so the
# mapping is identical to the first-occurrence lookup.
user_indices = {username: position for position, username in enumerate(users)}

In [6]:
# AGDS - discrete approach

def merge_dicts(dict_a, dict_b) -> dict:
    """Add the values of ``dict_b`` into ``dict_a`` key by key.

    Keys present in both mappings are combined with ``+=``; keys only in
    ``dict_b`` are inserted. ``dict_a`` is modified in place and the very same
    object is returned - no copy is made.
    """
    merged = dict_a
    for key in dict_b:
        addend = dict_b[key]
        if key not in merged.keys():
            merged[key] = addend
            continue
        merged[key] += addend
    return merged

def min_max_normalize(df_to_normalize):
    """Min-max normalise every row of ``df_to_normalize`` independently.

    Each row is rescaled to ``(value - row_min) / (row_max - row_min)``.
    NaN cells are ignored when the row minimum/maximum are computed and stay
    NaN in the result. A row whose values are all equal has a zero range and
    therefore comes out as NaN.

    The computation is vectorised, works for any number of rows (not only 5)
    and returns a new frame; the input frame is left unmodified.

    Parameters
    ----------
    df_to_normalize : pd.DataFrame
        Numeric frame to normalise row by row.

    Returns
    -------
    pd.DataFrame
        Frame with the same index and columns holding the normalised values.
    """
    row_min = df_to_normalize.min(axis=1)
    row_range = df_to_normalize.max(axis=1) - row_min
    # axis=0 aligns the per-row statistics with the row index
    return df_to_normalize.sub(row_min, axis=0).div(row_range, axis=0)
    

# Iterate over all of the traits/archetypes
word_set = set()
trait_df_list = []

# Word counts of a profile do not depend on the trait, so they are computed
# once here instead of once per (trait, class) pair.
user_word_counts = [
    Counter(itertools.chain.from_iterable(post_set)) for post_set in train_posts
]

for trait in tqdm(trait_list, position=0):
    # Annotated class of every training influencer for this trait
    subset_df = train_df[trait]
    class_frames = []
    for trait_class in range(5):
        class_df = subset_df.loc[subset_df == trait_class]
        subset_indices = [user_indices[idx] for idx in class_df.index.values]

        # Sum the word counts of all influencers in this class. Plain indexing is
        # used on purpose: operator.itemgetter(*indices) returns a bare item (not a
        # tuple) for a single index and raises TypeError for an empty index list.
        class_counts = Counter()
        for user_position in subset_indices:
            class_counts.update(user_word_counts[user_position])
        trait_total = sum(class_counts.values())

        # Relative frequency of each word within the class; the dict is empty
        # (and no division happens) when the class has no words at all.
        out_dict = {k: float(v / trait_total) for k, v in class_counts.items()}
        word_set.update(out_dict.keys())

        # One row labelled with the class id; an empty class yields an all-NaN row
        class_frames.append(pd.DataFrame([out_dict], index=[trait_class]))

    # DataFrame.append was removed in pandas 2.0 - concatenate the rows once instead
    trait_subframe = min_max_normalize(pd.concat(class_frames))
    trait_df_list.append(trait_subframe)

# Rows are indexed by (trait, class), columns by word/hashtag
softmax_word_df = pd.concat(trait_df_list, keys=trait_list)

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

In [7]:
# Check the calculation results: rows are (trait, class) pairs, columns are words/hashtags.
# NaN marks a word that never occurred for that class.
softmax_word_df

Unnamed: 0,Unnamed: 1,good,morning,wish,you,nice,relaxing,thursday,saturday,friday,start,...,#używane,#podzielnia,#ekobiuro,#goHi2020,#hackaton,#jachranka,#greenladies,#greenguys,#polishheroes,#bestgifts
innocent,0,0.435185,0.194444,0.314815,0.611111,0.342593,0.185185,0.037037,0.027778,0.009259,0.120370,...,,,,,,,,,,
innocent,1,0.074286,0.068571,0.000000,0.605714,0.000000,,,,0.022857,0.051429,...,,,,,,,,,,
innocent,2,0.052790,0.013575,0.004525,0.840121,0.021116,0.000000,0.000000,0.003017,0.007541,0.064857,...,,,,,,,,,,
innocent,3,0.032024,0.015250,0.004575,0.203965,0.013343,0.000000,0.001525,0.004575,0.008387,0.021350,...,,,,,,,,,,
innocent,4,0.007326,0.000000,,0.073260,0.007326,,,,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
allocentric,0,0.045226,0.040201,,0.291457,0.015075,,,0.005025,0.000000,0.005025,...,,,,,,,,,,
allocentric,1,0.111724,0.038621,0.052414,0.477241,0.068966,0.028966,0.012414,0.008276,0.009655,0.034483,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
allocentric,2,0.034682,0.026012,0.000000,0.141618,0.002890,,,,0.002890,0.023121,...,,,,,,,,,,
allocentric,3,0.024723,0.015136,0.004541,0.189707,0.014127,,,0.004036,0.010091,0.027750,...,,,,,,,,,,


In [8]:
# Fill NaN values with 0
softmax_word_df = softmax_word_df.fillna(0)

# to_pickle does not create missing parent directories, so make sure the
# output directory exists before writing.
os.makedirs("agds_structures", exist_ok=True)
softmax_word_df.to_pickle("agds_structures/class_x_normalized_agds.pickle")

In [9]:
# Word map for the softmax structure: the column labels of the frame, in column order
softmax_word_map = list(softmax_word_df.columns)

In [10]:
from scipy.special import softmax

def get_trait_dot_product(post_text: str, word_map: list, word_dataframe: pd.DataFrame) -> pd.Series:
    """Predict the class of every trait for a piece of text.

    The text is cleaned and tokenised, its hashtags are added, and the word
    occurrences are turned into a count vector laid out like ``word_map``.
    The vector is multiplied with ``word_dataframe`` and, for every trait in
    the module-level ``trait_list``, the position of the highest-scoring row
    of that trait is selected.

    Parameters
    ----------
    post_text : str
        Raw text to classify.
    word_map : list
        Words in the same order as the columns of ``word_dataframe``.
    word_dataframe : pd.DataFrame
        Weights with one column per word; rows are selected per trait with
        ``.loc[trait]``.

    Returns
    -------
    pd.Series
        Indexed by trait name; each value is the position of the winning row
        within that trait's rows.
    """
    # Filter out the text
    filtered_post = remove_stopwords(clean_up_text(post_text))
    filtered_post += extract_hashtags(post_text)

    # Word -> column position lookup, built once per call; this replaces a linear
    # `in` / `.index()` scan of the list for every distinct word in the text.
    # setdefault keeps the first position if a word were ever listed twice,
    # which is what list.index() returned.
    word_position = {}
    for position, known_word in enumerate(word_map):
        word_position.setdefault(known_word, position)

    # Count vector of the text; words missing from the map are ignored
    post_vector = np.zeros(len(word_map))
    for word, freq in Counter(filtered_post).items():
        position = word_position.get(word)
        if position is not None:
            post_vector[position] = freq

    # Calculate dot product for a given text: one score per row of the frame
    word_dot = word_dataframe.dot(post_vector)

    # Build the result in one go; Series.append was removed in pandas 2.0 and an
    # empty pd.Series() without dtype triggers a warning.
    return pd.Series(
        {
            trait: int(np.argmax(softmax(word_dot.loc[trait].to_numpy())))
            for trait in trait_list
        },
        dtype="int64",
    )

# Trait accuracy - round the results
def natural_round(x: float) -> int:
    """Round ``x`` to the nearest integer, with a fractional part of .5 going up.

    The value is floored first; one is added when the distance from the floor
    is at least 0.5.
    """
    floor_part = int(x // 1)
    if (x - floor_part) >= 0.5:
        return floor_part + 1
    return floor_part

def accuracy_per_trait(input_vector: pd.Series, annotated_vector: pd.Series) -> np.ndarray:
    """Compare predicted and annotated classes position by position.

    Parameters
    ----------
    input_vector : pd.Series
        Predicted class per trait.
    annotated_vector : pd.Series
        Annotated class per trait, in the same order as ``input_vector``.

    Returns
    -------
    np.ndarray
        Integer array with one element per trait: 1 where the two values are
        equal, 0 otherwise.

    Raises
    ------
    ValueError
        If the two vectors do not have the same length.
    """
    # Compare by position, not by label: converting to arrays avoids integer
    # keys being interpreted as labels on string-indexed Series.
    predicted = np.asarray(input_vector)
    annotated = np.asarray(annotated_vector)
    if predicted.shape != annotated.shape:
        raise ValueError(
            f"Vector lengths differ: {predicted.shape} vs {annotated.shape}"
        )
    # Built-in int instead of the np.int alias, which newer NumPy no longer provides
    return (predicted == annotated).astype(int)

In [11]:
# Evaluate the model on every training profile and accumulate per-trait hits
pbar = tqdm(train_df.iterrows(), total=len(train_df))
accuracy = 0

# Out accuracy vector - one counter per trait (built-in int: the np.int alias
# is not available in newer NumPy releases)
total_accuracy = np.zeros(len(trait_list), dtype=int)

for processed, (idx, row) in enumerate(pbar, start=1):
    # Dictionary lookup instead of a linear users.index(idx) search per profile
    user_text = " ".join(itertools.chain.from_iterable(train_posts[user_indices[idx]]))
    sim_output = get_trait_dot_product(user_text, softmax_word_map, softmax_word_df)
    user_accuracy = accuracy_per_trait(sim_output, row)
    total_accuracy += user_accuracy
    # Running average over the profiles processed so far
    pbar.set_description(f"Average accuracy: {round(np.mean(np.divide(total_accuracy, processed))*100, 2)}")

0it [00:00, ?it/s]

  out_vec = pd.Series()


In [12]:
# Turn the per-trait hit counters into fractions of the training set
scaled_train_accuracy = total_accuracy / len(train_df)
avg_train_accuracy = scaled_train_accuracy.mean()

# Report the overall figure first, then one line per trait
print("--- ACCURACY ON TRAINING DATASET ---")
print(f"Average train dataset accuracy: {round(avg_train_accuracy*100, 2)}%")
print("Accuracy per trait:")
for trait_name, trait_accuracy in zip(trait_list, scaled_train_accuracy):
    print(f"{trait_name}: {round(trait_accuracy * 100, 2)}%")

--- ACCURACY ON TRAINING DATASET ---
Average train dataset accuracy: 44.76%
Accuracy per trait:
innocent: 34.25%
sage: 31.17%
explorer: 68.02%
outlaw: 46.27%
magician: 46.1%
hero: 40.91%
lover: 32.95%
jester: 41.88%
everyman: 55.84%
caregiver: 43.18%
ruler: 46.59%
creator: 57.14%
dominant: 42.05%
submissive: 34.42%
maximalist: 64.29%
minimalist: 51.79%
inspiring: 36.36%
systematic: 32.79%
discovering: 64.29%
conservative: 45.62%
verifying: 71.75%
overlooking: 33.6%
sharpening: 30.03%
harmonic: 25.16%
empathic: 37.01%
matter_of_fact: 39.45%
brave: 47.08%
protective: 48.38%
generous: 29.22%
thrifty: 54.06%
favourable: 36.36%
balanced: 51.14%
sensuality: 51.46%
intelligent: 58.6%
believe: 53.57%
egocentric: 39.94%
allocentric: 33.44%


In [13]:
# Set up environment for test dataset
test_users = list(test_df.index.values)
# One pass with enumerate instead of a linear test_users.index(k) search per user.
# Index labels are unique (duplicates are dropped during preprocessing), so the
# mapping is identical to the first-occurrence lookup.
test_user_indices = {username: position for position, username in enumerate(test_users)}

In [14]:
# Evaluate the model on every held-out profile and accumulate per-trait hits
pbar = tqdm(test_df.iterrows(), total=len(test_df))
accuracy = 0

# Out accuracy vector - one counter per trait (built-in int: the np.int alias
# is not available in newer NumPy releases)
test_total_accuracy = np.zeros(len(trait_list), dtype=int)

for processed, (idx, row) in enumerate(pbar, start=1):
    # Dictionary lookup instead of a linear test_users.index(idx) search per profile
    user_text = " ".join(itertools.chain.from_iterable(test_posts[test_user_indices[idx]]))
    sim_output = get_trait_dot_product(user_text, softmax_word_map, softmax_word_df)
    user_accuracy = accuracy_per_trait(sim_output, row)
    test_total_accuracy += user_accuracy
    # Running average over the profiles processed so far
    pbar.set_description(f"Average accuracy: {round(np.mean(np.divide(test_total_accuracy, processed))*100, 2)}")

0it [00:00, ?it/s]

  out_vec = pd.Series()


In [15]:
# Turn the per-trait hit counters into fractions of the test set
scaled_test_accuracy = test_total_accuracy / len(test_df)
avg_test_accuracy = scaled_test_accuracy.mean()

# Report the overall figure first, then one line per trait
print("--- ACCURACY ON TESTING DATASET ---")
print(f"Average test dataset accuracy: {round(avg_test_accuracy*100, 2)}%")
print("Accuracy per trait:")
for trait_name, trait_accuracy in zip(trait_list, scaled_test_accuracy):
    print(f"{trait_name}: {round(trait_accuracy * 100, 2)}%")

--- ACCURACY ON TESTING DATASET ---
Average test dataset accuracy: 20.21%
Accuracy per trait:
innocent: 18.84%
sage: 18.84%
explorer: 31.88%
outlaw: 20.29%
magician: 24.64%
hero: 13.04%
lover: 11.59%
jester: 18.84%
everyman: 33.33%
caregiver: 27.54%
ruler: 20.29%
creator: 23.19%
dominant: 23.19%
submissive: 10.14%
maximalist: 36.23%
minimalist: 11.59%
inspiring: 13.04%
systematic: 17.39%
discovering: 40.58%
conservative: 15.94%
verifying: 59.42%
overlooking: 17.39%
sharpening: 4.35%
harmonic: 11.59%
empathic: 8.7%
matter_of_fact: 18.84%
brave: 14.49%
protective: 13.04%
generous: 10.14%
thrifty: 14.49%
favourable: 10.14%
balanced: 24.64%
sensuality: 17.39%
intelligent: 36.23%
believe: 30.43%
egocentric: 11.59%
allocentric: 14.49%
