In [62]:
# Associate words with archetypes/character traits as intermediate layer
# and with influencer as the "last" layer

# Dependencies
import pandas as pd
import numpy as np
from tqdm import tqdm
import copy
import os
import toml
import re
import itertools
from text_cleaner import *
import operator
from collections import Counter
import pickle

def extract_hashtags(post_text):
    HASH_RE = re.compile(r"\#\w+")
    out_list = re.findall(HASH_RE, post_text)
    return out_list

In [63]:
# Load the .csv with archetypes
arch_df = pd.read_csv('archetypes_pl.csv', index_col=0)

# Save the order of columns
trait_list = arch_df.columns.tolist()

# Show the table header and column list
print(trait_list)
arch_df.head()

['innocent', 'sage', 'explorer', 'outlaw', 'magician', 'hero', 'lover', 'jester', 'everyman', 'caregiver', 'ruler', 'creator', 'dominant', 'submissive', 'maximalist', 'minimalist', 'inspiring', 'systematic', 'discovering', 'conservative', 'verifying', 'overlooking', 'sharpening', 'harmonic', 'empathic', 'matter_of_fact', 'brave', 'protective', 'generous', 'thrifty', 'favourable', 'balanced', 'sensuality', 'intelligent', 'believe', 'egocentric', 'allocentric']


Unnamed: 0_level_0,innocent,sage,explorer,outlaw,magician,hero,lover,jester,everyman,caregiver,...,protective,generous,thrifty,favourable,balanced,sensuality,intelligent,believe,egocentric,allocentric
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
marek_grodzki,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,2.0,3.0,4.0,4.0,3.0,4.0,4.0,3.0,0.0,0.0
vege_style_life,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,4.0,4.0,4.0,3.0,3.0,3.0,2.0,0.0,3.0
oliwka__2007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,2.0,2.0,0.0,3.0,1.0,2.0,4.0,1.0,0.0,3.0
z_przestrzeni_serca,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,3.0,0.0,4.0,4.0,3.0,4.0,4.0,0.0,1.0
zaradne_warsztaty,3.0,0.0,0.0,0.0,3.0,0.0,0.0,2.0,3.0,4.0,...,3.0,4.0,0.0,2.0,2.0,4.0,2.0,3.0,1.0,3.0


In [64]:
# Table preprocessing - replace all NaN with 2 (Unrelated/Don't know class), replace 0-5 values with the ones in range -1.0 - 1.0
arch_df = arch_df.fillna(2.0)

arch_df = arch_df.replace(0.0, -1.0)
arch_df = arch_df.replace(1.0, -0.5)
arch_df = arch_df.replace(2.0, 0.0)
arch_df = arch_df.replace(3.0, 0.5)
arch_df = arch_df.replace(4.0, 1.0)

# Remove duplicated annotations, to exclude conflicting entries
arch_df = arch_df[~arch_df.index.duplicated(keep='first')]

# Print the head of the dataset after modification
arch_df.head()

Unnamed: 0_level_0,innocent,sage,explorer,outlaw,magician,hero,lover,jester,everyman,caregiver,...,protective,generous,thrifty,favourable,balanced,sensuality,intelligent,believe,egocentric,allocentric
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
marek_grodzki,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,...,0.0,0.5,1.0,1.0,0.5,1.0,1.0,0.5,-1.0,-1.0
vege_style_life,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,1.0,1.0,1.0,1.0,0.5,0.5,0.5,0.0,-1.0,0.5
oliwka__2007,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,...,0.0,0.0,-1.0,0.5,-0.5,0.0,1.0,-0.5,-1.0,0.5
z_przestrzeni_serca,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,1.0,0.5,-1.0,1.0,1.0,0.5,1.0,1.0,-1.0,-0.5
zaradne_warsztaty,0.5,-1.0,-1.0,-1.0,0.5,-1.0,-1.0,0.0,0.5,1.0,...,0.5,1.0,-1.0,0.0,0.0,1.0,0.0,0.5,-0.5,0.5


In [65]:
# Check if a user has a non-empty directory in the dataset, otherwise delete the user from the list
available_arch_df = copy.deepcopy(arch_df)
posts = []

BASE_DIR = "instagram_cleared"

# Iterate over whole DataFrame
for i, row in tqdm(arch_df.iterrows()):
    profile_posts = []
    profile_hashtags = []
    
    # Get all posts per profile
    profile_path = os.path.join(BASE_DIR, i)
    for file in os.listdir(profile_path):
        if not file.endswith(".toml"):
            with open(os.path.join(profile_path, file), "r") as post_f:
                read_text = post_f.read()
                profile_posts.append(remove_stopwords(clean_up_text(read_text)))
                profile_hashtags.append(extract_hashtags(read_text))

    # Merge lists - a single list for a single influencer
    profile_hashtags = list(itertools.chain.from_iterable(profile_hashtags))
    posts.append(list(itertools.chain.from_iterable([profile_posts, [profile_hashtags]])))

508it [00:11, 44.79it/s]


In [66]:
# Map usernames to indices
users = list(available_arch_df.index.values)
user_indices = {k: users.index(k) for k in users}

In [67]:
# Get the word count and create a dataframe, where columns are archetypes/traits, and rows are single words
# Initialize a word DataFrame
word_df = pd.DataFrame()

def merge_dicts(dict_a, dict_b) -> dict:
    out_dict = dict_a
    for k, v in dict_b.items():
        if k in out_dict.keys():
            out_dict[k] += v
        else:
            out_dict[k] = v
    return out_dict

# Iterate over all of the traits/archetypes
for trait in tqdm(trait_list):
    # Select influencers which have the given archetype annotated
    subset_df = arch_df[available_arch_df[trait] != 0][trait]
    subset_indices = [user_indices[idx] for idx in subset_df.index.values]
    trait_weights = subset_df.tolist()
    
    # Get all posts for the list of influencers
    f = operator.itemgetter(*subset_indices)
    sublist = list(f(posts))
    
    # Counter to calculate each word occurrences
    trait_total = 0
    out_dict = {}
    for i, post_set in enumerate(sublist):
        trait_ctr = Counter(itertools.chain.from_iterable(post_set))
        trait_total += sum(trait_ctr.values())
        for key in trait_ctr:
            trait_ctr[key] *= trait_weights[i]
        out_dict = merge_dicts(out_dict, trait_ctr)
    out_dict = {k: float(v / trait_total) for k, v in out_dict.items()}
    trait_ctr = {trait: out_dict}
    
    # Append the new dataframe
    tmp_df = pd.DataFrame.from_dict(trait_ctr, orient="index")
    word_df = word_df.append(tmp_df)

100%|██████████| 37/37 [04:43<00:00,  7.67s/it]


In [68]:
# Check the calculation results
word_df

Unnamed: 0,zainspirowany,wczorajszym,wywiadem,odnośnie,relacji,chciałem,przekazać,okres,kwarantanny,świetny,...,#suplement,#iifym,#sportwater,#vitamind,#danio,#zdrowieciasto,#cocunut,#schoko,#żywieniewsporcie,#wysiłekfizyczny
innocent,1.07741e-06,9.157983e-06,-5.387049e-07,1.7e-05,6.7e-05,-1.07741e-06,8.619278e-06,2.4e-05,1.400633e-05,2.370301e-05,...,,,,,,,,,,
sage,1.25678e-06,3.770341e-06,0.0,1.8e-05,4.5e-05,4.398731e-06,1.131102e-05,2.9e-05,1.25678e-05,2.764917e-05,...,,,,,,,,,,
explorer,-1.998596e-06,-3.997191e-06,-1.998596e-06,-5e-06,-2.1e-05,-1.332397e-06,-1.998596e-06,-5e-06,0.0,9.992978e-06,...,,,,,,,,,,
outlaw,7.50522e-07,-9.756786e-06,-3.002088e-06,-2.3e-05,-0.00011,-1.275887e-05,-1.200835e-05,-3.9e-05,-1.425992e-05,-3.602506e-05,...,,,,,,,,,,
magician,-7.756917e-07,-3.102767e-06,-2.327075e-06,3e-06,-1.3e-05,-7.756917e-07,-7.756917e-07,-1.6e-05,5.429842e-06,-7.756917e-07,...,,,,,,,,,,
hero,-2.36152e-06,-3.935867e-06,-3.148694e-06,-2e-05,-9.5e-05,1.338195e-05,-7.084561e-06,0.0,-1.259478e-05,-1.653064e-05,...,,,,,,,,,,
lover,-1.246103e-06,-6.853565e-06,-2.492206e-06,-1.9e-05,-2.4e-05,-8.099668e-06,-6.853565e-06,-2.6e-05,-4.36136e-06,6.230514e-07,...,,,,,,,,,,
jester,1.459183e-06,4.377549e-06,2.188775e-06,-4e-06,7e-06,8.755098e-06,5.107141e-06,-6e-06,3.647958e-06,-1.094387e-05,...,,,,,,,,,,
everyman,5.164414e-07,1.49768e-05,0.0,2.1e-05,8.6e-05,5.164414e-06,8.779504e-06,3.9e-05,2.478919e-05,3.821667e-05,...,,,,,,,,,,
caregiver,-6.527739e-07,2.611096e-06,-2.611096e-06,1.3e-05,1e-05,1.305548e-06,3.916643e-06,2.4e-05,-6.527739e-07,2.284709e-05,...,,,,,,,,,,


In [69]:
# Fill NaN values with 0
word_df = word_df.fillna(0)

In [70]:
# Save the result to a pickle
import pickle

with open("influencer_index_map.pickle", "wb") as f:
    pickle.dump(user_indices, f)
    
word_df.to_pickle("word_trait_array.pickle")

In [71]:
# Create word map and save it to a pickle (this file must be loaded in pair with word_trait_array)
word_map = word_df.columns.tolist()

with open("word_map.pickle", "wb") as f:
    pickle.dump(word_map, f)

In [76]:
def get_trait_dot_product(post_text: str, word_map: list, word_dataframe: pd.DataFrame) -> list:
    # Filter out the text
    filtered_post = remove_stopwords(clean_up_text(post_text))
    filtered_post += extract_hashtags(post_text)
    
    # Create a vector for dot product vector
    post_vector = [0] * len(word_map)
    
    # Calculate word occurrences
    word_ctr = Counter(filtered_post)
    
    for word, freq in word_ctr.items():
        if word in word_map:
            post_vector[word_map.index(word)] = freq
    
    # Calculate dot product for a given text
    word_dot = word_dataframe.dot(post_vector)
    return word_dot

# Replace NaN with 0 in word_frequency_table
word_df = word_df.fillna(0)

# Method for calculating the dot product of trait <-> influencer relation
def get_influencer_dot_product(trait_output: list, influencer_dataframe: pd.DataFrame) -> pd.DataFrame:
    return influencer_dataframe.dot(trait_output)

# Method for calculating the similarity
def calculate_similarity(post_text: str, 
                         word_map: list, 
                         word_dataframe: pd.DataFrame,
                         influencer_dataframe: pd.DataFrame) -> pd.DataFrame:
    # Calculate word-trait dot product
    post_result = get_trait_dot_product(post_text, word_map, word_dataframe)
    
    # Calculate trate-influencer dot-product
    inf_dot_product = get_influencer_dot_product(post_result, influencer_dataframe)
    
    # Get the sum of influencer traits
    influencer_sum = influencer_dataframe.sum(axis=1)
    
    # Divide the dot product by the sum calculated above
    inf_dot_product = inf_dot_product.divide(influencer_sum)
    
    return inf_dot_product

In [59]:
pbar = tqdm(available_arch_df.iterrows())
accuracy = 0

for idx, row in pbar:
    user_text = list(itertools.chain.from_iterable(posts[users.index(idx)]))
    user_text = " ".join(user_text)
    sim_output = calculate_similarity(user_text, word_map, word_df, available_arch_df)
    if idx == sim_output.idxmax():
        accuracy = accuracy + 1
    pbar.set_description(f"Current accuracy: {round(accuracy / len(available_arch_df), 2)}")

Current accuracy: 0.04: : 508it [24:44,  2.92s/it]


In [61]:
# Show the total count of correctly associated influencers
print(f"Total correct associations: {accuracy}")

Total correct associations: 19
