In [1]:
# Associate words with archetypes/character traits as intermediate layer
# and with influencer as the "last" layer

# Dependencies
import pandas as pd
import numpy as np
from tqdm import tqdm
import copy
import os
import toml
import re
import itertools
from text_cleaner import *
import operator
from collections import Counter
import pickle

def extract_hashtags(post_text):
    HASH_RE = re.compile(r"\#\w+")
    out_list = re.findall(HASH_RE, post_text)
    return out_list

In [2]:
# Load the .csv with archetypes
arch_df = pd.read_csv('archetypes_pl.csv', index_col=0)

# Save the order of columns
trait_list = arch_df.columns.tolist()

# Show the table header and column list
print(trait_list)
arch_df.head()

['innocent', 'sage', 'explorer', 'outlaw', 'magician', 'hero', 'lover', 'jester', 'everyman', 'caregiver', 'ruler', 'creator', 'dominant', 'submissive', 'maximalist', 'minimalist', 'inspiring', 'systematic', 'discovering', 'conservative', 'verifying', 'overlooking', 'sharpening', 'harmonic', 'empathic', 'matter_of_fact', 'brave', 'protective', 'generous', 'thrifty', 'favourable', 'balanced', 'sensuality', 'intelligent', 'believe', 'egocentric', 'allocentric']


Unnamed: 0_level_0,innocent,sage,explorer,outlaw,magician,hero,lover,jester,everyman,caregiver,...,protective,generous,thrifty,favourable,balanced,sensuality,intelligent,believe,egocentric,allocentric
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
marek_grodzki,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,2.0,3.0,4.0,4.0,3.0,4.0,4.0,3.0,0.0,0.0
vege_style_life,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,4.0,4.0,4.0,3.0,3.0,3.0,2.0,0.0,3.0
oliwka__2007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,2.0,2.0,0.0,3.0,1.0,2.0,4.0,1.0,0.0,3.0
z_przestrzeni_serca,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,3.0,0.0,4.0,4.0,3.0,4.0,4.0,0.0,1.0
zaradne_warsztaty,3.0,0.0,0.0,0.0,3.0,0.0,0.0,2.0,3.0,4.0,...,3.0,4.0,0.0,2.0,2.0,4.0,2.0,3.0,1.0,3.0


In [3]:
# Table preprocessing - replace all NaN with 2 (Unrelated/Don't know class), replace 0-5 values with the ones in range -1.0 - 1.0
arch_df = arch_df.fillna(2.0)

arch_df = arch_df.replace(0.0, -1.0)
arch_df = arch_df.replace(1.0, -0.5)
arch_df = arch_df.replace(2.0, 0.0)
arch_df = arch_df.replace(3.0, 0.5)
arch_df = arch_df.replace(4.0, 1.0)

# Remove duplicated annotations, to exclude conflicting entries
arch_df = arch_df[~arch_df.index.duplicated(keep='first')]

# Print the head of the dataset after modification
arch_df.head()

Unnamed: 0_level_0,innocent,sage,explorer,outlaw,magician,hero,lover,jester,everyman,caregiver,...,protective,generous,thrifty,favourable,balanced,sensuality,intelligent,believe,egocentric,allocentric
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
marek_grodzki,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,...,0.0,0.5,1.0,1.0,0.5,1.0,1.0,0.5,-1.0,-1.0
vege_style_life,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,1.0,1.0,1.0,1.0,0.5,0.5,0.5,0.0,-1.0,0.5
oliwka__2007,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,...,0.0,0.0,-1.0,0.5,-0.5,0.0,1.0,-0.5,-1.0,0.5
z_przestrzeni_serca,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,1.0,0.5,-1.0,1.0,1.0,0.5,1.0,1.0,-1.0,-0.5
zaradne_warsztaty,0.5,-1.0,-1.0,-1.0,0.5,-1.0,-1.0,0.0,0.5,1.0,...,0.5,1.0,-1.0,0.0,0.0,1.0,0.0,0.5,-0.5,0.5


In [4]:
# Check if a user has a non-empty directory in the dataset, otherwise delete the user from the list
available_arch_df = copy.deepcopy(arch_df)
posts = []

BASE_DIR = "instagram_cleared"

# Iterate over whole DataFrame
for i, row in tqdm(arch_df.iterrows()):
    profile_posts = []
    profile_hashtags = []
    
    # Get all posts per profile
    profile_path = os.path.join(BASE_DIR, i)
    for file in os.listdir(profile_path):
        if not file.endswith(".toml"):
            with open(os.path.join(profile_path, file), "r") as post_f:
                read_text = post_f.read()
                profile_posts.append(remove_stopwords(clean_up_text(read_text)))
                profile_hashtags.append(extract_hashtags(read_text))

    # Merge lists - a single list for a single influencer
    profile_hashtags = list(itertools.chain.from_iterable(profile_hashtags))
    posts.append(list(itertools.chain.from_iterable([profile_posts, [profile_hashtags]])))

508it [00:11, 44.07it/s]


In [5]:
# Map usernames to indices
users = list(available_arch_df.index.values)
user_indices = {k: users.index(k) for k in users}

In [6]:
# Load the required pickles
with open("word_frequency_array.pickle", "rb") as f:
    word_df = pickle.load(f)

# Word map - to easily create output vectors
word_map = word_df.columns.tolist()

In [7]:
def get_trait_dot_product(post_text: str, word_map: list, word_dataframe: pd.DataFrame) -> list:
    # Filter out the text
    filtered_post = remove_stopwords(clean_up_text(post_text))
    filtered_post += extract_hashtags(post_text)
    
    # Create a vector for dot product vector
    post_vector = [0] * len(word_map)
    
    # Calculate word occurrences
    word_ctr = Counter(filtered_post)
    
    for word, freq in word_ctr.items():
        if word in word_map:
            post_vector[word_map.index(word)] = freq
    
    # Calculate dot product for a given text
    word_dot = word_dataframe.dot(post_vector)
    return word_dot

# Replace NaN with 0 in word_frequency_table
word_df = word_df.fillna(0)

# Method for calculating the dot product of trait <-> influencer relation
def get_influencer_dot_product(trait_output: list, influencer_dataframe: pd.DataFrame) -> pd.DataFrame:
    return influencer_dataframe.dot(trait_output)

# Method for calculating the similarity
def calculate_similarity(post_text: str, 
                         word_map: list, 
                         word_dataframe: pd.DataFrame,
                         influencer_dataframe: pd.DataFrame) -> pd.DataFrame:
    # Calculate word-trait dot product
    post_result = get_trait_dot_product(post_text, word_map, word_dataframe)
    print(post_result)
    
    # Calculate trate-influencer dot-product
    inf_dot_product = get_influencer_dot_product(post_result, influencer_dataframe)
    print(f"Influencer dot product: {inf_dot_product}")
    
    # Get the scores out of the user DataFrame
    influencer_sum = influencer_dataframe.sum(axis=1) # Divide by non-zero/sum of counts
    
    influencer_diff = influencer_sum.subtract(inf_dot_product).abs()
    influencer_diff = influencer_diff / influencer_sum
    influencer_diff = influencer_diff[(influencer_diff != 0)]
    print(f"Influencer vs dot difference: {influencer_diff}")

    return influencer_diff

In [8]:
pbar = tqdm(available_arch_df.iterrows())
accuracy = 0

for idx, row in pbar:
    user_text = list(itertools.chain.from_iterable(posts[users.index(idx)]))
    user_text = " ".join(user_text)
    sim_output = calculate_similarity(user_text, word_map, word_df, available_arch_df)
    #print(f"{sim_output.idxmin()} - value: {sim_output.loc[sim_output.idxmin()]}")
    if idx == sim_output.idxmin():
        accuracy = accuracy + 1
    input("aaa")
    pbar.set_description(f"Current accuracy: {round(accuracy / len(available_arch_df), 2)}")

0it [00:00, ?it/s]

innocent          0.786035
sage              0.860350
explorer          0.805136
outlaw            0.801448
magician          0.815860
hero              0.844207
lover             0.775010
jester            0.805618
everyman          0.791806
caregiver         0.800002
ruler             0.844328
creator           0.786800
dominant          0.807172
submissive        0.798319
maximalist        0.807154
minimalist        0.808577
inspiring         0.796300
systematic        0.809713
discovering       0.798322
conservative      0.801840
verifying         0.806732
overlooking       0.800863
sharpening        0.815766
harmonic          0.798846
empathic          0.803468
matter_of_fact    0.813309
brave             0.815083
protective        0.795028
generous          0.802524
thrifty           0.826236
favourable        0.794502
balanced          0.808445
sensuality        0.794300
intelligent       0.808801
believe           0.811199
egocentric        0.803588
allocentric       0.800753
d

0it [04:36, ?it/s]


KeyboardInterrupt: Interrupted by user

In [9]:
word_df

Unnamed: 0,zainspirowany,wczorajszym,wywiadem,odnośnie,relacji,chciałem,przekazać,okres,kwarantanny,świetny,...,#suplement,#iifym,#sportwater,#vitamind,#danio,#zdrowieciasto,#cocunut,#schoko,#żywieniewsporcie,#wysiłekfizyczny
innocent,5e-06,2.6e-05,2.154819e-06,3.7e-05,0.00019,1.5e-05,2.8e-05,6.8e-05,5.2e-05,6.6e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sage,5e-06,2.5e-05,2.513561e-06,5.2e-05,0.000201,2.3e-05,4.1e-05,9e-05,6.2e-05,6e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
explorer,7e-06,2.5e-05,2.664794e-06,4e-05,0.000173,2.3e-05,2.8e-05,8.4e-05,5.2e-05,6.1e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
outlaw,6e-06,2.4e-05,3.002088e-06,4.5e-05,0.000197,1.8e-05,3.3e-05,9.6e-05,6e-05,6.2e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
magician,6e-06,2.5e-05,3.102767e-06,4.7e-05,0.000192,2.2e-05,3.3e-05,9.2e-05,4.7e-05,6.1e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hero,8e-06,3.1e-05,3.148694e-06,3.9e-05,0.000186,3.9e-05,3.1e-05,7.7e-05,6e-05,4.6e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
lover,5e-06,2e-05,2.492206e-06,3.9e-05,0.000189,1.5e-05,2.9e-05,8.3e-05,4.9e-05,7.2e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
jester,7e-06,2.6e-05,2.918366e-06,4.4e-05,0.000191,3.6e-05,2.9e-05,8.5e-05,5.5e-05,6e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
everyman,4e-06,2.9e-05,2.065766e-06,3.9e-05,0.000174,1.9e-05,2.6e-05,8.2e-05,4.9e-05,6.9e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
caregiver,5e-06,2.2e-05,2.611096e-06,5.1e-05,0.000171,1.8e-05,2.7e-05,7.6e-05,3.9e-05,5.7e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
