In [8]:
# Associate words with archetypes/character traits as intermediate layer
# and with influencer as the "last" layer

# Dependencies
import pandas as pd
import numpy as np
from tqdm import tqdm
import copy
import os
import toml
import re
import itertools
from text_cleaner import *
import operator
from collections import Counter

def extract_hashtags(post_text):
    """Return all hashtags found in ``post_text``, in order of appearance.

    A hashtag is a ``#`` followed by one or more word characters; the
    leading ``#`` is kept in every returned string. An empty list is
    returned when the text contains no hashtag.
    """
    hashtag_pattern = re.compile(r"\#\w+")
    return hashtag_pattern.findall(post_text)

In [9]:
# Read the annotated archetype/trait table; the first CSV column becomes the index
arch_df = pd.read_csv('archetypes_pl.csv', index_col=0)

# Remember the column order - it defines the order of traits used later on
trait_list = list(arch_df.columns)

# Print the trait names and preview the first rows of the table
print(trait_list)
arch_df.head()

['innocent', 'sage', 'explorer', 'outlaw', 'magician', 'hero', 'lover', 'jester', 'everyman', 'caregiver', 'ruler', 'creator', 'dominant', 'submissive', 'maximalist', 'minimalist', 'inspiring', 'systematic', 'discovering', 'conservative', 'verifying', 'overlooking', 'sharpening', 'harmonic', 'empathic', 'matter_of_fact', 'brave', 'protective', 'generous', 'thrifty', 'favourable', 'balanced', 'sensuality', 'intelligent', 'believe', 'egocentric', 'allocentric']


Unnamed: 0_level_0,innocent,sage,explorer,outlaw,magician,hero,lover,jester,everyman,caregiver,...,protective,generous,thrifty,favourable,balanced,sensuality,intelligent,believe,egocentric,allocentric
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
marek_grodzki,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,2.0,3.0,4.0,4.0,3.0,4.0,4.0,3.0,0.0,0.0
vege_style_life,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,4.0,4.0,4.0,3.0,3.0,3.0,2.0,0.0,3.0
oliwka__2007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,2.0,2.0,0.0,3.0,1.0,2.0,4.0,1.0,0.0,3.0
z_przestrzeni_serca,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,3.0,0.0,4.0,4.0,3.0,4.0,4.0,0.0,1.0
zaradne_warsztaty,3.0,0.0,0.0,0.0,3.0,0.0,0.0,2.0,3.0,4.0,...,3.0,4.0,0.0,2.0,2.0,4.0,2.0,3.0,1.0,3.0


In [10]:
# Table preprocessing.
# Annotations use a 0-4 scale; a missing annotation (NaN) is treated as 2,
# the "Unrelated/Don't know" class. The scale is then mapped onto the range
# -1.0 - 1.0 (0 -> -1.0, 1 -> -0.5, 2 -> 0.0, 3 -> 0.5, 4 -> 1.0).
# NOTE: run this cell once per load of `arch_df` - applying the mapping a
# second time would remap the already converted 0.0 and 1.0 values.
SCORE_MAP = {0.0: -1.0, 1.0: -0.5, 2.0: 0.0, 3.0: 0.5, 4.0: 1.0}

# A single dict-based replace matches every entry against the original
# values, so the result does not depend on the order of the entries even
# though the source and target ranges overlap (0.0 and 1.0 occur in both).
arch_df = arch_df.fillna(2.0).replace(SCORE_MAP)

# Remove duplicated annotations, to exclude conflicting entries
# (only the first annotation of each profile is kept)
arch_df = arch_df[~arch_df.index.duplicated(keep='first')]

# Print the head of the dataset after modification
arch_df.head()

Unnamed: 0_level_0,innocent,sage,explorer,outlaw,magician,hero,lover,jester,everyman,caregiver,...,protective,generous,thrifty,favourable,balanced,sensuality,intelligent,believe,egocentric,allocentric
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
marek_grodzki,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,...,0.0,0.5,1.0,1.0,0.5,1.0,1.0,0.5,-1.0,-1.0
vege_style_life,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,1.0,1.0,1.0,1.0,0.5,0.5,0.5,0.0,-1.0,0.5
oliwka__2007,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,...,0.0,0.0,-1.0,0.5,-0.5,0.0,1.0,-0.5,-1.0,0.5
z_przestrzeni_serca,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,1.0,0.5,-1.0,1.0,1.0,0.5,1.0,1.0,-1.0,-0.5
zaradne_warsztaty,0.5,-1.0,-1.0,-1.0,0.5,-1.0,-1.0,0.0,0.5,1.0,...,0.5,1.0,-1.0,0.0,0.0,1.0,0.0,0.5,-0.5,0.5


In [11]:
# Collect the posts of every annotated profile.
# A user without a non-empty directory in the dataset is removed from the
# annotation tables, so that `posts`, `arch_df` and `available_arch_df`
# stay aligned row by row (posts[k] belongs to the k-th row of both tables).
posts = []
kept_profiles = []

BASE_DIR = "instagram_cleared"

# Iterate over all annotated profiles (the index holds the profile names)
for i in tqdm(arch_df.index, total=len(arch_df)):
    profile_posts = []
    profile_hashtags = []

    profile_path = os.path.join(BASE_DIR, i)
    if not os.path.isdir(profile_path):
        # No directory for this profile - skip it instead of crashing
        continue

    # Get all posts per profile; .toml files and sub-directories are skipped.
    # os.listdir returns entries in arbitrary order, so sort them to make the
    # reading order (and the vocabulary order derived from it) reproducible.
    for file in sorted(os.listdir(profile_path)):
        post_path = os.path.join(profile_path, file)
        if file.endswith(".toml") or not os.path.isfile(post_path):
            continue
        # Explicit encoding: do not depend on the platform default
        with open(post_path, "r", encoding="utf-8") as post_f:
            read_text = post_f.read()
        # presumably remove_stopwords returns a list of tokens - it is
        # flattened as such further down; verify against text_cleaner
        profile_posts.append(remove_stopwords(clean_up_text(read_text)))
        # Hashtags are extracted from the raw text, before any cleaning
        profile_hashtags.extend(extract_hashtags(read_text))

    if not profile_posts:
        # The directory exists but holds no posts - skip the profile as well
        continue

    # Merge lists - a single list for a single influencer: every cleaned
    # post, followed by one list with all hashtags of the profile
    posts.append(profile_posts + [profile_hashtags])
    kept_profiles.append(i)

# Keep only the profiles that contributed posts; when every profile has
# posts this leaves `arch_df` unchanged
arch_df = arch_df[arch_df.index.isin(kept_profiles)]
available_arch_df = copy.deepcopy(arch_df)

508it [00:11, 45.50it/s]


In [13]:
# Show the current, filtered database; the preview and the length are both
# taken from the same table so they cannot disagree
print(available_arch_df.head())
print(f"Dataset length: {len(available_arch_df)}")

                     innocent  sage  explorer  outlaw  magician  hero  lover  \
id                                                                             
marek_grodzki            -1.0  -1.0      -1.0    -1.0      -1.0  -1.0   -1.0   
vege_style_life          -1.0  -1.0       1.0    -1.0      -1.0  -1.0   -1.0   
oliwka__2007             -1.0  -1.0      -1.0    -1.0      -1.0  -1.0   -1.0   
z_przestrzeni_serca       1.0  -1.0      -1.0    -1.0      -1.0  -1.0   -1.0   
zaradne_warsztaty         0.5  -1.0      -1.0    -1.0       0.5  -1.0   -1.0   

                     jester  everyman  caregiver  ...  protective  generous  \
id                                                ...                         
marek_grodzki           1.0      -1.0       -1.0  ...         0.0       0.5   
vege_style_life        -1.0      -1.0       -1.0  ...         1.0       1.0   
oliwka__2007           -1.0       1.0       -1.0  ...         0.0       0.0   
z_przestrzeni_serca    -1.0      -1.0       

In [14]:
# Map usernames to indices (the row position of each user in `arch_df`)
users = list(arch_df.index.values)

# Built in a single pass instead of calling users.index() for every user;
# setdefault keeps the first position if a name repeats, like list.index
user_indices = {}
for position, username in enumerate(users):
    user_indices.setdefault(username, position)

In [15]:
# Get the word count and create a dataframe, where rows are archetypes/traits
# and columns are single words (or hashtags). Each cell holds the relative
# frequency of the word among all words of the influencers selected for
# that trait.
# NOTE(review): the selection below keeps every influencer whose score is
# non-zero, including negative scores - confirm that this is intended.
trait_frames = []

# Iterate over all of the traits/archetypes
for trait in tqdm(trait_list):
    # Select influencers which have the given archetype annotated
    is_annotated = available_arch_df[trait] != 0
    subset_df = arch_df[is_annotated][trait]
    subset_indices = [user_indices[idx] for idx in subset_df.index.values]

    # Get all posts for the list of influencers. A plain comprehension is
    # used on purpose: operator.itemgetter returns a bare item (not a tuple)
    # for a single index and raises for an empty selection.
    sublist = [posts[idx] for idx in subset_indices]

    # Counter to calculate each word occurrences
    # (influencer -> post -> word, hence the double flattening)
    trait_ctr = Counter(itertools.chain.from_iterable(itertools.chain.from_iterable(sublist)))
    trait_total = sum(trait_ctr.values())
    # An empty counter yields an empty dict, so there is no division by zero
    trait_freq = {word: count / trait_total for word, count in trait_ctr.items()}

    # One single-row frame per trait; a trait without any word still gets
    # its (all-NaN) row, keeping the rows aligned with `trait_list`
    trait_frames.append(pd.DataFrame([trait_freq], index=[trait]))

# Concatenate once at the end: DataFrame.append is not available in
# pandas >= 2.0 and growing a frame inside the loop copies it every time.
# Words missing for a trait become NaN.
word_df = pd.concat(trait_frames) if trait_frames else pd.DataFrame()

100%|██████████| 37/37 [04:22<00:00,  7.10s/it]


In [16]:
# Check the resulting DataFrame: one row per archetype/trait, one column per
# word or hashtag; NaN marks a word that was not counted for that trait
word_df

Unnamed: 0,zainspirowany,wczorajszym,wywiadem,odnośnie,relacji,chciałem,przekazać,okres,kwarantanny,świetny,...,#suplement,#iifym,#sportwater,#vitamind,#danio,#zdrowieciasto,#cocunut,#schoko,#żywieniewsporcie,#wysiłekfizyczny
innocent,5e-06,2.6e-05,2.154819e-06,3.7e-05,0.00019,1.5e-05,2.8e-05,6.8e-05,5.2e-05,6.6e-05,...,,,,,,,,,,
sage,5e-06,2.5e-05,2.513561e-06,5.2e-05,0.000201,2.3e-05,4.1e-05,9e-05,6.2e-05,6e-05,...,,,,,,,,,,
explorer,7e-06,2.5e-05,2.664794e-06,4e-05,0.000173,2.3e-05,2.8e-05,8.4e-05,5.2e-05,6.1e-05,...,,,,,,,,,,
outlaw,6e-06,2.4e-05,3.002088e-06,4.5e-05,0.000197,1.8e-05,3.3e-05,9.6e-05,6e-05,6.2e-05,...,,,,,,,,,,
magician,6e-06,2.5e-05,3.102767e-06,4.7e-05,0.000192,2.2e-05,3.3e-05,9.2e-05,4.7e-05,6.1e-05,...,,,,,,,,,,
hero,8e-06,3.1e-05,3.148694e-06,3.9e-05,0.000186,3.9e-05,3.1e-05,7.7e-05,6e-05,4.6e-05,...,,,,,,,,,,
lover,5e-06,2e-05,2.492206e-06,3.9e-05,0.000189,1.5e-05,2.9e-05,8.3e-05,4.9e-05,7.2e-05,...,,,,,,,,,,
jester,7e-06,2.6e-05,2.918366e-06,4.4e-05,0.000191,3.6e-05,2.9e-05,8.5e-05,5.5e-05,6e-05,...,,,,,,,,,,
everyman,4e-06,2.9e-05,2.065766e-06,3.9e-05,0.000174,1.9e-05,2.6e-05,8.2e-05,4.9e-05,6.9e-05,...,,,,,,,,,,
caregiver,5e-06,2.2e-05,2.611096e-06,5.1e-05,0.000171,1.8e-05,2.7e-05,7.6e-05,3.9e-05,5.7e-05,...,,,,,,,,,,


In [17]:
# Persist the results: the username -> index mapping and the
# trait/word frequency table, each in its own pickle file
import pickle

with open("new_influencer_index_map.pickle", "wb") as index_map_file:
    index_map_file.write(pickle.dumps(user_indices))

word_df.to_pickle("word_frequency_array.pickle")

In [68]:
# Method for creating an output vector for dot product calculation
# Word map - the column labels of `word_df` as a plain list, so that the
# position of a word in the list equals the position of its column
word_map = list(word_df.columns)

def get_trait_dot_product(post_text: str, word_map: list, word_dataframe: pd.DataFrame) -> list:
    """Score a text against every row (trait) of ``word_dataframe``.

    The text is cleaned, its hashtags are added, and the occurrences of
    every known word are counted into a vector laid out like ``word_map``.
    The result is the dot product of ``word_dataframe`` with that vector.

    Args:
        post_text: Raw text of a post.
        word_map: Known words; position ``k`` must correspond to column ``k``
            of ``word_dataframe``.
        word_dataframe: Word weights, one row per trait and one column per
            word. NaN entries propagate into the result, so fill them first.

    Returns:
        One score per row of ``word_dataframe``, in row order.
    """
    # Filter out the text; hashtags are taken from the raw, uncleaned text
    filtered_post = list(remove_stopwords(clean_up_text(post_text)))
    filtered_post.extend(extract_hashtags(post_text))

    # word -> position lookup, built once per call; this replaces a linear
    # scan of the whole vocabulary for every distinct word of the post.
    # setdefault keeps the first position of a repeated word, like list.index
    word_positions = {}
    for position, known_word in enumerate(word_map):
        word_positions.setdefault(known_word, position)

    # Create a vector for dot product: occurrence count of every known word
    post_vector = [0] * len(word_map)
    for word, freq in Counter(filtered_post).items():
        position = word_positions.get(word)
        if position is not None:
            post_vector[position] = freq

    # Calculate dot product for a given text
    word_dot = word_dataframe.dot(post_vector)
    return word_dot.tolist()

In [69]:
# Replace NaN with 0 in word_frequency_table, i.e. give every word that is
# missing for a trait a frequency of 0
word_df = word_df.fillna(0)

In [70]:
# Test the trait dot_product on a short sample text with two hashtags;
# the printed list holds one score per row of word_df
print(get_trait_dot_product("Cześć reasda  asdasda         #hello #man", word_map, word_df))

[0.0004595412246773638, 0.0004067379914374121, 0.00036011625250653603, 0.0003753972072562694, 0.0003912542730293551, 0.000390015600624025, 0.00041324805618997055, 0.00037380561425308855, 0.00041593127877269414, 0.0003977570834036244, 0.0003560861361350337, 0.00039720828775092397, 0.00040023973122045266, 0.00041646063180421686, 0.00041872531414636344, 0.00041080436113212726, 0.0003931986611286199, 0.00038254458541335366, 0.00039270959902794653, 0.0004000795833982111, 0.000394006561962689, 0.0004129534731954682, 0.00042783294571025095, 0.00043415972913430425, 0.0004178359194453615, 0.00039384604731129745, 0.0004233620278707779, 0.00040216264575707657, 0.0004039563822718124, 0.0004086308504481948, 0.00042340748230535895, 0.0004077184969681802, 0.0004185307251977684, 0.0004109454121850409, 0.0004220094873148913, 0.00041851536852660837, 0.0004051762828504463]


In [71]:
# Method for calculating the dot product of trait <-> influencer relation
def get_influencer_dot_product(trait_output: list, influencer_dataframe: pd.DataFrame) -> pd.Series:
    """Project per-trait scores onto the influencers.

    Args:
        trait_output: One score per trait, ordered like the columns of
            ``influencer_dataframe``.
        influencer_dataframe: Trait scores, one row per influencer and one
            column per trait.

    Returns:
        A Series with one value per row of ``influencer_dataframe``, indexed
        like the frame (``DataFrame.dot`` with a 1-D input yields a Series,
        not a DataFrame).
    """
    return influencer_dataframe.dot(trait_output)

In [72]:
# Method for calculating the similarity
def calculate_similarity(post_text: str, 
                         word_map: list, 
                         word_dataframe: pd.DataFrame,
                         influencer_dataframe: pd.DataFrame) -> pd.Series:
    """Compute a similarity score between a text and every influencer.

    The text is first scored against every trait with
    ``get_trait_dot_product``; the trait scores are then projected onto the
    influencers with ``get_influencer_dot_product``.

    Args:
        post_text: Raw text to compare.
        word_map: Known words, ordered like the columns of ``word_dataframe``.
        word_dataframe: Word weights, one row per trait.
        influencer_dataframe: Trait scores, one row per influencer.

    Returns:
        The result of ``get_influencer_dot_product``: one score per row of
        ``influencer_dataframe``.
    """
    # Calculate word-trait dot product
    trait_scores = get_trait_dot_product(post_text, word_map, word_dataframe)

    # Calculate trait-influencer dot product
    return get_influencer_dot_product(trait_scores, influencer_dataframe)

In [77]:
# Test the method on a longer Polish sample text and report the
# best-matching influencer.
# The result holds one score per row of available_arch_df, so idxmax()
# gives the name of the best-matching user and max() the score itself.
sim_df = calculate_similarity("""Jak to jest być skrybą, dobrze? 
A, wie pan, moim zdaniem to nie ma tak, że dobrze, albo że niedobrze. 
Gdybym miał powiedzieć, co cenię w życiu najbardziej, powiedziałbym, że ludzi. 
Ludzi, którzy podali mi pomocną dłoń, kiedy sobie nie radziłem, kiedy byłem sam, i co ciekawe, to właśnie przypadkowe spotkania wpływają na nasze życie. 
Chodzi o to, że kiedy wyznaje się pewne wartości, nawet pozornie uniwersalne, bywa, że nie znajduje się zrozumienia, 
które by tak rzec, które pomaga się nam rozwijać. 
Ja miałem szczęście, by tak rzec, ponieważ je znalazłem, i dziękuję życiu! 
Dziękuję mu; życie to śpiew, życie to taniec, życie to miłość! 
Wielu ludzi pyta mnie o to samo: ale jak ty to robisz, skąd czerpiesz tę radość? 
A ja odpowiadam, że to proste! To umiłowanie życia. 
To właśnie ono sprawia, że dzisiaj na przykład buduję maszyny, a jutro – kto wie? 
Dlaczego by nie – oddam się pracy społecznej i będę, ot, choćby, sadzić... doć— m-marchew...""", word_map, word_df, available_arch_df)
print("Maximum similarity:\n"
        f"User: {sim_df.idxmax()}\n"
        f"Similarity score: {sim_df.max()}")

Maximum similarity:
User: kasper.mackowiak
Similarity score: 0.2007841327059235
