In [1]:
# Associate words with archetypes/character traits as intermediate layer
# and with influencer as the "last" layer

# Dependencies
import pandas as pd
import numpy as np
from tqdm import tqdm
import copy
import os
import toml
import re
import itertools
from text_cleaner import *
import operator
from collections import Counter
import pickle

def extract_hashtags(post_text):
    """Return every hashtag found in ``post_text``, in order of appearance.

    A hashtag is a ``#`` immediately followed by one or more word characters.
    The leading ``#`` is kept in each returned string; an input without
    hashtags yields an empty list.
    """
    hashtag_pattern = re.compile(r"\#\w+")
    return hashtag_pattern.findall(post_text)

In [2]:
# Read the archetype annotations; the first CSV column becomes the index
arch_df = pd.read_csv('archetypes_pl.csv', index_col=0)

# Remember the column order (one column per archetype / character trait)
trait_list = list(arch_df.columns)

# Show the column list, then the first rows of the table
print(trait_list)
arch_df.head()

['innocent', 'sage', 'explorer', 'outlaw', 'magician', 'hero', 'lover', 'jester', 'everyman', 'caregiver', 'ruler', 'creator', 'dominant', 'submissive', 'maximalist', 'minimalist', 'inspiring', 'systematic', 'discovering', 'conservative', 'verifying', 'overlooking', 'sharpening', 'harmonic', 'empathic', 'matter_of_fact', 'brave', 'protective', 'generous', 'thrifty', 'favourable', 'balanced', 'sensuality', 'intelligent', 'believe', 'egocentric', 'allocentric']


Unnamed: 0_level_0,innocent,sage,explorer,outlaw,magician,hero,lover,jester,everyman,caregiver,...,protective,generous,thrifty,favourable,balanced,sensuality,intelligent,believe,egocentric,allocentric
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
marek_grodzki,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,2.0,3.0,4.0,4.0,3.0,4.0,4.0,3.0,0.0,0.0
vege_style_life,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,4.0,4.0,4.0,3.0,3.0,3.0,2.0,0.0,3.0
oliwka__2007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,2.0,2.0,0.0,3.0,1.0,2.0,4.0,1.0,0.0,3.0
z_przestrzeni_serca,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,3.0,0.0,4.0,4.0,3.0,4.0,4.0,0.0,1.0
zaradne_warsztaty,3.0,0.0,0.0,0.0,3.0,0.0,0.0,2.0,3.0,4.0,...,3.0,4.0,0.0,2.0,2.0,4.0,2.0,3.0,1.0,3.0


In [3]:
# Table preprocessing:
#   * NaN -> 2 (the "Unrelated/Don't know" class)
#   * for index labels that occur more than once keep only the first row,
#     so that conflicting annotations are excluded
# The scores are otherwise kept as annotated: rescaling them to the
# -1.0 - 1.0 range (0 -> -1.0, 1 -> -0.5, 2 -> 0.0, 3 -> 0.5, 4 -> 1.0)
# is currently disabled.
arch_df = (
    arch_df
    .fillna(2.0)
    .loc[lambda frame: ~frame.index.duplicated(keep='first')]
)

# Show the head of the dataset after the modification
arch_df.head()

Unnamed: 0_level_0,innocent,sage,explorer,outlaw,magician,hero,lover,jester,everyman,caregiver,...,protective,generous,thrifty,favourable,balanced,sensuality,intelligent,believe,egocentric,allocentric
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
marek_grodzki,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,2.0,3.0,4.0,4.0,3.0,4.0,4.0,3.0,0.0,0.0
vege_style_life,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,4.0,4.0,4.0,3.0,3.0,3.0,2.0,0.0,3.0
oliwka__2007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,2.0,2.0,0.0,3.0,1.0,2.0,4.0,1.0,0.0,3.0
z_przestrzeni_serca,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,3.0,0.0,4.0,4.0,3.0,4.0,4.0,0.0,1.0
zaradne_warsztaty,3.0,0.0,0.0,0.0,3.0,0.0,0.0,2.0,3.0,4.0,...,3.0,4.0,0.0,2.0,2.0,4.0,2.0,3.0,1.0,3.0


In [4]:
# Keep only the users that have a non-empty post directory in the dataset.
# Users without one are left out of `available_arch_df`, so `posts[k]` always
# belongs to the k-th row of `available_arch_df`. Later lookups through
# `users.index(...)` therefore only find the users kept here.
BASE_DIR = "instagram_cleared"

posts = []
missing_users = []

# Iterate over every annotated user
for username in tqdm(arch_df.index):
    profile_path = os.path.join(BASE_DIR, str(username))

    # Every regular file that is not a .toml file is treated as a post.
    # Sorted, because os.listdir() returns entries in arbitrary order.
    post_files = []
    if os.path.isdir(profile_path):
        post_files = sorted(
            name for name in os.listdir(profile_path)
            if not name.endswith(".toml")
            and os.path.isfile(os.path.join(profile_path, name))
        )

    # Missing or empty directory: remember the user and skip instead of crashing
    if not post_files:
        missing_users.append(username)
        continue

    profile_posts = []
    profile_hashtags = []
    for name in post_files:
        # Explicit encoding so the result does not depend on the platform default.
        # NOTE(review): assumes the post files are UTF-8 encoded - confirm.
        with open(os.path.join(profile_path, name), "r", encoding="utf-8") as post_f:
            read_text = post_f.read()
        profile_posts.append(remove_stopwords(clean_up_text(read_text)))
        profile_hashtags.extend(extract_hashtags(read_text))

    # One entry per influencer: the cleaned posts followed by a single list
    # holding all hashtags of that influencer
    posts.append(profile_posts + [profile_hashtags])

# Drop the users without posts; with nothing to drop this is a plain copy of arch_df
available_arch_df = arch_df.drop(index=missing_users)
if missing_users:
    print(f"Skipped {len(missing_users)} user(s) without posts: {missing_users}")

508it [00:21, 24.01it/s]


In [5]:
# Map usernames to their row position in available_arch_df.
# enumerate() builds the mapping in a single pass instead of calling
# users.index() once per user. The index was de-duplicated during
# preprocessing, so every username maps to exactly one position.
users = list(available_arch_df.index.values)
user_indices = {username: position for position, username in enumerate(users)}

In [6]:
# Load the pickled word <-> trait table.
# NOTE: pickle.load() can execute arbitrary code stored in the file - only load
# pickles that come from a trusted source.
with open("word_trait_array_non_negative.pickle", "rb") as f:
    word_df = pickle.load(f)

# Word map - the column labels of word_df in column order; used to place word
# counts at the matching position when building post vectors
word_map = word_df.columns.tolist()

In [7]:
def get_trait_dot_product(post_text: str, word_map: list, word_dataframe: pd.DataFrame) -> pd.Series:
    """Score a text against every row of ``word_dataframe``.

    The text is cleaned and stripped of stopwords, its hashtags are appended,
    and the resulting tokens are counted into a vector laid out like
    ``word_map``. That vector is multiplied with ``word_dataframe``.

    Args:
        post_text: Raw text to score.
        word_map: Vocabulary; position ``i`` of the count vector belongs to
            ``word_map[i]``. Must have one entry per column of
            ``word_dataframe``.
        word_dataframe: Weight table whose columns follow ``word_map``.

    Returns:
        The result of ``word_dataframe.dot(count_vector)``: one value per row
        of ``word_dataframe``. Tokens that are not in ``word_map`` do not
        contribute.
    """
    # Filter out the text and add the hashtags as extra tokens
    filtered_post = remove_stopwords(clean_up_text(post_text))
    filtered_post += extract_hashtags(post_text)

    # Position of every vocabulary word. setdefault() keeps the first position
    # of a repeated word, which is the position list.index() would return.
    # The dict replaces two linear scans of word_map per distinct token.
    position_of = {}
    for position, word in enumerate(word_map):
        position_of.setdefault(word, position)

    # Count vector for the dot product
    post_vector = [0] * len(word_map)
    for word, freq in Counter(filtered_post).items():
        position = position_of.get(word)
        if position is not None:
            post_vector[position] = freq

    # Calculate the dot product for the given text
    return word_dataframe.dot(post_vector)

In [8]:
# Start the recalculated influencer table as an independent copy of arch_df,
# so it has the same users (rows) and traits (columns)
new_arch_df = arch_df.copy()

In [9]:
# Reuse the previously saved recalculated table when it exists. The file is
# only written further down in this notebook, so on a fresh run it may be
# missing; in that case new_arch_df keeps the copy of arch_df created earlier.
RECALC_CSV = "influencer_recalc.csv"
if os.path.exists(RECALC_CSV):
    new_arch_df = pd.read_csv(RECALC_CSV, header=0, index_col=0)

In [10]:
# Display the first rows of new_arch_df before the weights are recalculated
new_arch_df.head()

Unnamed: 0_level_0,innocent,sage,explorer,outlaw,magician,hero,lover,jester,everyman,caregiver,...,protective,generous,thrifty,favourable,balanced,sensuality,intelligent,believe,egocentric,allocentric
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
marek_grodzki,2.038094,1.973386,1.709381,1.483618,1.700735,1.620412,1.722918,1.656595,2.185022,1.860071,...,1.877358,2.186279,1.566861,2.213885,1.919468,2.149044,2.436826,2.118127,1.6293,2.173451
vege_style_life,0.458066,0.418103,0.393193,0.316075,0.376222,0.334088,0.396356,0.357135,0.493878,0.417618,...,0.427948,0.492265,0.358717,0.473784,0.428801,0.490582,0.520035,0.47826,0.356747,0.48981
oliwka__2007,0.177468,0.172341,0.139999,0.120608,0.142124,0.118177,0.151286,0.142594,0.209924,0.149987,...,0.156509,0.1884,0.119193,0.19496,0.170495,0.212068,0.230556,0.185316,0.151228,0.191192
z_przestrzeni_serca,0.816684,0.741473,0.648054,0.561854,0.650933,0.596207,0.67423,0.628383,0.836565,0.71215,...,0.740277,0.839179,0.57593,0.812082,0.760188,0.839103,0.93484,0.83556,0.587851,0.842156
zaradne_warsztaty,1.020925,0.894972,0.786119,0.674316,0.829504,0.713851,0.845061,0.797857,1.074282,0.926031,...,0.902334,1.06104,0.706353,0.976221,0.933323,1.061872,1.137518,1.033451,0.717548,1.077945


In [72]:
# Recalculate the trait weights of every influencer from their collected posts
pbar = tqdm(new_arch_df.iterrows())

for idx, row in pbar:
    # Flatten the user's token lists into one whitespace-separated string
    user_tokens = itertools.chain.from_iterable(posts[users.index(idx)])
    user_text = " ".join(user_tokens)

    # Overwrite the user's row with the freshly computed trait scores
    sim_output = get_trait_dot_product(user_text, word_map, word_df)
    new_arch_df.loc[idx] = sim_output

508it [08:52,  1.05s/it]


In [73]:
# Display the first rows of new_arch_df after the weights were recalculated
new_arch_df.head()

Unnamed: 0_level_0,innocent,sage,explorer,outlaw,magician,hero,lover,jester,everyman,caregiver,...,protective,generous,thrifty,favourable,balanced,sensuality,intelligent,believe,egocentric,allocentric
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
marek_grodzki,2.038094,1.973386,1.709381,1.483618,1.700735,1.620412,1.722918,1.656595,2.185022,1.860071,...,1.877358,2.186279,1.566861,2.213885,1.919468,2.149044,2.436826,2.118127,1.6293,2.173451
vege_style_life,0.458066,0.418103,0.393193,0.316075,0.376222,0.334088,0.396356,0.357135,0.493878,0.417618,...,0.427948,0.492265,0.358717,0.473784,0.428801,0.490582,0.520035,0.47826,0.356747,0.48981
oliwka__2007,0.177468,0.172341,0.139999,0.120608,0.142124,0.118177,0.151286,0.142594,0.209924,0.149987,...,0.156509,0.1884,0.119193,0.19496,0.170495,0.212068,0.230556,0.185316,0.151228,0.191192
z_przestrzeni_serca,0.816684,0.741473,0.648054,0.561854,0.650933,0.596207,0.67423,0.628383,0.836565,0.71215,...,0.740277,0.839179,0.57593,0.812082,0.760188,0.839103,0.93484,0.83556,0.587851,0.842156
zaradne_warsztaty,1.020925,0.894972,0.786119,0.674316,0.829504,0.713851,0.845061,0.797857,1.074282,0.926031,...,0.902334,1.06104,0.706353,0.976221,0.933323,1.061872,1.137518,1.033451,0.717548,1.077945


In [74]:
# Save the recalculated table to a .csv file (the same file name that is read
# back into new_arch_df earlier in this notebook)
new_arch_df.to_csv("influencer_recalc.csv")

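## Proof-checking

Sanity check: the text of every influencer is scored again and matched to the closest row of the recalculated table. Because that table was computed from the same texts, each influencer is expected to match their own row, so an accuracy of 1.0 confirms that the pipeline is consistent; it says nothing about how well unseen texts are matched.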

In [79]:
# Dot product of the trait <-> influencer relation
def get_influencer_dot_product(trait_output: list, influencer_dataframe: pd.DataFrame) -> pd.DataFrame:
    """Multiply the influencer table with a vector of trait scores.

    Args:
        trait_output: One score per column of ``influencer_dataframe``.
        influencer_dataframe: Table with one row per influencer.

    Returns:
        Whatever ``DataFrame.dot`` yields for ``trait_output``; for a
        one-dimensional input that is one value per influencer row.
    """
    influencer_scores = influencer_dataframe.dot(trait_output)
    return influencer_scores

# Method for calculating the similarity
def calculate_similarity(post_text: str,
                         word_map: list,
                         word_dataframe: pd.DataFrame,
                         influencer_dataframe: pd.DataFrame) -> pd.Series:
    """Measure how far a text is from every influencer's trait profile.

    The text is turned into trait scores with ``get_trait_dot_product``. The
    Euclidean distance between those scores and each row of
    ``influencer_dataframe`` is then computed, so the closest influencer is
    the one with the smallest value (``idxmin``).

    The normalised trait-influencer dot product that used to be computed here
    was never part of the returned value and has been dropped. The result is
    created with an explicit float dtype, which avoids the pandas warning
    about empty Series without a dtype.

    Args:
        post_text: Raw text to compare.
        word_map: Vocabulary passed through to ``get_trait_dot_product``.
        word_dataframe: Word weight table passed through to
            ``get_trait_dot_product``.
        influencer_dataframe: One row per influencer, one column per trait.

    Returns:
        Series indexed like ``influencer_dataframe`` holding the distance
        between each influencer row and the trait scores of the text.
    """
    # Calculate word-trait dot product
    post_result = get_trait_dot_product(post_text, word_map, word_dataframe)

    # Subtract the post scores from every influencer row. The labels of
    # post_result are aligned with the columns, as they were in the former
    # row-by-row Series subtraction.
    differences = influencer_dataframe.sub(post_result, axis="columns")

    # Euclidean norm of every row at once
    distances = np.linalg.norm(differences.to_numpy(dtype=float), axis=1)

    return pd.Series(distances, index=influencer_dataframe.index, dtype=float)

In [81]:
# Count how many users are matched to their own row of new_arch_df
pbar = tqdm(arch_df.iterrows())
accuracy = 0

for idx, row in pbar:
    # Rebuild the user's text from the collected token lists
    user_text = " ".join(itertools.chain.from_iterable(posts[users.index(idx)]))

    # The best match is the influencer with the smallest distance
    sim_output = calculate_similarity(user_text, word_map, word_df, new_arch_df)
    if sim_output.idxmin() == idx:
        accuracy += 1
    pbar.set_description(f"Matched influencers: {accuracy}")

  inf_df = pd.Series(index=influencer_dataframe.index)
Matched influencers: 508: : 508it [09:49,  1.16s/it]


In [82]:
# Share of users matched to themselves. The check loop iterates over arch_df,
# so its length is the number of evaluated users; NaN is reported when no
# user was evaluated.
evaluated_users = len(arch_df)
accuracy_ratio = accuracy / evaluated_users if evaluated_users else float("nan")
print(f"Accuracy: {accuracy_ratio}")

Accuracy: 1.0
