In [1]:
# Associate words with archetypes/character traits as intermediate layer
# and with influencer as the "last" layer

# Dependencies
import pandas as pd
import numpy as np
from tqdm import tqdm
import copy
import os
import toml
import re
import itertools
from text_cleaner import *
import operator
from collections import Counter
from spmf import Spmf

def extract_hashtags(post_text):
    """Return all hashtags found in ``post_text``.

    A hashtag is a ``#`` followed by one or more word characters. Matches are
    returned as a list of strings (including the leading ``#``), in the order
    they appear in the text; the list is empty when there is no hashtag.
    """
    hashtag_pattern = re.compile(r"\#\w+")
    return hashtag_pattern.findall(post_text)

In [2]:
# Read the archetype annotations; the first CSV column is used as the index
arch_df = pd.read_csv('archetypes_pl.csv', index_col=0)

# Keep the column order around - each column is one archetype/trait
trait_list = list(arch_df.columns)

# Print the trait names, then display the first rows of the table
print(trait_list)
arch_df.head()

['innocent', 'sage', 'explorer', 'outlaw', 'magician', 'hero', 'lover', 'jester', 'everyman', 'caregiver', 'ruler', 'creator', 'dominant', 'submissive', 'maximalist', 'minimalist', 'inspiring', 'systematic', 'discovering', 'conservative', 'verifying', 'overlooking', 'sharpening', 'harmonic', 'empathic', 'matter_of_fact', 'brave', 'protective', 'generous', 'thrifty', 'favourable', 'balanced', 'sensuality', 'intelligent', 'believe', 'egocentric', 'allocentric']


Unnamed: 0_level_0,innocent,sage,explorer,outlaw,magician,hero,lover,jester,everyman,caregiver,...,protective,generous,thrifty,favourable,balanced,sensuality,intelligent,believe,egocentric,allocentric
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
marek_grodzki,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,2.0,3.0,4.0,4.0,3.0,4.0,4.0,3.0,0.0,0.0
vege_style_life,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,4.0,4.0,4.0,3.0,3.0,3.0,2.0,0.0,3.0
oliwka__2007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,2.0,2.0,0.0,3.0,1.0,2.0,4.0,1.0,0.0,3.0
z_przestrzeni_serca,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,3.0,0.0,4.0,4.0,3.0,4.0,4.0,0.0,1.0
zaradne_warsztaty,3.0,0.0,0.0,0.0,3.0,0.0,0.0,2.0,3.0,4.0,...,3.0,4.0,0.0,2.0,2.0,4.0,2.0,3.0,1.0,3.0


In [3]:
# Table preprocessing - replace all NaN with 2 (Unrelated/Don't know class),
# then rescale the 0-4 annotation values to the range -1.0 - 1.0.
# NOTE: this cell overwrites arch_df, so run it exactly once after loading the
# .csv - running it again would remap the already rescaled values.
SCORE_MAPPING = {0.0: -1.0, 1.0: -0.5, 2.0: 0.0, 3.0: 0.5, 4.0: 1.0}

arch_df = arch_df.fillna(2.0)

# One dict-based replace instead of five chained calls: every value is mapped
# in a single pass, so the result cannot depend on the order of the steps
# (e.g. the 1.0 produced from 4.0 can never be remapped to -0.5).
arch_df = arch_df.replace(SCORE_MAPPING)

# Remove duplicated annotations, to exclude conflicting entries
# (only the first row of every repeated id is kept)
arch_df = arch_df[~arch_df.index.duplicated(keep='first')]

# Print the head of the dataset after modification
arch_df.head()

Unnamed: 0_level_0,innocent,sage,explorer,outlaw,magician,hero,lover,jester,everyman,caregiver,...,protective,generous,thrifty,favourable,balanced,sensuality,intelligent,believe,egocentric,allocentric
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
marek_grodzki,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,...,0.0,0.5,1.0,1.0,0.5,1.0,1.0,0.5,-1.0,-1.0
vege_style_life,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,1.0,1.0,1.0,1.0,0.5,0.5,0.5,0.0,-1.0,0.5
oliwka__2007,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,...,0.0,0.0,-1.0,0.5,-0.5,0.0,1.0,-0.5,-1.0,0.5
z_przestrzeni_serca,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,1.0,0.5,-1.0,1.0,1.0,0.5,1.0,1.0,-1.0,-0.5
zaradne_warsztaty,0.5,-1.0,-1.0,-1.0,0.5,-1.0,-1.0,0.0,0.5,1.0,...,0.5,1.0,-1.0,0.0,0.0,1.0,0.0,0.5,-0.5,0.5


In [4]:
# Check if a user has a non-empty directory in the dataset, otherwise delete the user from the list.
# Produces three aligned objects: `available_arch_df` (annotations of usable profiles),
# `posts` (per profile: list of cleaned posts) and `hashtags` (per profile: flat list of hashtags).
BASE_DIR = "instagram_dataset/pl"

# Index the dataset once: profile name -> category directory that holds it.
# The first category (in os.listdir order) wins, like a first-match search would.
profile_dirs = {}
for cat_dir in os.listdir(BASE_DIR):
    whole_cat_dir = os.path.join(BASE_DIR, cat_dir)
    # Ignore stray files lying next to the category directories
    if not os.path.isdir(whole_cat_dir):
        continue
    for profile_name in os.listdir(whole_cat_dir):
        profile_dirs.setdefault(profile_name, whole_cat_dir)

posts = []
hashtags = []
missing_profiles = []

# Iterate over all annotated profiles
for i in tqdm(arch_df.index):
    whole_cat_dir = profile_dirs.get(i)

    # The profile is absent from every category - it must be dropped as well,
    # otherwise `available_arch_df` would have more rows than `posts` has entries
    if whole_cat_dir is None:
        missing_profiles.append(i)
        print(f"Profile {i} not found in the dataset.")
        continue

    profile_path = os.path.join(whole_cat_dir, i)
    profile_config_path = os.path.join(profile_path, f"{i}.toml")

    # Check if there's a .toml file - if not, omit the profile
    if not os.path.exists(profile_config_path):
        missing_profiles.append(i)
        print(f"Profile {i} has no posts.")
        continue

    profile_posts = []
    profile_hashtags = []
    for file in os.listdir(profile_path):
        # Everything except the .toml config is treated as a post file
        if file.endswith(".toml"):
            continue
        # Explicit encoding so reading does not depend on the platform default
        with open(os.path.join(profile_path, file), "r", encoding="utf-8") as post_f:
            read_text = post_f.read()
        profile_posts.append(remove_stopwords(clean_up_text(read_text)))
        profile_hashtags.extend(extract_hashtags(read_text))

    # A single list per influencer, appended in the same order as the kept rows
    hashtags.append(profile_hashtags)
    posts.append(profile_posts)

# Drop all unusable profiles at once (arch_df itself is left untouched)
available_arch_df = arch_df.drop(index=missing_profiles)

# Positional lookups into posts/hashtags rely on this alignment
assert len(posts) == len(hashtags) == len(available_arch_df)

9it [00:00, 39.30it/s]

Profile zaradne_warsztaty has no posts.


24it [00:00, 47.26it/s]

Profile ilona_browstylist has no posts.
Profile pracownia.lepiej has no posts.


35it [00:00, 44.27it/s]

Profile kamann_living has no posts.
Profile natalie_interiors has no posts.


51it [00:01, 45.58it/s]

Profile krewetkowo has no posts.
Profile eliza.gwiazda_official has no posts.
Profile gettinenglish has no posts.


72it [00:01, 45.44it/s]

Profile paulina.ihnat has no posts.
Profile home_in_garden has no posts.


87it [00:01, 44.00it/s]

Profile karola_moskal has no posts.
Profile wierzbowa_architektura has no posts.
Profile justka.ka has no posts.
Profile mamologia has no posts.
Profile kwejk has no posts.


109it [00:02, 37.05it/s]

Profile swiatwiedzy has no posts.


131it [00:03, 45.31it/s]

Profile klaudia_lasecka has no posts.
Profile dom.w.kwiatach has no posts.
Profile ak.kingamadej has no posts.
Profile owsianapl has no posts.
Profile _agnieszka_leszczynska has no posts.


153it [00:03, 45.81it/s]

Profile czarna.owieczka has no posts.
Profile hellohomla has no posts.
Profile kulturoholiczka has no posts.


163it [00:03, 41.36it/s]

Profile aniolnaresorach has no posts.
Profile krusia_domatorka has no posts.


175it [00:04, 46.90it/s]

Profile gotowanie_po_zmianie has no posts.


194it [00:04, 50.04it/s]

Profile ahojprzyrodo has no posts.
Profile mr.stejku has no posts.
Profile kinianieruda has no posts.


211it [00:04, 42.41it/s]

Profile remont_ciala has no posts.
Profile cooosure has no posts.


221it [00:05, 54.06it/s]

Profile martapazera has no posts.
Profile zdrowy_talerz has no posts.


236it [00:05, 56.84it/s]

Profile annamboland has no posts.
Profile marketing_w_pigulce has no posts.
Profile achdeco_polska has no posts.


250it [00:05, 51.60it/s]

Profile imsokayka has no posts.
Profile prosto.w.szarosci has no posts.


264it [00:05, 53.60it/s]

Profile podroze.rodzinne has no posts.
Profile fitbadurka has no posts.
Profile panifortuna has no posts.
Profile kinga_strzalka has no posts.


277it [00:06, 52.94it/s]

Profile gotowe_projekty_domow_archon has no posts.
Profile myblogyoll has no posts.


289it [00:06, 50.25it/s]

Profile zuzia_niemczycka has no posts.
Profile justa_w_ogrodzie has no posts.


307it [00:06, 52.32it/s]

Profile agiksonn has no posts.
Profile 620_nad_poziomem_morza has no posts.


313it [00:06, 47.05it/s]

Profile fit_gruszecka has no posts.
Profile wydawnictwoznakpl has no posts.


320it [00:07, 48.75it/s]

Profile martynagrajcke has no posts.


332it [00:07, 46.07it/s]

Profile staraochota_skrawki has no posts.
Profile nabakowskapracowniawnetrz has no posts.


354it [00:07, 43.91it/s]

Profile paulinarubaszka has no posts.
Profile karolina_er_ has no posts.


383it [00:08, 43.06it/s]

Profile aga_bugaj has no posts.
Profile lab.07 has no posts.


405it [00:09, 41.49it/s]

Profile pani_tester has no posts.
Profile blogtasteaway has no posts.


423it [00:09, 50.70it/s]

Profile onajedna_home has no posts.
Profile languagebay has no posts.


442it [00:09, 56.42it/s]

Profile alabasterfox has no posts.
Profile aleksandra.herec has no posts.
Profile rykalskaa has no posts.
Profile domiogrod_przedsnem has no posts.
Profile maddlajnn has no posts.


460it [00:10, 44.83it/s]

Profile poczujsielepiej has no posts.
Profile naszswiatt has no posts.


472it [00:10, 47.83it/s]

Profile zdrowoczylisexy has no posts.
Profile rutynowa has no posts.
Profile aga.lanius has no posts.
Profile balickadesign has no posts.


504it [00:11, 55.86it/s]

Profile nowinki.sklepowe has no posts.
Profile moda_na_klasyki has no posts.


508it [00:11, 45.53it/s]


In [5]:
# Preview the filtered table and report how many profiles are left
head_preview = available_arch_df.head()
print(head_preview)
remaining_count = len(available_arch_df)
print(f"Available dataset length: {remaining_count}")

                     innocent  sage  explorer  outlaw  magician  hero  lover  \
id                                                                             
marek_grodzki            -1.0  -1.0      -1.0    -1.0      -1.0  -1.0   -1.0   
vege_style_life          -1.0  -1.0       1.0    -1.0      -1.0  -1.0   -1.0   
oliwka__2007             -1.0  -1.0      -1.0    -1.0      -1.0  -1.0   -1.0   
z_przestrzeni_serca       1.0  -1.0      -1.0    -1.0      -1.0  -1.0   -1.0   
snatch.machine           -1.0  -0.5       0.5     0.5       0.0   1.0   -1.0   

                     jester  everyman  caregiver  ...  protective  generous  \
id                                                ...                         
marek_grodzki           1.0      -1.0       -1.0  ...         0.0       0.5   
vege_style_life        -1.0      -1.0       -1.0  ...         1.0       1.0   
oliwka__2007           -1.0       1.0       -1.0  ...         0.0       0.0   
z_przestrzeni_serca    -1.0      -1.0       

In [6]:
# Map usernames to indices
# `users` keeps the row order of available_arch_df; `user_indices` maps each
# username to its position in that list.
users = list(available_arch_df.index.values)

# Single pass instead of calling users.index() for every user (quadratic).
# setdefault keeps the first position if a username ever repeats, which is
# exactly what list.index() returns.
user_indices = {}
for position, username in enumerate(users):
    user_indices.setdefault(username, position)

In [23]:
# Get the word count and create a dataframe, where columns are archetypes/traits, and rows are single words.
# For every trait: mine frequent word sequences from the posts of all profiles whose
# annotation for that trait is non-zero, add the relative hashtag frequencies of the
# same profiles, pickle the per-trait result and stack all traits into `word_df`.

# Arguments handed to SPMF's PrefixSpan. Presumably the minimum support (as a
# fraction of the input sequences) and the maximum pattern length - TODO confirm
# against the SPMF documentation.
SPMF_MIN_SUPPORT = 0.00025
SPMF_MAX_PATTERN_LEN = 3

# The per-trait pickles are written here; make sure the directory exists
os.makedirs("dfs", exist_ok=True)

# One single-row frame per trait; concatenated once after the loop instead of
# growing word_df inside it
trait_frames = []

# Iterate over all of the traits/archetypes
for trait in tqdm(trait_list):
    # Profiles with a non-zero annotation for this trait
    subset_df = available_arch_df.loc[available_arch_df[trait] != 0, trait]
    subset_indices = [user_indices[idx] for idx in subset_df.index.values]

    if not subset_indices:
        # Nothing to mine for this trait
        print(f"No profiles with a non-zero '{trait}' annotation - skipping.")
        continue

    # Get all posts for the list of influencers.
    # Plain indexing on purpose: operator.itemgetter returns a bare item (not a
    # tuple) for a single index and raises when given no index at all.
    post_list = [" ".join(post) for idx in subset_indices for post in posts[idx]]

    # SPMF - get the most frequent sequences
    spmf = Spmf("PrefixSpan", input_direct=post_list,
                output_filename="output.txt",
                arguments=[SPMF_MIN_SUPPORT, SPMF_MAX_PATTERN_LEN], input_type="text")
    spmf.run()
    spmf_df = spmf.to_pandas_dataframe(pickle=False)

    # Normalize the support separately for every pattern length, so that the
    # values of each length sum up to 1
    pattern_len = spmf_df["pattern"].map(len)
    spmf_df["sup"] = spmf_df["sup"] / spmf_df.groupby(pattern_len)["sup"].transform("sum")

    # Convert the pattern lists to strings, so they can serve as index labels
    spmf_df["pattern"] = spmf_df["pattern"].apply(str)

    # Relative frequency of every hashtag used by the selected profiles.
    # (Counter only holds hashtags that occurred at least once, so no count
    # filter is needed.)
    trait_ctr = Counter(itertools.chain.from_iterable(hashtags[idx] for idx in subset_indices))
    trait_total = sum(trait_ctr.values())
    trait_ctr = {trait: {k: v / trait_total for k, v in trait_ctr.items()}}

    tmp_df = pd.DataFrame.from_dict(trait_ctr, orient="columns")

    # Name the value column after the trait and index the rows by pattern
    spmf_df = spmf_df.rename(columns={"sup": trait}).set_index("pattern")

    # Append the hashtags (DataFrame.append was removed in pandas 2.0)
    spmf_df = pd.concat([spmf_df, tmp_df])
    print(spmf_df)

    # One row per trait, one column per pattern/hashtag
    spmf_df = spmf_df.transpose()
    spmf_df.to_pickle(f"dfs/{trait}.pickle")

    trait_frames.append(spmf_df)

# Stack the per-trait rows; columns are the union of all patterns/hashtags
word_df = pd.concat(trait_frames) if trait_frames else pd.DataFrame()

  0%|          | 0/37 [00:00<?, ?it/s]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 6171 ms
 Frequent sequences count : 1862729
 Max memory (mb) : 1029.111328125
 minsup = 4 sequences.
 Pattern count : 1862729

Post-processing to show result in terms of string values.
Post-processing completed.

                                         innocent
['zainspirowany']                    1.298465e-05
['wczorajszym']                      5.193861e-05
['wczorajszym', 'poście']            2.412996e-06
['wczorajszym', 'poście', 'innymi']  4.845665e-07
['wczorajszym', 'naprawdę']          1.930397e-06
...                                           ...
#여성                                  3.679975e-06
#ﬁtnessgirl                          3.679975e-06
#ａｅｓｔｈｅｔｉｃ                           5.151965e-05
#𝐞𝐜𝐨𝐃𝐲𝐰                              3.679975e-06
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą                 3.679975e-06

[1913357 rows x 1 columns]


  3%|▎         | 1/37 [00:22<13:33, 22.59s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 7880 ms
 Frequent sequences count : 3318340
 Max memory (mb) : 1131.62109375
 minsup = 3 sequences.
 Pattern count : 3318340

Post-processing to show result in terms of string values.
Post-processing completed.

                                             sage
['zainspirowany']                    1.159027e-05
['wczorajszym']                      5.505379e-05
['wczorajszym', 'poście']            2.071720e-06
['wczorajszym', 'poście', 'innymi']  3.600215e-07
['wczorajszym', 'naprawdę']          1.657376e-06
...                                           ...
#інтер                               4.562231e-06
#ねこ                                  9.124462e-06
#猫                                   9.124462e-06
#ﬁtnessgirl                          4.562231e-06
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą                 4.562231e-06

[3362192 rows x 1 columns]


  5%|▌         | 2/37 [01:30<28:48, 49.37s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 6922 ms
 Frequent sequences count : 3474220
 Max memory (mb) : 1060.6376953125
 minsup = 3 sequences.
 Pattern count : 3474220

Post-processing to show result in terms of string values.
Post-processing completed.

                                         explorer
['zainspirowany']                    1.625472e-05
['wczorajszym']                      5.851698e-05
['wczorajszym', 'poście']            2.364591e-06
['wczorajszym', 'poście', 'innymi']  3.269538e-07
['wczorajszym', 'naprawdę']          1.418754e-06
...                                           ...
#猫                                   8.769122e-06
#ﬁtnessgirl                          4.384561e-06
#ａｅｓｔｈｅｔｉｃ                           4.384561e-05
#ｔｈｏｕｇｈｔｓ                            4.384561e-06
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą                 4.384561e-06

[3519096 rows x 1 columns]


  8%|▊         | 3/37 [03:23<44:23, 78.35s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 6257 ms
 Frequent sequences count : 3025204
 Max memory (mb) : 1024.6738204956055
 minsup = 3 sequences.
 Pattern count : 3025204

Post-processing to show result in terms of string values.
Post-processing completed.

                                           outlaw
['zainspirowany']                    1.422910e-05
['wczorajszym']                      5.691641e-05
['wczorajszym', 'poście']            2.762100e-06
['wczorajszym', 'poście', 'innymi']  3.827676e-07
['wczorajszym', 'naprawdę']          1.657260e-06
...                                           ...
#ねこ                                  1.000655e-05
#猫                                   1.000655e-05
#ﬁtnessgirl                          5.003277e-06
#ａｅｓｔｈｅｔｉｃ                           5.003277e-05
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą                 5.003277e-06

[3067681 rows x 1 columns]


 11%|█         | 4/37 [05:14<50:11, 91.25s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 6314 ms
 Frequent sequences count : 3051974
 Max memory (mb) : 942.35009765625
 minsup = 3 sequences.
 Pattern count : 3051974

Post-processing to show result in terms of string values.
Post-processing completed.

                                         magician
['zainspirowany']                    1.482937e-05
['wczorajszym']                      5.931748e-05
['wczorajszym', 'poście']            2.782643e-06
['wczorajszym', 'poście', 'innymi']  3.768889e-07
['wczorajszym', 'naprawdę']          1.669586e-06
...                                           ...
#猫                                   1.050873e-05
#美少女戦士セーラームーン                        5.254364e-06
#ﬁtnessgirl                          5.254364e-06
#ａｅｓｔｈｅｔｉｃ                           5.254364e-05
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą                 5.254364e-06

[3092284 rows x 1 columns]


 14%|█▎        | 5/37 [07:10<53:25, 100.16s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 6152 ms
 Frequent sequences count : 3045090
 Max memory (mb) : 956.677734375
 minsup = 3 sequences.
 Pattern count : 3045090

Post-processing to show result in terms of string values.
Post-processing completed.

                                             hero
['zainspirowany']                    1.849530e-05
['wczorajszym']                      7.398119e-05
['wczorajszym', 'poście']            2.810517e-06
['wczorajszym', 'poście', 'innymi']  3.792890e-07
['wczorajszym', 'naprawdę']          1.686310e-06
...                                           ...
#猫                                   1.039652e-05
#ﬁtbody                              5.198262e-06
#ﬁtnessgirl                          5.198262e-06
#ａｅｓｔｈｅｔｉｃ                           5.198262e-05
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą                 5.198262e-06

[3084848 rows x 1 columns]


 16%|█▌        | 6/37 [09:01<53:42, 103.96s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 5447 ms
 Frequent sequences count : 1684610
 Max memory (mb) : 1066.08349609375
 minsup = 4 sequences.
 Pattern count : 1684610

Post-processing to show result in terms of string values.
Post-processing completed.

                                            lover
['zainspirowany']                    1.217345e-05
['wczorajszym']                      4.565043e-05
['wczorajszym', 'poście']            2.885330e-06
['wczorajszym', 'poście', 'innymi']  5.350438e-07
['wczorajszym', 'innymi']            2.308264e-06
...                                           ...
#ﬁtfam                               4.215105e-06
#ﬁtnessgirl                          4.215105e-06
#ａｅｓｔｈｅｔｉｃ                           4.215105e-05
#𝐞𝐜𝐨𝐃𝐲𝐰                              4.215105e-06
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą                 4.215105e-06

[1730639 rows x 1 columns]


 19%|█▉        | 7/37 [10:22<48:14, 96.49s/it] 

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 6130 ms
 Frequent sequences count : 3220611
 Max memory (mb) : 937.3232421875
 minsup = 3 sequences.
 Pattern count : 3220611

Post-processing to show result in terms of string values.
Post-processing completed.

                                           jester
['zainspirowany']                    1.712851e-05
['wczorajszym']                      5.823693e-05
['wczorajszym', 'poście']            2.545038e-06
['wczorajszym', 'poście', 'innymi']  3.576034e-07
['wczorajszym', 'naprawdę']          1.527023e-06
...                                           ...
#猫                                   9.677076e-06
#ﬁtfam                               4.838538e-06
#ﬁtnessgirl                          4.838538e-06
#ａｅｓｔｈｅｔｉｃ                           4.838538e-05
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą                 4.838538e-06

[3263198 rows x 1 columns]


 22%|██▏       | 8/37 [12:07<47:54, 99.12s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 6813 ms
 Frequent sequences count : 1982895
 Max memory (mb) : 1155.046875
 minsup = 4 sequences.
 Pattern count : 1982895

Post-processing to show result in terms of string values.
Post-processing completed.

                                         everyman
['zainspirowany']                    1.008090e-05
['wczorajszym']                      6.048540e-05
['wczorajszym', 'poście']            2.243302e-06
['wczorajszym', 'poście', 'innymi']  4.506976e-07
['wczorajszym', 'naprawdę']          1.794642e-06
...                                           ...
#美少女戦士セーラームーン                        3.542005e-06
#여성                                  3.542005e-06
#ﬁtnessgirl                          3.542005e-06
#ａｅｓｔｈｅｔｉｃ                           4.958806e-05
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą                 3.542005e-06

[2033546 rows x 1 columns]


 24%|██▍       | 9/37 [13:23<42:50, 91.82s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 6850 ms
 Frequent sequences count : 3409258
 Max memory (mb) : 979.24365234375
 minsup = 3 sequences.
 Pattern count : 3409258

Post-processing to show result in terms of string values.
Post-processing completed.

                                        caregiver
['zainspirowany']                    1.215166e-05
['wczorajszym']                      5.164458e-05
['wczorajszym', 'poście']            2.275906e-06
['wczorajszym', 'poście', 'innymi']  3.379707e-07
['wczorajszym', 'naprawdę']          1.365544e-06
...                                           ...
#猫                                   9.038690e-06
#鋸                                   4.519345e-06
#ﬁtnessgirl                          4.519345e-06
#ａｅｓｔｈｅｔｉｃ                           4.519345e-05
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą                 4.519345e-06

[3453316 rows x 1 columns]


 27%|██▋       | 10/37 [14:59<41:56, 93.20s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 5625 ms
 Frequent sequences count : 2845383
 Max memory (mb) : 939.92236328125
 minsup = 3 sequences.
 Pattern count : 2845383

Post-processing to show result in terms of string values.
Post-processing completed.

                                            ruler
['zainspirowany']                    1.474252e-05
['wczorajszym']                      5.897009e-05
['wczorajszym', 'poście']            2.819134e-06
['wczorajszym', 'poście', 'innymi']  4.015145e-07
['wczorajszym', 'naprawdę']          1.691481e-06
...                                           ...
#木工教室                                5.330178e-06
#猫                                   1.066036e-05
#鋸                                   5.330178e-06
#ﬁtnessgirl                          5.330178e-06
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą                 5.330178e-06

[2884646 rows x 1 columns]


 30%|██▉       | 11/37 [16:58<43:48, 101.09s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 5961 ms
 Frequent sequences count : 3116909
 Max memory (mb) : 1004.85595703125
 minsup = 3 sequences.
 Pattern count : 3116909

Post-processing to show result in terms of string values.
Post-processing completed.

                                          creator
['zainspirowany']                    1.355128e-05
['wczorajszym']                      5.759295e-05
['wczorajszym', 'poście']            2.647918e-06
['wczorajszym', 'poście', 'innymi']  3.704289e-07
['wczorajszym', 'naprawdę']          1.588751e-06
...                                           ...
#ﬁtnessgirl                          4.515509e-06
#ａｅｓｔｈｅｔｉｃ                           4.515509e-05
#ｔｈｏｕｇｈｔｓ                            4.515509e-06
#𝐞𝐜𝐨𝐃𝐲𝐰                              4.515509e-06
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą                 4.515509e-06

[3160797 rows x 1 columns]


 32%|███▏      | 12/37 [18:46<43:00, 103.22s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 6545 ms
 Frequent sequences count : 1117635
 Max memory (mb) : 1224.47900390625
 minsup = 5 sequences.
 Pattern count : 1117635

Post-processing to show result in terms of string values.
Post-processing completed.

                            dominant
['zainspirowany']           0.000013
['wczorajszym']             0.000053
['wczorajszym', 'poście']   0.000002
['wczorajszym', 'przepis']  0.000002
['odnośnie']                0.000096
...                              ...
#ﬁtnessgirl                 0.000003
#ａｅｓｔｈｅｔｉｃ                  0.000042
#ｔｈｏｕｇｈｔｓ                   0.000003
#𝐞𝐜𝐨𝐃𝐲𝐰                     0.000003
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą        0.000003

[1175258 rows x 1 columns]


 35%|███▌      | 13/37 [20:49<43:38, 109.09s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 6731 ms
 Frequent sequences count : 1143408
 Max memory (mb) : 1174.66796875
 minsup = 5 sequences.
 Pattern count : 1143408

Post-processing to show result in terms of string values.
Post-processing completed.

                            submissive
['zainspirowany']             0.000013
['wczorajszym']               0.000050
['wczorajszym', 'poście']     0.000002
['wczorajszym', 'przepis']    0.000002
['odnośnie']                  0.000094
...                                ...
#ﬁtnessgirl                   0.000003
#ａｅｓｔｈｅｔｉｃ                    0.000041
#ｔｈｏｕｇｈｔｓ                     0.000003
#𝐞𝐜𝐨𝐃𝐲𝐰                       0.000003
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą          0.000003

[1202417 rows x 1 columns]


 38%|███▊      | 14/37 [22:54<43:40, 113.92s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 6449 ms
 Frequent sequences count : 1115610
 Max memory (mb) : 1156.72900390625
 minsup = 5 sequences.
 Pattern count : 1115610

Post-processing to show result in terms of string values.
Post-processing completed.

                            maximalist
['zainspirowany']             0.000013
['wczorajszym']               0.000055
['wczorajszym', 'poście']     0.000002
['wczorajszym', 'przepis']    0.000002
['odnośnie']                  0.000091
...                                ...
#ﬁtnessgirl                   0.000003
#ａｅｓｔｈｅｔｉｃ                    0.000041
#ｔｈｏｕｇｈｔｓ                     0.000003
#𝐞𝐜𝐨𝐃𝐲𝐰                       0.000003
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą          0.000003

[1173796 rows x 1 columns]


 41%|████      | 15/37 [25:01<43:15, 117.98s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 6608 ms
 Frequent sequences count : 1128848
 Max memory (mb) : 1152.71923828125
 minsup = 5 sequences.
 Pattern count : 1128848

Post-processing to show result in terms of string values.
Post-processing completed.

                            minimalist
['zainspirowany']             0.000013
['wczorajszym']               0.000054
['wczorajszym', 'poście']     0.000002
['wczorajszym', 'przepis']    0.000002
['odnośnie']                  0.000096
...                                ...
#ﬁtnessgirl                   0.000003
#ａｅｓｔｈｅｔｉｃ                    0.000042
#ｔｈｏｕｇｈｔｓ                     0.000003
#𝐞𝐜𝐨𝐃𝐲𝐰                       0.000003
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą          0.000003

[1186302 rows x 1 columns]


 43%|████▎     | 16/37 [27:06<41:58, 119.93s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 6766 ms
 Frequent sequences count : 1215650
 Max memory (mb) : 1159.3076171875
 minsup = 5 sequences.
 Pattern count : 1215650

Post-processing to show result in terms of string values.
Post-processing completed.

                            inspiring
['zainspirowany']            0.000012
['wczorajszym']              0.000054
['wczorajszym', 'poście']    0.000002
['wczorajszym', 'przepis']   0.000002
['odnośnie']                 0.000093
...                               ...
#ﬁtnessgirl                  0.000003
#ａｅｓｔｈｅｔｉｃ                   0.000041
#ｔｈｏｕｇｈｔｓ                    0.000003
#𝐞𝐜𝐨𝐃𝐲𝐰                      0.000003
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą         0.000003

[1275002 rows x 1 columns]


 46%|████▌     | 17/37 [29:17<41:03, 123.18s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 6423 ms
 Frequent sequences count : 1159984
 Max memory (mb) : 1161.966796875
 minsup = 5 sequences.
 Pattern count : 1159984

Post-processing to show result in terms of string values.
Post-processing completed.

                            systematic
['zainspirowany']             0.000013
['wczorajszym']               0.000052
['wczorajszym', 'poście']     0.000002
['wczorajszym', 'przepis']    0.000002
['odnośnie']                  0.000085
...                                ...
#ﬁtnessgirl                   0.000003
#ａｅｓｔｈｅｔｉｃ                    0.000043
#ｔｈｏｕｇｈｔｓ                     0.000003
#𝐞𝐜𝐨𝐃𝐲𝐰                       0.000003
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą          0.000003

[1217298 rows x 1 columns]


 49%|████▊     | 18/37 [31:25<39:32, 124.89s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 7056 ms
 Frequent sequences count : 1222535
 Max memory (mb) : 1102.037109375
 minsup = 5 sequences.
 Pattern count : 1222535

Post-processing to show result in terms of string values.
Post-processing completed.

                            discovering
['zainspirowany']              0.000012
['wczorajszym']                0.000052
['wczorajszym', 'poście']      0.000002
['wczorajszym', 'przepis']     0.000002
['odnośnie']                   0.000092
...                                 ...
#ﬁtnessgirl                    0.000003
#ａｅｓｔｈｅｔｉｃ                     0.000040
#ｔｈｏｕｇｈｔｓ                      0.000003
#𝐞𝐜𝐨𝐃𝐲𝐰                        0.000003
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą           0.000003

[1283415 rows x 1 columns]


 51%|█████▏    | 19/37 [33:37<38:02, 126.78s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 5990 ms
 Frequent sequences count : 1091872
 Max memory (mb) : 1241.73291015625
 minsup = 5 sequences.
 Pattern count : 1091872

Post-processing to show result in terms of string values.
Post-processing completed.

                            conservative
['zainspirowany']               0.000014
['wczorajszym']                 0.000052
['wczorajszym', 'poście']       0.000002
['wczorajszym', 'przepis']      0.000002
['odnośnie']                    0.000097
...                                  ...
#ﬁtnessgirl                     0.000003
#ａｅｓｔｈｅｔｉｃ                      0.000044
#ｔｈｏｕｇｈｔｓ                       0.000003
#𝐞𝐜𝐨𝐃𝐲𝐰                         0.000003
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą            0.000003

[1148328 rows x 1 columns]


 54%|█████▍    | 20/37 [35:41<35:43, 126.11s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 6299 ms
 Frequent sequences count : 1167901
 Max memory (mb) : 894.4108734130859
 minsup = 5 sequences.
 Pattern count : 1167901

Post-processing to show result in terms of string values.
Post-processing completed.

                            verifying
['zainspirowany']            0.000013
['wczorajszym']              0.000053
['wczorajszym', 'poście']    0.000002
['wczorajszym', 'przepis']   0.000002
['odnośnie']                 0.000091
...                               ...
#ﬁtnessgirl                  0.000003
#ａｅｓｔｈｅｔｉｃ                   0.000043
#ｔｈｏｕｇｈｔｓ                    0.000003
#𝐞𝐜𝐨𝐃𝐲𝐰                      0.000003
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą         0.000003

[1225118 rows x 1 columns]


 57%|█████▋    | 21/37 [37:54<34:10, 128.13s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 6231 ms
 Frequent sequences count : 1144627
 Max memory (mb) : 1208.8984375
 minsup = 5 sequences.
 Pattern count : 1144627

Post-processing to show result in terms of string values.
Post-processing completed.

                            overlooking
['zainspirowany']              0.000013
['wczorajszym']                0.000053
['wczorajszym', 'przepis']     0.000002
['odnośnie']                   0.000099
['odnośnie', 'życia']          0.000003
...                                 ...
#ﬁtnessgirl                    0.000003
#ａｅｓｔｈｅｔｉｃ                     0.000044
#ｔｈｏｕｇｈｔｓ                      0.000003
#𝐞𝐜𝐨𝐃𝐲𝐰                        0.000003
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą           0.000003

[1200713 rows x 1 columns]


 59%|█████▉    | 22/37 [40:01<31:58, 127.88s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 6101 ms
 Frequent sequences count : 1118895
 Max memory (mb) : 1176.20556640625
 minsup = 5 sequences.
 Pattern count : 1118895

Post-processing to show result in terms of string values.
Post-processing completed.

                            sharpening
['zainspirowany']             0.000013
['wczorajszym']               0.000053
['wczorajszym', 'poście']     0.000002
['wczorajszym', 'przepis']    0.000002
['odnośnie']                  0.000098
...                                ...
#ﬁtnessgirl                   0.000003
#ａｅｓｔｈｅｔｉｃ                    0.000013
#ｔｈｏｕｇｈｔｓ                     0.000003
#𝐞𝐜𝐨𝐃𝐲𝐰                       0.000003
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą          0.000003

[1173796 rows x 1 columns]


 62%|██████▏   | 23/37 [42:07<29:42, 127.29s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 6326 ms
 Frequent sequences count : 1174126
 Max memory (mb) : 1121.947265625
 minsup = 5 sequences.
 Pattern count : 1174126

Post-processing to show result in terms of string values.
Post-processing completed.

                            harmonic
['zainspirowany']           0.000013
['wczorajszym']             0.000049
['wczorajszym', 'poście']   0.000002
['wczorajszym', 'przepis']  0.000002
['odnośnie']                0.000092
...                              ...
#ﬁtnessgirl                 0.000003
#ａｅｓｔｈｅｔｉｃ                  0.000043
#ｔｈｏｕｇｈｔｓ                   0.000003
#𝐞𝐜𝐨𝐃𝐲𝐰                     0.000003
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą        0.000003

[1232017 rows x 1 columns]


 65%|██████▍   | 24/37 [44:18<27:46, 128.19s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 6587 ms
 Frequent sequences count : 1144450
 Max memory (mb) : 1168.9755859375
 minsup = 5 sequences.
 Pattern count : 1144450

Post-processing to show result in terms of string values.
Post-processing completed.

                            empathic
['zainspirowany']           0.000013
['wczorajszym']             0.000051
['wczorajszym', 'poście']   0.000002
['wczorajszym', 'przepis']  0.000002
['odnośnie']                0.000093
...                              ...
#ﬁtnessgirl                 0.000003
#ａｅｓｔｈｅｔｉｃ                  0.000042
#ｔｈｏｕｇｈｔｓ                   0.000003
#𝐞𝐜𝐨𝐃𝐲𝐰                     0.000003
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą        0.000003

[1201915 rows x 1 columns]


 68%|██████▊   | 25/37 [46:24<25:31, 127.61s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 8022 ms
 Frequent sequences count : 2269783
 Max memory (mb) : 1205.90185546875
 minsup = 4 sequences.
 Pattern count : 2269783

Post-processing to show result in terms of string values.
Post-processing completed.

                                     matter_of_fact
['zainspirowany']                      1.286281e-05
['wczorajszym']                        5.145125e-05
['wczorajszym', 'poście']              1.857884e-06
['wczorajszym', 'poście', 'innymi']    3.915640e-07
['wczorajszym', 'naprawdę']            1.486307e-06
...                                             ...
#ﬁtnessgirl                            3.227879e-06
#ａｅｓｔｈｅｔｉｃ                             4.519030e-05
#ｔｈｏｕｇｈｔｓ                              3.227879e-06
#𝐞𝐜𝐨𝐃𝐲𝐰                                3.227879e-06
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą                   3.227879e-06

[2324086 rows x 1 columns]

 70%|███████   | 26/37 [48:28<23:13, 126.68s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 7245 ms
 Frequent sequences count : 2030430
 Max memory (mb) : 1216.26025390625
 minsup = 4 sequences.
 Pattern count : 2030430

Post-processing to show result in terms of string values.
Post-processing completed.

                                            brave
['zainspirowany']                    1.341787e-05
['wczorajszym']                      5.590777e-05
['wczorajszym', 'poście']            2.018550e-06
['wczorajszym', 'poście', 'innymi']  4.479120e-07
['wczorajszym', 'naprawdę']          1.614840e-06
...                                           ...
#ﬁtnessgirl                          3.222906e-06
#ａｅｓｔｈｅｔｉｃ                           1.289162e-05
#ｔｈｏｕｇｈｔｓ                            3.222906e-06
#𝐞𝐜𝐨𝐃𝐲𝐰                              3.222906e-06
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą                 3.222906e-06

[2084627 rows x 1 columns]


 73%|███████▎  | 27/37 [50:18<20:16, 121.66s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 5914 ms
 Frequent sequences count : 1084519
 Max memory (mb) : 1218.8056640625
 minsup = 5 sequences.
 Pattern count : 1084519

Post-processing to show result in terms of string values.
Post-processing completed.

                      protective
['niezmiennie']         0.000043
['nr']                  0.000090
['nr', 'nr']            0.000003
['ciasto']              0.000703
['ciasto', 'ciasto']    0.000072
...                          ...
#ﬁtnessgirl             0.000003
#ａｅｓｔｈｅｔｉｃ              0.000043
#ｔｈｏｕｇｈｔｓ               0.000003
#𝐞𝐜𝐨𝐃𝐲𝐰                 0.000003
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą    0.000003

[1140164 rows x 1 columns]


 76%|███████▌  | 28/37 [52:20<18:14, 121.62s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 6713 ms
 Frequent sequences count : 2118072
 Max memory (mb) : 1098.7607421875
 minsup = 4 sequences.
 Pattern count : 2118072

Post-processing to show result in terms of string values.
Post-processing completed.

                             generous
['zainspirowany']            0.000015
['wczorajszym']              0.000058
['wczorajszym', 'poście']    0.000002
['wczorajszym', 'naprawdę']  0.000002
['wczorajszym', 'przepis']   0.000002
...                               ...
#ﬁtfam                       0.000003
#ﬁtnessgirl                  0.000003
#ａｅｓｔｈｅｔｉｃ                   0.000014
#ｔｈｏｕｇｈｔｓ                    0.000003
#𝐞𝐜𝐨𝐃𝐲𝐰                      0.000003

[2169568 rows x 1 columns]


 78%|███████▊  | 29/37 [54:06<15:34, 116.85s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 6836 ms
 Frequent sequences count : 2115434
 Max memory (mb) : 1239.5400390625
 minsup = 4 sequences.
 Pattern count : 2115434

Post-processing to show result in terms of string values.
Post-processing completed.

                              thrifty
['zainspirowany']            0.000014
['wczorajszym']              0.000055
['wczorajszym', 'poście']    0.000002
['wczorajszym', 'naprawdę']  0.000002
['wczorajszym', 'przepis']   0.000002
...                               ...
#ﬁtfam                       0.000003
#ﬁtnessgirl                  0.000003
#ａｅｓｔｈｅｔｉｃ                   0.000048
#ｔｈｏｕｇｈｔｓ                    0.000003
#𝐞𝐜𝐨𝐃𝐲𝐰                      0.000003

[2166705 rows x 1 columns]


 81%|████████  | 30/37 [55:52<13:15, 113.70s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 6126 ms
 Frequent sequences count : 1140295
 Max memory (mb) : 1208.818359375
 minsup = 5 sequences.
 Pattern count : 1140295

Post-processing to show result in terms of string values.
Post-processing completed.

                            favourable
['zainspirowany']             0.000013
['wczorajszym']               0.000059
['wczorajszym', 'poście']     0.000002
['wczorajszym', 'przepis']    0.000002
['odnośnie']                  0.000090
...                                ...
#ﬁtnessgirl                   0.000003
#ａｅｓｔｈｅｔｉｃ                    0.000030
#ｔｈｏｕｇｈｔｓ                     0.000003
#𝐞𝐜𝐨𝐃𝐲𝐰                       0.000003
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą          0.000003

[1196592 rows x 1 columns]


 84%|████████▍ | 31/37 [57:55<11:38, 116.43s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 6179 ms
 Frequent sequences count : 1092718
 Max memory (mb) : 1244.75390625
 minsup = 5 sequences.
 Pattern count : 1092718

Post-processing to show result in terms of string values.
Post-processing completed.

                            balanced
['zainspirowany']           0.000013
['wczorajszym']             0.000054
['wczorajszym', 'poście']   0.000002
['wczorajszym', 'przepis']  0.000002
['odnośnie']                0.000094
...                              ...
#ﬁtnessgirl                 0.000003
#ａｅｓｔｈｅｔｉｃ                  0.000043
#ｔｈｏｕｇｈｔｓ                   0.000003
#𝐞𝐜𝐨𝐃𝐲𝐰                     0.000003
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą        0.000003

[1149995 rows x 1 columns]


 86%|████████▋ | 32/37 [59:59<09:54, 118.83s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 6475 ms
 Frequent sequences count : 1187423
 Max memory (mb) : 1197.12744140625
 minsup = 5 sequences.
 Pattern count : 1187423

Post-processing to show result in terms of string values.
Post-processing completed.

                            sensuality
['zainspirowany']             0.000012
['wczorajszym']               0.000052
['wczorajszym', 'poście']     0.000002
['wczorajszym', 'przepis']    0.000002
['odnośnie']                  0.000085
...                                ...
#ﬁtnessgirl                   0.000003
#ａｅｓｔｈｅｔｉｃ                    0.000041
#ｔｈｏｕｇｈｔｓ                     0.000003
#𝐞𝐜𝐨𝐃𝐲𝐰                       0.000003
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą          0.000003

[1246274 rows x 1 columns]


 89%|████████▉ | 33/37 [1:02:06<08:05, 121.30s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 7809 ms
 Frequent sequences count : 2253739
 Max memory (mb) : 1122.96044921875
 minsup = 4 sequences.
 Pattern count : 2253739

Post-processing to show result in terms of string values.
Post-processing completed.

                             intelligent
['zainspirowany']               0.000013
['wczorajszym']                 0.000054
['wczorajszym', 'poście']       0.000002
['wczorajszym', 'naprawdę']     0.000002
['wczorajszym', 'przepis']      0.000002
...                                  ...
#ﬁtfam                          0.000003
#ﬁtnessgirl                     0.000003
#ａｅｓｔｈｅｔｉｃ                      0.000048
#ｔｈｏｕｇｈｔｓ                       0.000003
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą            0.000003

[2306337 rows x 1 columns]


 92%|█████████▏| 34/37 [1:04:08<06:04, 121.59s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 7450 ms
 Frequent sequences count : 2141040
 Max memory (mb) : 1197.59521484375
 minsup = 4 sequences.
 Pattern count : 2141040

Post-processing to show result in terms of string values.
Post-processing completed.

                                          believe
['zainspirowany']                    1.351147e-05
['wczorajszym']                      4.954207e-05
['wczorajszym', 'poście']            1.981674e-06
['wczorajszym', 'poście', 'innymi']  4.193610e-07
['wczorajszym', 'naprawdę']          1.585339e-06
...                                           ...
#ﬁtnessgirl                          3.349455e-06
#ａｅｓｔｈｅｔｉｃ                           4.689238e-05
#ｔｈｏｕｇｈｔｓ                            3.349455e-06
#𝐞𝐜𝐨𝐃𝐲𝐰                              3.349455e-06
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą                 3.349455e-06

[2193611 rows x 1 columns]


 95%|█████████▍| 35/37 [1:05:54<03:53, 116.76s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 6317 ms
 Frequent sequences count : 1156266
 Max memory (mb) : 1158.98095703125
 minsup = 5 sequences.
 Pattern count : 1156266

Post-processing to show result in terms of string values.
Post-processing completed.

                            egocentric
['zainspirowany']             0.000013
['wczorajszym']               0.000051
['wczorajszym', 'poście']     0.000002
['wczorajszym', 'przepis']    0.000002
['odnośnie']                  0.000094
...                                ...
#ﬁtnessgirl                   0.000003
#ａｅｓｔｈｅｔｉｃ                    0.000042
#ｔｈｏｕｇｈｔｓ                     0.000003
#𝐞𝐜𝐨𝐃𝐲𝐰                       0.000003
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą          0.000003

[1213760 rows x 1 columns]


 97%|█████████▋| 36/37 [1:08:01<01:59, 119.89s/it]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 6562 ms
 Frequent sequences count : 1164640
 Max memory (mb) : 1242.158203125
 minsup = 5 sequences.
 Pattern count : 1164640

Post-processing to show result in terms of string values.
Post-processing completed.

                            allocentric
['zainspirowany']              0.000013
['wczorajszym']                0.000051
['wczorajszym', 'poście']      0.000002
['wczorajszym', 'przepis']     0.000002
['odnośnie']                   0.000090
...                                 ...
#ﬁtnessgirl                    0.000003
#ａｅｓｔｈｅｔｉｃ                     0.000042
#ｔｈｏｕｇｈｔｓ                      0.000003
#𝐞𝐜𝐨𝐃𝐲𝐰                        0.000003
#𝕠𝕔𝕙𝕛𝕒𝕜𝕗𝕒𝕛𝕟𝕚𝕖𝕓𝕪ć𝕞𝕒𝕞ą           0.000003

[1221534 rows x 1 columns]


100%|██████████| 37/37 [1:10:13<00:00, 113.89s/it]


In [24]:
# Save word_df to file so the long mining loop above does not have to be rerun.
# NOTE(review): this cell's execution count precedes the fillna cell further
# down, so this snapshot presumably still contains NaN entries — confirm.
word_df.to_pickle("word_trait_array.pickle")

In [27]:
# Check the resulting DataFrame: one row per trait (innocent, sage, ...) and
# one column per mined phrase (presumably hashtags as well — confirm).
word_df

Unnamed: 0,['zainspirowany'],['wczorajszym'],"['wczorajszym', 'poście']","['wczorajszym', 'poście', 'innymi']","['wczorajszym', 'naprawdę']","['wczorajszym', 'przepis']","['wczorajszym', 'innymi']",['odnośnie'],"['odnośnie', 'życia']","['odnośnie', 'naprawdę']",...,"['podkład', 'serii']","['konkursu', 'udział', 'nagrody']","['this', 'hope', 'you']","['zadowolony', 'temu']","['przeczytaniu', 'czytając']","['potoczą', 'czekam']","['help', 'and', 'with']","['noszenia', 'maseczek']","['best', 'much']","['care', 'it']"
innocent,1.3e-05,5.2e-05,2e-06,4.845665e-07,2e-06,2e-06,2e-06,8.1e-05,2e-06,3e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sage,1.2e-05,5.5e-05,2e-06,3.600215e-07,2e-06,1e-06,2e-06,0.000113,3e-06,2e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
explorer,1.6e-05,5.9e-05,2e-06,3.269538e-07,1e-06,0.0,2e-06,8.8e-05,2e-06,3e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
outlaw,1.4e-05,5.7e-05,3e-06,3.827676e-07,2e-06,0.0,2e-06,9.6e-05,3e-06,3e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
magician,1.5e-05,5.9e-05,3e-06,3.768889e-07,2e-06,0.0,2e-06,0.0001,3e-06,3e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hero,1.8e-05,7.4e-05,3e-06,3.79289e-07,2e-06,0.0,2e-06,8.1e-05,3e-06,2e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
lover,1.2e-05,4.6e-05,3e-06,5.350438e-07,0.0,0.0,2e-06,8.5e-05,3e-06,3e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
jester,1.7e-05,5.8e-05,3e-06,3.576034e-07,2e-06,0.0,2e-06,9.2e-05,3e-06,3e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
everyman,1e-05,6e-05,2e-06,4.506976e-07,2e-06,2e-06,2e-06,9.3e-05,3e-06,3e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
caregiver,1.2e-05,5.2e-05,2e-06,3.379707e-07,1e-06,0.0,2e-06,0.000106,3e-06,3e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Replace all NaN with 0.0 so every cell of the table holds a number.
# The name word_df is rebound to the filled copy; re-running this cell is
# harmless because filling an already NaN-free frame changes nothing.
word_df = word_df.fillna(0.0)

In [28]:
# Save the NaN-free word_df to its own file, separate from the earlier
# "word_trait_array.pickle" snapshot.
word_df.to_pickle("word_trait_array_no_nan.pickle")

In [51]:
# Save the results to a pickle
# NOTE(review): this import sits mid-notebook; it would normally live in the
# imports cell at the top.
import pickle

# NOTE(review): user_indices is not defined in the cells visible here — it has
# to exist in the kernel already; confirm where it is created.
with open("influencer_index_map.pickle", "wb") as f:
    pickle.dump(user_indices, f)
    
# Write word_df once more under the file name "word_frequency_table.pickle".
word_df.to_pickle("word_frequency_table.pickle")

In [40]:
# Method for creating an output vector for dot product calculation
# Word map - the column labels of word_df as a plain list, in column order, so
# that position i of an output vector corresponds to column i of word_df.
word_map = word_df.columns.tolist()

def get_trait_dot_product(post_text: str, word_map: list, word_dataframe: pd.DataFrame,
                          min_support: float = 0.00025, max_pattern_length: int = 3) -> list:
    """Score a post against every row of ``word_dataframe``.

    The post is cleaned, mined for sequential patterns with SPMF's PrefixSpan
    and turned into a support-count vector aligned with ``word_map``. The
    result is the dot product of ``word_dataframe`` with that vector.

    Args:
        post_text: Raw post text.
        word_map: Column labels of ``word_dataframe`` in column order; the
            position of a label is the position it gets in the post vector.
        word_dataframe: Table whose columns correspond to ``word_map``.
        min_support: First argument handed to PrefixSpan (minimum support
            according to the SPMF documentation).
        max_pattern_length: Second argument handed to PrefixSpan (maximum
            pattern length according to the SPMF documentation).

    Returns:
        A list with one value per row of ``word_dataframe``, in row order.
    """
    # Filter out the text
    filtered_post = remove_stopwords(clean_up_text(post_text))

    filtered_post = [" ".join(pst) for pst in filtered_post]
    filtered_post.extend(extract_hashtags(post_text))

    # Build the label -> position lookup once. Calling word_map.index() for
    # every mined pattern rescans the whole vocabulary each time, which
    # dominates the runtime for large vocabularies. setdefault keeps the first
    # position of a duplicated label, exactly like list.index() would.
    phrase_to_index = {}
    for position, known_phrase in enumerate(word_map):
        phrase_to_index.setdefault(known_phrase, position)

    # Create a zero vector for the dot product, one slot per known phrase
    post_vector = [0] * len(word_map)

    # Calculate word occurrences (SPMF is told to write to "output.txt")
    spmf = Spmf("PrefixSpan", input_direct=filtered_post,
                output_filename="output.txt",
                arguments=[min_support, max_pattern_length], input_type="text")
    spmf.run()
    spmf_df = spmf.to_pandas_dataframe(pickle=False)

    # itertuples yields nothing for an empty result frame, so a post without
    # any mined pattern simply leaves the vector at zero.
    for row in tqdm(spmf_df.itertuples(index=False), total=len(spmf_df)):
        position = phrase_to_index.get(str(row.pattern))
        if position is None:
            # Pattern is not part of the known vocabulary - ignore it
            continue
        post_vector[position] = int(row.sup)

    # Calculate dot product for a given text
    word_dot = word_dataframe.dot(post_vector)
    return word_dot.tolist()

In [41]:
# Test the trait dot_product on a short dummy post with two hashtags; the
# printed list holds one score per row of word_df.
print(get_trait_dot_product("Cześć reasda  asdasda         #hello #man", word_map, word_df))

0it [00:00, ?it/s]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 2 ms
 Frequent sequences count : 85
 Max memory (mb) : 8.991737365722656
 minsup = 1 sequences.
 Pattern count : 85

Post-processing to show result in terms of string values.
Post-processing completed.



85it [00:07, 12.06it/s]


[0.00016620354740696497, 0.00012749299512918806, 0.0001690490664915492, 0.00014584830353522058, 0.0001594157228390828, 0.00015536049182692842, 0.00018260170915199766, 0.00016443368012853234, 0.00016633483706746642, 0.00010936498436992098, 0.00015111084902183367, 0.00016939103921402557, 0.00017278599281551573, 0.0001591613036986155, 0.00017426378862227474, 0.00016502322112468682, 0.0001605453602390891, 0.0001723152627698697, 0.0001679472645589285, 0.00016507620641309755, 0.00016276248092296895, 0.00015667617021652205, 0.00016640781007321942, 0.00016311498103788345, 0.00015501571392168522, 0.00015649754213106778, 0.00016995963458678562, 0.00016629475070394366, 0.00018173183166301588, 0.00018088433398864235, 0.00015980665583775904, 0.00016073035875016072, 0.00016210352180292368, 0.00014350258416762895, 0.00015313004569130333, 0.00017351924885445163, 0.00015830908779736435]


In [42]:
# Method for calculating the dot product of trait <-> influencer relation
def get_influencer_dot_product(trait_output: list, influencer_dataframe: pd.DataFrame) -> pd.Series:
    """Project a trait score vector onto every row of ``influencer_dataframe``.

    Args:
        trait_output: One score per column of ``influencer_dataframe``, in
            column order.
        influencer_dataframe: Table to multiply with the score vector.

    Returns:
        A Series indexed like ``influencer_dataframe`` holding the dot product
        of each row with ``trait_output`` (``DataFrame.dot`` yields a Series,
        not a DataFrame, for a one-dimensional right-hand side).
    """
    return influencer_dataframe.dot(trait_output)

In [43]:
# Method for calculating the similarity
def calculate_similarity(post_text: str, 
                         word_map: list, 
                         word_dataframe: pd.DataFrame,
                         influencer_dataframe: pd.DataFrame) -> pd.Series:
    """Chain the word->trait and trait->influencer dot products for one post.

    Args:
        post_text: Raw post text.
        word_map: Column labels of ``word_dataframe`` in column order.
        word_dataframe: Table passed to ``get_trait_dot_product``.
        influencer_dataframe: Table passed to ``get_influencer_dot_product``.

    Returns:
        The result of ``get_influencer_dot_product``: one score per row of
        ``influencer_dataframe`` (a Series, so ``idxmax()`` gives the
        best-matching row label).
    """
    # Calculate word-trait dot product
    post_result = get_trait_dot_product(post_text, word_map, word_dataframe)
    
    # Calculate trait-influencer dot product
    return get_influencer_dot_product(post_result, influencer_dataframe)

In [44]:
# Test the method on a longer Polish text and report the best-matching row.
# NOTE(review): available_arch_df is not defined in the cells visible here —
# it has to exist in the kernel already; confirm where it is created.
sim_df = calculate_similarity("""Jak to jest być skrybą, dobrze? 
A, wie pan, moim zdaniem to nie ma tak, że dobrze, albo że niedobrze. 
Gdybym miał powiedzieć, co cenię w życiu najbardziej, powiedziałbym, że ludzi. 
Ludzi, którzy podali mi pomocną dłoń, kiedy sobie nie radziłem, kiedy byłem sam, i co ciekawe, to właśnie przypadkowe spotkania wpływają na nasze życie. 
Chodzi o to, że kiedy wyznaje się pewne wartości, nawet pozornie uniwersalne, bywa, że nie znajduje się zrozumienia, 
które by tak rzec, które pomaga się nam rozwijać. 
Ja miałem szczęście, by tak rzec, ponieważ je znalazłem, i dziękuję życiu! 
Dziękuję mu; życie to śpiew, życie to taniec, życie to miłość! 
Wielu ludzi pyta mnie o to samo: ale jak ty to robisz, skąd czerpiesz tę radość? 
A ja odpowiadam, że to proste! To umiłowanie życia. 
To właśnie ono sprawia, że dzisiaj na przykład buduję maszyny, a jutro – kto wie? 
Dlaczego by nie – oddam się pracy społecznej i będę, ot, choćby, sadzić... doć— m-marchew...""", word_map, word_df, available_arch_df)
# idxmax() is the index label with the highest score, max() the score itself.
print("Maximum similarity:\n"
        f"User: {sim_df.idxmax()}\n"
        f"Similarity score: {sim_df.max()}")

1it [00:00,  4.30it/s]

>/media/maciek/HDD_Linux/Praca_magisterska/instagram_analysis/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 14 ms
 Frequent sequences count : 2445
 Max memory (mb) : 10.53741455078125
 minsup = 1 sequences.
 Pattern count : 2445

Post-processing to show result in terms of string values.
Post-processing completed.



2445it [03:20, 12.21it/s]


Maximum similarity:
User: muzykujemy
Similarity score: 6.420715563113335e-05
