In [1]:
# SPMF - try to get most occurring sequences
from spmf import Spmf
import pandas as pd
from text_cleaner import *
from tqdm import tqdm
import itertools

archetype_list = ['artist',
                 'caregiver',
                 'everyman',
                 'explorer',
                 'guru',
                 'hero',
                 'innocent',
                 'jester',
                 'magician',
                 'rebel',
                 'ruler',
                 'seducer']

## Loading the dataset and cleaning the text

In [2]:
# Load the Twitter dataset
twitter_df = pd.read_csv('tweets_06_03_2021.csv', index_col=0)

# Print the head of the loaded dataset
twitter_df.head()

Unnamed: 0,_id,tweet_text,username,created_at,timestamp,archetype
0,5f9f1c36b38e10f823bf2cdc,"@AndruEdwards The hard work has paid off, this...",LEGO_Group,2020-11-01 19:32:05.000,,artist
1,5f9f1c36b38e10f823bf2cdd,@soosupersam A great way to surprise your love...,LEGO_Group,2020-11-01 19:09:40.000,,artist
2,5f9f1c36b38e10f823bf2cde,"You can now just bring the fun home, and reliv...",LEGO_Group,2020-11-01 14:00:36.000,,artist
3,5f9f1c36b38e10f823bf2cdf,@at_knb Happy birthday to the master builder! ...,LEGO_Group,2020-10-31 17:16:57.000,,artist
4,5f9f1c36b38e10f823bf2ce0,@dizunatsu 😀😀,LEGO_Group,2020-10-31 15:18:50.000,,artist


In [3]:
# Clean-up the texts
twitter_df["cleaned_text"] = twitter_df["tweet_text"].apply(lambda x: clean_up_text(x))

# Tokenize the cleaned texts
twitter_df["cleaned_text"] = twitter_df["cleaned_text"].apply(lambda x: nltk.word_tokenize(x))

# Remove the stopwords
twitter_df["cleaned_text"] = twitter_df["cleaned_text"].apply(lambda x: remove_stopwords(x))

# Drop the rows with empty 'cleaned_text' field
twitter_df = twitter_df.drop(twitter_df[twitter_df['cleaned_text'].map(len) < 1].index)

# Print the new head of the dataset
twitter_df.head()

Unnamed: 0,_id,tweet_text,username,created_at,timestamp,archetype,cleaned_text
0,5f9f1c36b38e10f823bf2cdc,"@AndruEdwards The hard work has paid off, this...",LEGO_Group,2020-11-01 19:32:05.000,,artist,"[hard, work, paid, awesome]"
1,5f9f1c36b38e10f823bf2cdd,@soosupersam A great way to surprise your love...,LEGO_Group,2020-11-01 19:09:40.000,,artist,"[great, way, surprise, loved, one]"
2,5f9f1c36b38e10f823bf2cde,"You can now just bring the fun home, and reliv...",LEGO_Group,2020-11-01 14:00:36.000,,artist,"[bring, fun, home, relive, favorite, childhood..."
3,5f9f1c36b38e10f823bf2cdf,@at_knb Happy birthday to the master builder! ...,LEGO_Group,2020-10-31 17:16:57.000,,artist,"[happy, birthday, master, builder, hope, magic..."
6,5f9f1c36b38e10f823bf2ce2,@Ranchie This is the way! 😀,LEGO_Group,2020-10-31 15:16:26.000,,artist,[way]


## Testing SPMF on 'artist' archetype

In [4]:
# Example 1 - try to get the most occurring words in the 'artist' archetype subset
# Extract all the tweets for the 'artist' archetype
artist_df = twitter_df.cleaned_text[twitter_df["archetype"] == "artist"]

# Reset the index of the subset
artist_df = artist_df.reset_index(drop=True)

# Print the head of the subset
artist_df.head()

0                          [hard, work, paid, awesome]
1                   [great, way, surprise, loved, one]
2    [bring, fun, home, relive, favorite, childhood...
3    [happy, birthday, master, builder, hope, magic...
4                                                [way]
Name: cleaned_text, dtype: object

In [5]:
# Create a list of sentences
artist_list = [" ".join(row) for row in artist_df.tolist()]

In [8]:
# SPMF - get the most frequent sequences
spmf = Spmf("PrefixSpan", input_direct=artist_list,
            output_filename="output.txt", arguments=[0.0003, 3], input_type="text")
spmf.run()
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output.csv")

>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 524 ms
 Frequent sequences count : 219786
 Max memory (mb) : 139.26904296875
 minsup = 4 sequences.
 Pattern count : 219786

Post-processing to show result in terms of string values.
Post-processing completed.

                              pattern  sup
0                              [hard]  137
1                      [hard, placed]    5
2                        [hard, work]   18
3                 [hard, work, every]    4
4              [hard, work, everyone]    4
...                               ...  ...
219781              [ds, later, year]    4
219782                     [ds, year]    4
219783                  [swatchxmoma]    6
219784  [swatchxmoma, swatchlovesart]    6
219785                           [bp]    4

[219786 rows x 2 columns]


# Calculating most frequent sequences for all of the archetypes

In [9]:
full_df = pd.DataFrame(columns=['chunk'] + archetype_list)

# Iterate over archetypes
for archetype in tqdm(archetype_list):
    # Extract all the tweets for the 'artist' archetype
    tmp_df = twitter_df.cleaned_text[twitter_df["archetype"] == archetype]

    # Reset the index of the subset
    tmp_df = tmp_df.reset_index(drop=True)
    
    # Calculate number of words, number of two-word and three-word combinations
    unique_words = set()
    unique_two_words = set()
    unique_three_words = set()
    
    for row in tmp_df.tolist():
        unique_words.update(row)
        unique_two_words.update(itertools.permutations(row, 2))
        unique_three_words.update(itertools.permutations(row, 3))
    
    cnt_single_word = len(unique_words)
    cnt_two_words = len(unique_two_words)
    cnt_three_words = len(unique_three_words)
    
    # Create a list of sentences
    tmp_list = [" ".join(row) for row in tmp_df.tolist()]
    
    # SPMF - get the most frequent sequences
    spmf = Spmf("PrefixSpan", input_direct=tmp_list,
                output_filename=f"sequence_files_four/output_{archetype}.txt", arguments=[0.0003, 3], input_type="text")
    spmf.run()
    
    spmf = spmf.to_pandas_dataframe(pickle=True)
    
    # Get the TF
    spmf.sup[spmf.pattern.map(len) == 1] = spmf.sup[spmf.pattern.map(len) == 1].apply(lambda x: float(x / cnt_single_word))
    spmf.sup[spmf.pattern.map(len) == 2] = spmf.sup[spmf.pattern.map(len) == 2].apply(lambda x: float(x / cnt_two_words)) 
    spmf.sup[spmf.pattern.map(len) == 3] = spmf.sup[spmf.pattern.map(len) == 3].apply(lambda x: float(x / cnt_three_words)) 
    
    print(spmf)
    spmf.to_csv(f"sequence_files/output_{archetype}.csv")

  0%|          | 0/12 [00:00<?, ?it/s]

>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 461 ms
 Frequent sequences count : 219786
 Max memory (mb) : 183.41357421875
 minsup = 4 sequences.
 Pattern count : 219786

Post-processing to show result in terms of string values.
Post-processing completed.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spmf.sup[spmf.pattern.map(len) == 1] = spmf.sup[spmf.pattern.map(len) == 1].apply(lambda x: float(x / cnt_single_word))


                              pattern           sup
0                              [hard]  1.221796e-02
1                      [hard, placed]  6.083746e-06
2                        [hard, work]  2.190149e-05
3                 [hard, work, every]  2.133147e-07
4              [hard, work, everyone]  2.133147e-07
...                               ...           ...
219781              [ds, later, year]  2.133147e-07
219782                     [ds, year]  4.866997e-06
219783                  [swatchxmoma]  5.350932e-04
219784  [swatchxmoma, swatchlovesart]  7.300496e-06
219785                           [bp]  3.567288e-04

[219786 rows x 2 columns]


  8%|▊         | 1/12 [00:12<02:20, 12.76s/it]

>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 654 ms
 Frequent sequences count : 920767
 Max memory (mb) : 214.2587890625
 minsup = 2 sequences.
 Pattern count : 920767

Post-processing to show result in terms of string values.
Post-processing completed.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spmf.sup[spmf.pattern.map(len) == 1] = spmf.sup[spmf.pattern.map(len) == 1].apply(lambda x: float(x / cnt_single_word))


                               pattern           sup
0                              [sorry]  1.504348e-01
1                        [sorry, hear]  2.143125e-03
2                  [sorry, hear, hear]  2.496816e-07
3                   [sorry, hear, amy]  3.329088e-07
4                [sorry, hear, moment]  9.987263e-07
...                                ...           ...
920762   [academics, diluting, rights]  1.664544e-07
920763                        [accord]  2.484472e-04
920764        [accord, humanrightsact]  3.789788e-06
920765  [accord, humanrightsact, read]  1.664544e-07
920766                  [accord, read]  3.789788e-06

[920767 rows x 2 columns]


 17%|█▋        | 2/12 [00:37<03:18, 19.82s/it]

>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 309 ms
 Frequent sequences count : 147306
 Max memory (mb) : 106.28164672851562
 minsup = 3 sequences.
 Pattern count : 147306

Post-processing to show result in terms of string values.
Post-processing completed.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spmf.sup[spmf.pattern.map(len) == 1] = spmf.sup[spmf.pattern.map(len) == 1].apply(lambda x: float(x / cnt_single_word))


                                 pattern           sup
0                                  [bud]  1.337979e-02
1                             [bud, bud]  1.399012e-05
2                            [bud, wait]  9.326749e-06
3                           [bud, happy]  6.995061e-06
4                              [bud, us]  9.326749e-06
...                                  ...           ...
147301       [praised, emissions, point]  3.123981e-07
147302       [praised, emissions, price]  3.123981e-07
147303  [praised, emissions, accessible]  3.123981e-07
147304               [praised, electric]  6.995061e-06
147305                         [talents]  4.181185e-04

[147306 rows x 2 columns]


 25%|██▌       | 3/12 [00:46<02:15, 15.07s/it]

>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 405 ms
 Frequent sequences count : 90588
 Max memory (mb) : 140.56525421142578
 minsup = 4 sequences.
 Pattern count : 90588

Post-processing to show result in terms of string values.
Post-processing completed.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spmf.sup[spmf.pattern.map(len) == 1] = spmf.sup[spmf.pattern.map(len) == 1].apply(lambda x: float(x / cnt_single_word))


                        pattern           sup
0                      [maggie]  3.452562e-04
1           [maggie, continues]  3.640242e-06
2      [maggie, continues, sun]  1.726970e-07
3        [maggie, continues, c]  1.726970e-07
4       [maggie, maggiecolepbs]  4.550303e-06
...                         ...           ...
90583                 [finance]  2.762049e-04
90584               [codedgaze]  2.762049e-04
90585      [womenshistorymonth]  4.143074e-04
90586        [worldwildlifeday]  3.452562e-04
90587                     [wwd]  6.214611e-04

[90588 rows x 2 columns]


 33%|███▎      | 4/12 [00:59<01:53, 14.19s/it]

>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 432 ms
 Frequent sequences count : 64913
 Max memory (mb) : 212.9957275390625
 minsup = 4 sequences.
 Pattern count : 64913

Post-processing to show result in terms of string values.
Post-processing completed.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spmf.sup[spmf.pattern.map(len) == 1] = spmf.sup[spmf.pattern.map(len) == 1].apply(lambda x: float(x / cnt_single_word))
 42%|████▏     | 5/12 [01:17<01:49, 15.60s/it]

                          pattern           sup
0                          [crew]  5.885150e-03
1                    [crew, crew]  1.296159e-05
2           [crew, crew, mission]  1.871370e-07
3                 [crew, crew, p]  2.994193e-07
4                [crew, crew, et]  2.245644e-07
...                           ...           ...
64908     [garde, explore, klein]  1.497096e-07
64909                  [sorrenti]  2.377839e-04
64910            [sorrenti, shop]  3.049787e-06
64911  [sorrenti, shop, campaign]  1.497096e-07
64912        [sorrenti, campaign]  3.049787e-06

[64913 rows x 2 columns]
>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 494 ms
 Frequent sequences count : 837069
 Max memory (mb) : 161.158203125
 minsup = 2 sequences.
 Pattern count : 837069

Post-processing to show result in terms of string values.
Post-processing completed.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spmf.sup[spmf.pattern.map(len) == 1] = spmf.sup[spmf.pattern.map(len) == 1].apply(lambda x: float(x / cnt_single_word))


                             pattern           sup
0                               [hi]  3.294492e-02
1                        [hi, sorry]  1.098232e-04
2                  [hi, sorry, tell]  1.413449e-07
3                  [hi, sorry, feel]  1.413449e-07
4                   [hi, sorry, way]  2.826899e-07
...                              ...           ...
837064              [peacecorpsweek]  2.118644e-04
837065         [peacecorpsweek, amp]  3.230094e-06
837066    [peacecorpsweek, amp, amp]  1.413449e-07
837067  [peacecorpsweek, amp, learn]  1.413449e-07
837068       [peacecorpsweek, learn]  3.230094e-06

[837069 rows x 2 columns]


 50%|█████     | 6/12 [01:43<01:54, 19.13s/it]

>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 396 ms
 Frequent sequences count : 146492
 Max memory (mb) : 210.5414810180664
 minsup = 3 sequences.
 Pattern count : 146492

Post-processing to show result in terms of string values.
Post-processing completed.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spmf.sup[spmf.pattern.map(len) == 1] = spmf.sup[spmf.pattern.map(len) == 1].apply(lambda x: float(x / cnt_single_word))


                                 pattern           sup
0                           [appreciate]  2.982474e-02
1                  [appreciate, loyalty]  2.192555e-04
2           [appreciate, loyalty, order]  2.583632e-06
3            [appreciate, loyalty, meet]  6.419935e-06
4       [appreciate, loyalty, extremely]  6.263351e-06
...                                  ...           ...
146487            [salary, hint, people]  2.348757e-07
146488            [salary, hint, reckon]  2.348757e-07
146489           [salary, hint, similar]  2.348757e-07
146490             [salary, hint, share]  2.348757e-07
146491            [salary, hint, though]  2.348757e-07

[146492 rows x 2 columns]


 58%|█████▊    | 7/12 [01:55<01:22, 16.55s/it]

>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 312 ms
 Frequent sequences count : 111339
 Max memory (mb) : 126.9611587524414
 minsup = 4 sequences.
 Pattern count : 111339

Post-processing to show result in terms of string values.
Post-processing completed.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spmf.sup[spmf.pattern.map(len) == 1] = spmf.sup[spmf.pattern.map(len) == 1].apply(lambda x: float(x / cnt_single_word))


                                pattern           sup
0                               [thank]  7.427204e-02
1                        [thank, thank]  9.997317e-06
2                      [thank, support]  7.164744e-05
3              [thank, support, global]  3.547882e-07
4                        [thank, store]  1.666220e-05
...                                 ...           ...
111334  [nationalpeanutbutterloversday]  3.191065e-04
111335                   [fallonvision]  3.191065e-04
111336                          [stout]  3.191065e-04
111337                   [stout, march]  6.664878e-06
111338            [foodwasteactionweek]  3.988831e-04

[111339 rows x 2 columns]


 67%|██████▋   | 8/12 [02:06<01:00, 15.06s/it]

>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 484 ms
 Frequent sequences count : 183911
 Max memory (mb) : 178.09271240234375
 minsup = 4 sequences.
 Pattern count : 183911

Post-processing to show result in terms of string values.
Post-processing completed.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spmf.sup[spmf.pattern.map(len) == 1] = spmf.sup[spmf.pattern.map(len) == 1].apply(lambda x: float(x / cnt_single_word))


                    pattern           sup
0                   [sorry]  9.451978e-02
1            [sorry, helps]  4.831379e-06
2            [sorry, sorry]  9.662758e-06
3             [sorry, hear]  6.099616e-04
4       [sorry, hear, hear]  1.942611e-07
...                     ...           ...
183906           [worklife]  6.419000e-04
183907              [audio]  7.221375e-04
183908       [audio, start]  4.831379e-06
183909  [audio, collective]  6.039224e-06
183910            [tedpods]  8.023750e-04

[183911 rows x 2 columns]


 75%|███████▌  | 9/12 [02:21<00:45, 15.03s/it]

>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 415 ms
 Frequent sequences count : 571047
 Max memory (mb) : 225.21741485595703
 minsup = 2 sequences.
 Pattern count : 571047

Post-processing to show result in terms of string values.
Post-processing completed.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spmf.sup[spmf.pattern.map(len) == 1] = spmf.sup[spmf.pattern.map(len) == 1].apply(lambda x: float(x / cnt_single_word))


                     pattern           sup
0                    [fiery]  2.349348e-04
1              [fiery, warm]  3.989890e-06
2       [fiery, warm, tones]  1.979696e-07
3       [fiery, warm, drawn]  1.979696e-07
4       [fiery, warm, every]  1.979696e-07
...                      ...           ...
571042                  [ge]  2.349348e-04
571043            [ge, heme]  3.989890e-06
571044           [ge, yeast]  3.989890e-06
571045         [ge, produce]  3.989890e-06
571046   [ge, produce, heme]  1.979696e-07

[571047 rows x 2 columns]


 83%|████████▎ | 10/12 [02:40<00:32, 16.19s/it]

>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 414 ms
 Frequent sequences count : 316796
 Max memory (mb) : 178.33123779296875
 minsup = 3 sequences.
 Pattern count : 316796

Post-processing to show result in terms of string values.
Post-processing completed.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spmf.sup[spmf.pattern.map(len) == 1] = spmf.sup[spmf.pattern.map(len) == 1].apply(lambda x: float(x / cnt_single_word))


                                 pattern           sup
0                                 [join]  1.136163e-02
1                        [join, tonight]  4.896626e-06
2                    [join, tonight, pm]  2.292151e-07
3                             [join, pm]  1.591403e-05
4                         [join, pm, et]  1.719113e-07
...                                  ...           ...
316791             [virtuallearning, us]  3.672470e-06
316792         [virtuallearning, around]  3.672470e-06
316793  [virtuallearning, around, world]  1.719113e-07
316794   [virtuallearning, around, told]  1.719113e-07
316795     [virtuallearning, around, us]  1.719113e-07

[316796 rows x 2 columns]


 92%|█████████▏| 11/12 [02:55<00:15, 15.76s/it]

>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 384 ms
 Frequent sequences count : 540601
 Max memory (mb) : 114.53091430664062
 minsup = 2 sequences.
 Pattern count : 540601

Post-processing to show result in terms of string values.
Post-processing completed.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spmf.sup[spmf.pattern.map(len) == 1] = spmf.sup[spmf.pattern.map(len) == 1].apply(lambda x: float(x / cnt_single_word))


                                pattern           sup
0                             [holiday]  1.682073e-02
1                    [holiday, holiday]  2.112038e-05
2             [holiday, holiday, boxes]  2.199042e-07
3               [holiday, holiday, box]  2.199042e-07
4            [holiday, holiday, spirit]  2.199042e-07
...                                 ...           ...
540596         [galgadot, tiffanyandco]  5.280096e-06
540597                [tiffanybluebook]  3.030762e-04
540598  [tiffanybluebook, tiffanyandco]  5.280096e-06
540599                   [goldenglobes]  6.061524e-04
540600     [goldenglobes, tiffanyandco]  7.920144e-06

[540601 rows x 2 columns]


100%|██████████| 12/12 [03:14<00:00, 16.17s/it]


In [11]:
# Merge all files into a single dataframe
total_df = pd.DataFrame(columns=["pattern"] + archetype_list)
for archetype in tqdm(archetype_list):    
    # Read file and merge it with the other ones
    total_df = pd.concat([total_df, pd.read_csv(f"sequence_files/output_{archetype}.csv", sep=",", index_col=0).rename(columns={"sup": archetype})], ignore_index=True)

100%|██████████| 12/12 [00:14<00:00,  1.17s/it]


In [12]:
# Print dataframe
total_df

Unnamed: 0,pattern,artist,caregiver,everyman,explorer,guru,hero,innocent,jester,magician,rebel,ruler,seducer
0,['hard'],1.221796e-02,,,,,,,,,,,
1,"['hard', 'placed']",6.083746e-06,,,,,,,,,,,
2,"['hard', 'work']",2.190149e-05,,,,,,,,,,,
3,"['hard', 'work', 'every']",2.133147e-07,,,,,,,,,,,
4,"['hard', 'work', 'everyone']",2.133147e-07,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4150610,"['galgadot', 'tiffanyandco']",,,,,,,,,,,,5.2801e-06
4150611,['tiffanybluebook'],,,,,,,,,,,,0.000303076
4150612,"['tiffanybluebook', 'tiffanyandco']",,,,,,,,,,,,5.2801e-06
4150613,['goldenglobes'],,,,,,,,,,,,0.000606152


In [13]:
aggregate_func = {
    "pattern": "first",
    "artist": "sum",
    "caregiver": "sum",
    "everyman": "sum",
    "explorer": "sum",
    "guru": "sum",
    "hero": "sum",
    "innocent": "sum",
    "jester": "sum",
    "magician": "sum",
    "rebel": "sum",
    "ruler": "sum",
    "seducer": "sum"
}
total_df = total_df.groupby("pattern").aggregate(aggregate_func)
total_df = total_df.reset_index(drop=True)

KeyboardInterrupt: 

In [None]:
# Print the full DataFrame
total_df

In [None]:
# Save the current DataFrame
total_df.to_csv("sequence_files_four/phrase_frequency_no_df.csv")

In [None]:
# Calculate document frequency for every archetype
import math
import ast
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

def get_doc_freq(phrase, sup, dataset):
    phrase_cnt = 0
    for line in dataset:
        if tuple(phrase) in line:
            phrase_cnt += 1
    return sup * math.log(len(dataset) / (phrase_cnt + 1))
    

for archetype in archetype_list:
    print(f"Archetype {archetype}:")
    tmp_df = pd.read_csv(f"sequence_files/output_{archetype}.csv").set_index("Unnamed: 0")
    tmp_df["pattern"] = tmp_df["pattern"].apply(lambda x: ast.literal_eval(x))
    
    tmp_df = dd.from_pandas(tmp_df, 10000)
    
    twitter_subset = twitter_df.cleaned_text[twitter_df["archetype"] == archetype]
    
    # Select non-zero elements and calculate DF for every element
    with ProgressBar():
        tmp_df["sup"] = tmp_df.apply(lambda row: get_doc_freq(row["pattern"], row["sup"], twitter_subset), axis=1, meta=(float)).compute()

    tmp_df.to_csv(f"sequence_files_four/output_{archetype}_df.csv")

Archetype artist:
[########################################] | 100% Completed | 15min 32.3s
Archetype caregiver:
[###########                             ] | 28% Completed | 11min 44.1s

In [None]:
# Save the DataFrame with document frequency calculated
total_df.to_csv("sequence_files/phrase_frequency_with_df.csv")

In [None]:
# Print the new DataFrame
total_df