In [54]:
# SPMF - try to get most occurring sequences
from spmf import Spmf
import pandas as pd
from text_cleaner import *
from tqdm import tqdm
import itertools

## Loading the dataset and cleaning the text

In [3]:
# Load the Twitter dataset
twitter_df = pd.read_csv('tweets_28_02_2021.csv', index_col=0)

# Print the head of the loaded dataset
twitter_df.head()

Unnamed: 0,_id,tweet_text,username,created_at,timestamp,archetype
0,5f9f1c36b38e10f823bf2cef,"@eliostruyf So exciting, have fun! 😊",LEGO_Group,2020-10-30 18:23:50.000,,artist
1,5f9f1c36b38e10f823bf2ce7,These Brick-O-Lanterns are certainly all treat...,LEGO_Group,2020-10-31 09:00:28.000,,artist
2,5f9f1c36b38e10f823bf2d0a,@dentistescabri Nous prenons la sécurité de no...,LEGO_Group,2020-10-30 12:07:58.000,,artist
3,5f9f1c36b38e10f823bf2cf5,@Jasmin80212446 😍🎄🥰,LEGO_Group,2020-10-30 16:35:39.000,,artist
4,5f9f1c36b38e10f823bf2d07,@ashleydrixey Sounds like a perfect fit for th...,LEGO_Group,2020-10-30 13:09:14.000,,artist


In [4]:
# Clean-up the texts
twitter_df["cleaned_text"] = twitter_df["tweet_text"].apply(lambda x: clean_up_text(x))

# Tokenize the cleaned texts
twitter_df["cleaned_text"] = twitter_df["cleaned_text"].apply(lambda x: nltk.word_tokenize(x))

# Remove the stopwords
twitter_df["cleaned_text"] = twitter_df["cleaned_text"].apply(lambda x: remove_stopwords(x))

# Drop the rows with empty 'cleaned_text' field
twitter_df = twitter_df.drop(twitter_df[twitter_df['cleaned_text'].map(len) < 1].index)

# Print the new head of the dataset
twitter_df.head()

Unnamed: 0,_id,tweet_text,username,created_at,timestamp,archetype,cleaned_text
0,5f9f1c36b38e10f823bf2cef,"@eliostruyf So exciting, have fun! 😊",LEGO_Group,2020-10-30 18:23:50.000,,artist,"[exciting, fun]"
1,5f9f1c36b38e10f823bf2ce7,These Brick-O-Lanterns are certainly all treat...,LEGO_Group,2020-10-31 09:00:28.000,,artist,"[brick, lanterns, certainly, treat, trick, get..."
2,5f9f1c36b38e10f823bf2d0a,@dentistescabri Nous prenons la sécurité de no...,LEGO_Group,2020-10-30 12:07:58.000,,artist,"[nous, prenons, la, curit, de, nos, fans, tr, ..."
4,5f9f1c36b38e10f823bf2d07,@ashleydrixey Sounds like a perfect fit for th...,LEGO_Group,2020-10-30 13:09:14.000,,artist,"[sounds, like, perfect, fit, aspiring, young, ..."
5,5f9f1c36b38e10f823bf2d08,@irgator04 What a perfect way to start your we...,LEGO_Group,2020-10-30 12:49:37.000,,artist,"[perfect, way, start, weekend]"


## Testing SPMF on 'artist' archetype

In [5]:
# Example 1 - try to get the most occurring words in the 'artist' archetype subset
# Extract all the tweets for the 'artist' archetype
artist_df = twitter_df.cleaned_text[twitter_df["archetype"] == "artist"]

# Reset the index of the subset
artist_df = artist_df.reset_index(drop=True)

# Print the head of the subset
artist_df.head()

0                                      [exciting, fun]
1    [brick, lanterns, certainly, treat, trick, get...
2    [nous, prenons, la, curit, de, nos, fans, tr, ...
3    [sounds, like, perfect, fit, aspiring, young, ...
4                       [perfect, way, start, weekend]
Name: cleaned_text, dtype: object

In [31]:
# Create a list of sentences
artist_list = [" ".join(row) for row in artist_df.tolist()]

In [36]:
# SPMF - get the most frequent sequences
spmf = Spmf("PrefixSpan", input_direct=artist_list,
            output_filename="output.txt", arguments=[0.001, 3], input_type="text")
spmf.run()
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output.csv")

>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 328 ms
 Frequent sequences count : 13662
 Max memory (mb) : 126.75393676757812
 minsup = 12 sequences.
 Pattern count : 13662

Post-processing to show result in terms of string values.
Post-processing completed.

                   pattern  sup
0               [exciting]   46
1                    [fun]  171
2               [fun, get]   14
3          [fun, building]   20
4         [fun, available]   12
...                    ...  ...
13657       [lunarnewyear]   13
13658  [blackhistorymonth]   12
13659       [goldenglobes]   14
13660     [nintendodirect]   62
13661             [sinnoh]   12

[13662 rows x 2 columns]


# Calculating most frequent sequences for all of the archetypes

In [65]:
# Generating 
archetype_list = ['artist',
                 'caregiver',
                 'everyman',
                 'explorer',
                 'guru',
                 'hero',
                 'innocent',
                 'jester',
                 'magician',
                 'rebel',
                 'ruler',
                 'seducer']

full_df = pd.DataFrame(columns=['chunk'] + archetype_list)

# Iterate over archetypes
for archetype in tqdm(archetype_list):
    # Extract all the tweets for the 'artist' archetype
    tmp_df = twitter_df.cleaned_text[twitter_df["archetype"] == archetype]

    # Reset the index of the subset
    tmp_df = tmp_df.reset_index(drop=True)
    
    # Calculate number of words, number of two-word and three-word combinations
    unique_words = set()
    unique_two_words = set()
    unique_three_words = set()
    
    for row in tmp_df.tolist():
        unique_words.update(row)
        unique_two_words.update(itertools.permutations(row, 2))
        unique_three_words.update(itertools.permutations(row, 3))
    
    cnt_single_word = len(unique_words)
    cnt_two_words = len(unique_two_words)
    cnt_three_words = len(unique_three_words)
    
    # Create a list of sentences
    tmp_list = [" ".join(row) for row in tmp_df.tolist()]
    
    # SPMF - get the most frequent sequences
    spmf = Spmf("PrefixSpan", input_direct=tmp_list,
                output_filename=f"sequence_files/output_{archetype}.txt", arguments=[0.001, 3], input_type="text")
    spmf.run()
    
    spmf = spmf.to_pandas_dataframe(pickle=True)
    
    # Get the TF
    spmf.sup[spmf.pattern.map(len) == 1] = spmf.sup[spmf.pattern.map(len) == 1].apply(lambda x: float(x / cnt_single_word))
    spmf.sup[spmf.pattern.map(len) == 2] = spmf.sup[spmf.pattern.map(len) == 2].apply(lambda x: float(x / cnt_two_words)) 
    spmf.sup[spmf.pattern.map(len) == 3] = spmf.sup[spmf.pattern.map(len) == 3].apply(lambda x: float(x / cnt_three_words)) 
    
    print(spmf)
    spmf.to_csv(f"sequence_files/output_{archetype}.csv")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spmf.sup[spmf.pattern.map(len) == 1] = spmf.sup[spmf.pattern.map(len) == 1].apply(lambda x: float(x / cnt_single_word))
  8%|▊         | 1/12 [00:13<02:33, 13.95s/it]

>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 288 ms
 Frequent sequences count : 13662
 Max memory (mb) : 126.65481567382812
 minsup = 12 sequences.
 Pattern count : 13662

Post-processing to show result in terms of string values.
Post-processing completed.

                   pattern       sup
0               [exciting]  0.004242
1                    [fun]  0.015768
2               [fun, get]  0.000018
3          [fun, building]  0.000026
4         [fun, available]  0.000015
...                    ...       ...
13657       [lunarnewyear]  0.001199
13658  [blackhistorymonth]  0.001107
13659       [goldenglobes]  0.001291
13660     [nintendodirect]  0.005717
13661             [sinnoh]  0.001107

[13662 rows x 2 columns]
>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 279 ms
 Frequent sequences count : 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spmf.sup[spmf.pattern.map(len) == 1] = spmf.sup[spmf.pattern.map(len) == 1].apply(lambda x: float(x / cnt_single_word))
 17%|█▋        | 2/12 [00:26<02:11, 13.20s/it]

                    pattern           sup
0                   [great]  1.425055e-02
1             [great, news]  1.833057e-05
2               [great, us]  3.462442e-05
3           [great, us, us]  6.287541e-07
4         [great, us, form]  6.287541e-07
...                     ...           ...
47059               [korea]  1.036404e-03
47060            [eritrean]  1.813706e-03
47061  [eritrean, soldiers]  1.425711e-05
47062                [view]  1.036404e-03
47063                [axum]  1.165954e-03

[47064 rows x 2 columns]
>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 222 ms
 Frequent sequences count : 23948
 Max memory (mb) : 100.57962799072266
 minsup = 9 sequences.
 Pattern count : 23948

Post-processing to show result in terms of string values.
Post-processing completed.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spmf.sup[spmf.pattern.map(len) == 1] = spmf.sup[spmf.pattern.map(len) == 1].apply(lambda x: float(x / cnt_single_word))
 25%|██▌       | 3/12 [00:34<01:37, 10.80s/it]

                                pattern           sup
0                               [hello]  4.234621e-02
1                    [hello, apologize]  1.192696e-04
2         [hello, apologize, confusion]  1.968917e-06
3             [hello, apologize, tweet]  1.968917e-06
4      [hello, apologize, accidentally]  1.968917e-06
...                                 ...           ...
23943                  [frisco, tx, tx]  9.844584e-07
23944                         [houston]  1.287554e-03
23945                     [houston, tx]  2.190666e-05
23946                         [prairie]  1.430615e-03
23947                     [prairie, tx]  2.434073e-05

[23948 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spmf.sup[spmf.pattern.map(len) == 1] = spmf.sup[spmf.pattern.map(len) == 1].apply(lambda x: float(x / cnt_single_word))
 33%|███▎      | 4/12 [00:45<01:28, 11.00s/it]

>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 285 ms
 Frequent sequences count : 15553
 Max memory (mb) : 111.60638427734375
 minsup = 11 sequences.
 Pattern count : 15553

Post-processing to show result in terms of string values.
Post-processing completed.

                              pattern           sup
0                            [crisis]  2.357648e-03
1                           [reaches]  7.858827e-04
2                             [point]  1.571765e-03
3                              [team]  4.500964e-02
4                        [team, team]  1.634964e-05
...                               ...           ...
15548      [depict, including, upper]  9.614844e-07
15549    [depict, including, friends]  9.614844e-07
15550  [depict, including, peninsula]  9.614844e-07
15551     [depict, including, family]  9.614844e-07
15552                     [peninsula]  1.500321e-03

[15553 rows x 2 colum

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spmf.sup[spmf.pattern.map(len) == 1] = spmf.sup[spmf.pattern.map(len) == 1].apply(lambda x: float(x / cnt_single_word))
 42%|████▏     | 5/12 [01:04<01:35, 13.59s/it]

                     pattern       sup
0                [thousands]  0.003294
1                    [covid]  0.043062
2            [covid, around]  0.000010
3           [covid, country]  0.000010
4              [covid, even]  0.000013
...                      ...       ...
13347             [gamestop]  0.002684
13348               [reddit]  0.000854
13349         [perseverance]  0.000976
13350   [perseverance, mars]  0.000012
13351  [perseverance, rover]  0.000010

[13352 rows x 2 columns]
>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 241 ms
 Frequent sequences count : 45434
 Max memory (mb) : 114.45025634765625
 minsup = 6 sequences.
 Pattern count : 45434

Post-processing to show result in terms of string values.
Post-processing completed.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spmf.sup[spmf.pattern.map(len) == 1] = spmf.sup[spmf.pattern.map(len) == 1].apply(lambda x: float(x / cnt_single_word))
 50%|█████     | 6/12 [01:19<01:24, 14.08s/it]

                              pattern           sup
0                               [hey]  9.787928e-03
1                           [hey, tu]  5.081370e-05
2                    [hey, tu, venir]  8.182311e-07
3                       [hey, tu, en]  1.338924e-06
4                       [hey, tu, dm]  1.190154e-06
...                               ...           ...
45429       [management, visit, find]  8.926158e-07
45430  [management, visit, emergency]  9.670004e-07
45431       [management, visit, near]  4.463079e-07
45432                        [storms]  7.612833e-04
45433                          [weso]  7.612833e-04

[45434 rows x 2 columns]
>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 246 ms
 Frequent sequences count : 37534
 Max memory (mb) : 118.01627349853516
 minsup = 8 sequences.
 Pattern count : 37534

Post-processing to show result in terms of string values.
Post-processing completed

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spmf.sup[spmf.pattern.map(len) == 1] = spmf.sup[spmf.pattern.map(len) == 1].apply(lambda x: float(x / cnt_single_word))
 58%|█████▊    | 7/12 [01:30<01:06, 13.29s/it]

                       pattern           sup
0                        [due]  9.835025e-03
1                  [due, high]  7.821054e-05
2      [due, high, prioritize]  7.558529e-07
3          [due, high, demand]  3.359346e-06
4         [due, high, certain]  3.359346e-06
...                        ...           ...
37529         [valentine, day]  2.234587e-05
37530                [tagging]  1.586294e-03
37531            [tagging, us]  2.793234e-05
37532      [tagging, us, find]  7.558529e-07
37533          [tagging, find]  1.675940e-05

[37534 rows x 2 columns]
>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 245 ms
 Frequent sequences count : 28097
 Max memory (mb) : 114.41995239257812
 minsup = 11 sequences.
 Pattern count : 28097

Post-processing to show result in terms of string values.
Post-processing completed.

                 pattern       sup
0               [people]  0.014386
1       [p

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spmf.sup[spmf.pattern.map(len) == 1] = spmf.sup[spmf.pattern.map(len) == 1].apply(lambda x: float(x / cnt_single_word))
 67%|██████▋   | 8/12 [01:41<00:49, 12.34s/it]

>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 378 ms
 Frequent sequences count : 30194
 Max memory (mb) : 125.59005737304688
 minsup = 12 sequences.
 Pattern count : 30194

Post-processing to show result in terms of string values.
Post-processing completed.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spmf.sup[spmf.pattern.map(len) == 1] = spmf.sup[spmf.pattern.map(len) == 1].apply(lambda x: float(x / cnt_single_word))
 75%|███████▌  | 9/12 [01:56<00:39, 13.18s/it]

                   pattern           sup
0                  [offer]  7.573263e-03
1            [offer, food]  2.013247e-05
2                   [food]  1.275930e-02
3             [food, food]  3.523183e-05
4       [food, food, open]  6.099919e-07
...                    ...           ...
30189            [healthy]  1.152453e-03
30190             [amends]  1.564043e-03
30191     [amends, policy]  2.264903e-05
30192              [jenny]  9.878169e-04
30193  [blackhistorymonth]  1.070135e-03

[30194 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spmf.sup[spmf.pattern.map(len) == 1] = spmf.sup[spmf.pattern.map(len) == 1].apply(lambda x: float(x / cnt_single_word))
 83%|████████▎ | 10/12 [02:06<00:24, 12.42s/it]

>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 194 ms
 Frequent sequences count : 18563
 Max memory (mb) : 88.94873046875
 minsup = 6 sequences.
 Pattern count : 18563

Post-processing to show result in terms of string values.
Post-processing completed.

                 pattern       sup
0              [mission]  0.003156
1                 [need]  0.010438
2           [need, help]  0.000029
3             [need, us]  0.000015
4            [need, new]  0.000021
...                  ...       ...
18558    [valentinesday]  0.000728
18559      [bankbalance]  0.010074
18560               [vh]  0.000728
18561            [fauci]  0.001456
18562  [fauci, whataday]  0.000017

[18563 rows x 2 columns]
>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 257 ms
 Frequent sequences count : 16649
 Max memory (mb) : 101.

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spmf.sup[spmf.pattern.map(len) == 1] = spmf.sup[spmf.pattern.map(len) == 1].apply(lambda x: float(x / cnt_single_word))
 92%|█████████▏| 11/12 [02:17<00:11, 11.82s/it]

                          pattern           sup
0                           [aww]  9.980040e-04
1                         [thank]  3.629105e-02
2                   [thank, much]  3.595630e-05
3               [thank, much, us]  7.855333e-07
4                   [thank, kind]  1.926231e-05
...                           ...           ...
16644  [blackhistorymonth, black]  1.284154e-05
16645               [mckinseybhm]  1.542370e-03
16646                       [bhm]  9.072764e-04
16647                 [valentine]  1.179459e-03
16648             [valentinesday]  1.088732e-03

[16649 rows x 2 columns]
>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 255 ms
 Frequent sequences count : 71403
 Max memory (mb) : 104.82957458496094
 minsup = 4 sequences.
 Pattern count : 71403

Post-processing to show result in terms of string values.
Post-processing completed.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spmf.sup[spmf.pattern.map(len) == 1] = spmf.sup[spmf.pattern.map(len) == 1].apply(lambda x: float(x / cnt_single_word))


                             pattern           sup
0                            [story]  7.396915e-03
1                       [story, kim]  1.126215e-05
2                [story, kim, jones]  4.750281e-07
3                     [story, jones]  1.126215e-05
4                        [story, us]  1.970876e-05
...                              ...           ...
71398  [beauvais, arthur, kersauson]  4.750281e-07
71399         [beauvais, arthur, de]  4.750281e-07
71400          [beauvais, kersauson]  1.126215e-05
71401                 [beauvais, de]  1.126215e-05
71402      [beauvais, de, kersauson]  4.750281e-07

[71403 rows x 2 columns]


100%|██████████| 12/12 [02:27<00:00, 12.33s/it]


In [74]:
# Merge all files into a single dataframe
total_df = pd.DataFrame(columns=["pattern"] + archetype_list)
for archetype in tqdm(archetype_list):
    # Read file
    new_df = pd.read_csv(f"sequence_files/output_{archetype}.csv", sep=",", index_col=0)
    new_df = new_df.rename(columns={"sup": archetype})
    
    # Merge it with the other ones
    frames = [total_df, new_df]
    total_df = pd.concat(frames, ignore_index=True)

100%|██████████| 12/12 [00:00<00:00, 12.91it/s]


In [75]:
# Print dataframe
total_df

Unnamed: 0,pattern,artist,caregiver,everyman,explorer,guru,hero,innocent,jester,magician,rebel,ruler,seducer
0,['exciting'],0.004242,,,,,,,,,,,
1,['fun'],0.015768,,,,,,,,,,,
2,"['fun', 'get']",0.000018,,,,,,,,,,,
3,"['fun', 'building']",0.000026,,,,,,,,,,,
4,"['fun', 'available']",0.000015,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
361448,"['beauvais', 'arthur', 'kersauson']",,,,,,,,,,,,4.75028e-07
361449,"['beauvais', 'arthur', 'de']",,,,,,,,,,,,4.75028e-07
361450,"['beauvais', 'kersauson']",,,,,,,,,,,,1.12621e-05
361451,"['beauvais', 'de']",,,,,,,,,,,,1.12621e-05


In [77]:
aggregate_func = {
    "pattern": "first",
    "artist": "sum",
    "caregiver": "sum",
    "everyman": "sum",
    "explorer": "sum",
    "guru": "sum",
    "hero": "sum",
    "innocent": "sum",
    "jester": "sum",
    "magician": "sum",
    "rebel": "sum",
    "ruler": "sum",
    "seducer": "sum"
}
total_df = total_df.groupby("pattern").aggregate(aggregate_func)
total_df = total_df.reset_index(drop=True)

In [78]:
# Print the full DataFrame
total_df

Unnamed: 0,pattern,artist,caregiver,everyman,explorer,guru,hero,innocent,jester,magician,rebel,ruler,seducer
0,['aaron'],0.000000,0.000000,0.0,0.000786,0.000000,0.000653,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
1,['abbie'],0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.003622,0.000000,0.000000,0.0
2,['ability'],0.000000,0.000000,0.0,0.001072,0.000793,0.001740,0.000000,0.000000,0.000000,0.000971,0.000000,0.0
3,"['able', 'able']",0.000042,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
4,"['able', 'access']",0.000000,0.000000,0.0,0.000000,0.000000,0.000012,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
305816,['zip'],0.000000,0.003239,0.0,0.000000,0.000000,0.003045,0.000000,0.000000,0.000000,0.000728,0.000000,0.0
305817,['zone'],0.001107,0.000000,0.0,0.000000,0.000000,0.000870,0.000000,0.000000,0.000000,0.000000,0.000998,0.0
305818,['zoom'],0.002029,0.000000,0.0,0.000000,0.001220,0.000000,0.000000,0.001406,0.000000,0.000000,0.000000,0.0
305819,['zu'],0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.001904,0.000000,0.000000,0.000000,0.000000,0.0


In [79]:
# Save the current DataFrame
total_df.to_csv("phrase_frequency_no_df.csv")

In [None]:
# Calculate document frequency for every archetype
import math

total_df = pd.read_csv("phrase_frequency_no_df.csv", index_col=0)

for archetype in archetype_list:
    print(f"Archetype {archetype}:")
    
    twitter_subset = twitter_df.cleaned_text[twitter_df["archetype"] == archetype]
    
    # Select non-zero elements and calculate DF for every element
    for _, row in tqdm(total_df.iterrows()):
        if row[archetype] > 0.0:
            phrase = row[archetype]
            phrase_cnt = 0
            for line in twitter_subset:
                if phrase in line:
                    phrase_cnt += 1

            row[archetype] = row[archetype] * math.log(len(twitter_df) / (phrase_cnt + 1))      

  0%|          | 0/12 [00:00<?, ?it/s]