In [111]:
# Most popular word/n-gram extraction

import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from tqdm import tqdm

# Stopwords object
stop_words = set(stopwords.words('english')) 

# Regular expressions used for the text cleaning
URL_PATTERN = re.compile(r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)")
USERNAME_PATTERN = re.compile('@[^\s]+')
ALPHA_PATTERN = re.compile(r"[^a-zA-Z]")
LONG_SEQ_PATTERN = re.compile(r"(.)\1\1+")
REPL_SEQ_PATTERN = r"\1\1"

def clean_up_text(tweet: str) -> str:
    """Cleans up the Tweet's text from emojis, usernames, links, etc."""
    lc_tweet = tweet.lower()
    
    # Remove unnecessary characters
    lc_tweet = re.sub(URL_PATTERN, '', lc_tweet)
    lc_tweet = re.sub(USERNAME_PATTERN, '', lc_tweet)
    lc_tweet = re.sub(ALPHA_PATTERN, ' ', lc_tweet)
    lc_tweet = re.sub(LONG_SEQ_PATTERN, REPL_SEQ_PATTERN, lc_tweet)
    
    return lc_tweet    

def remove_stopwords(word_list: list) -> list:
    """Removes the stopwords from the tokenized tweet."""
    out_list = []
    for word in word_list:
        if word not in stop_words:
            out_list.append(word)
            
    return out_list

In [57]:
# Load the Twitter dataset
twitter_df = pd.read_csv('tweets_21_02_2021.csv', index_col=0)

# Print the head of the loaded dataset
twitter_df.head()

Unnamed: 0,_id,tweet_text,username,created_at,timestamp,archetype
0,5f9f1c36b38e10f823bf2cdc,"@AndruEdwards The hard work has paid off, this...",LEGO_Group,2020-11-01 19:32:05.000,,artist
1,5f9f1c36b38e10f823bf2cdd,@soosupersam A great way to surprise your love...,LEGO_Group,2020-11-01 19:09:40.000,,artist
2,5f9f1c36b38e10f823bf2cde,"You can now just bring the fun home, and reliv...",LEGO_Group,2020-11-01 14:00:36.000,,artist
3,5f9f1c36b38e10f823bf2cdf,@at_knb Happy birthday to the master builder! ...,LEGO_Group,2020-10-31 17:16:57.000,,artist
4,5f9f1c36b38e10f823bf2ce0,@dizunatsu 😀😀,LEGO_Group,2020-10-31 15:18:50.000,,artist


In [58]:
# Clean-up the texts
twitter_df["cleaned_text"] = twitter_df["tweet_text"].apply(lambda x: clean_up_text(x))

# Tokenize the cleaned texts
twitter_df["cleaned_text"] = twitter_df["cleaned_text"].apply(lambda x: nltk.word_tokenize(x))

# Remove the stopwords
twitter_df["cleaned_text"] = twitter_df["cleaned_text"].apply(lambda x: remove_stopwords(x))

# Drop the rows with empty 'cleaned_text' field
twitter_df = twitter_df.drop(twitter_df[twitter_df['cleaned_text'].map(len) < 1].index)

# Print the new head of the dataset
twitter_df.head()

Unnamed: 0,_id,tweet_text,username,created_at,timestamp,archetype,cleaned_text
0,5f9f1c36b38e10f823bf2cdc,"@AndruEdwards The hard work has paid off, this...",LEGO_Group,2020-11-01 19:32:05.000,,artist,"[hard, work, paid, awesome]"
1,5f9f1c36b38e10f823bf2cdd,@soosupersam A great way to surprise your love...,LEGO_Group,2020-11-01 19:09:40.000,,artist,"[great, way, surprise, loved, one]"
2,5f9f1c36b38e10f823bf2cde,"You can now just bring the fun home, and reliv...",LEGO_Group,2020-11-01 14:00:36.000,,artist,"[bring, fun, home, relive, favorite, childhood..."
3,5f9f1c36b38e10f823bf2cdf,@at_knb Happy birthday to the master builder! ...,LEGO_Group,2020-10-31 17:16:57.000,,artist,"[happy, birthday, master, builder, hope, magic..."
6,5f9f1c36b38e10f823bf2ce2,@Ranchie This is the way! 😀,LEGO_Group,2020-10-31 15:16:26.000,,artist,[way]


In [92]:
# Example 1 - try to get the most occurring words in the 'artist' archetype subset
# Extract all the tweets for the 'artist' archetype
artist_df = twitter_df.cleaned_text[twitter_df["archetype"] == "artist"]

# Reset the index of the subset
artist_df = artist_df.reset_index(drop=True)

# Print the head of the subset
artist_df.head()

0                          [hard, work, paid, awesome]
1                   [great, way, surprise, loved, one]
2    [bring, fun, home, relive, favorite, childhood...
3    [happy, birthday, master, builder, hope, magic...
4                                                [way]
Name: cleaned_text, dtype: object

In [93]:
# Extract the most often occuring words in the subset
fdist = nltk.FreqDist(word for text in artist_df.tolist() for word in text)

# Get the word list sorted by frequency
sorted_words = fdist.most_common(len(fdist))

In [94]:
# Get the unique word count in the archetype
word_cnt = 0
for k,v in sorted_words:
    word_cnt += v
print(f"Word count in artist archetype: {word_cnt}")

Word count in artist archetype: 125656


In [95]:
# Create a dataframe with word frequency
freq_df = pd.DataFrame(sorted_words, columns=["chunk", "frequency"])

# Print the head of the dataset
freq_df.head()

Unnamed: 0,chunk,frequency
0,us,1373
1,get,1051
2,new,859
3,team,852
4,hi,761


In [96]:
# Create the coefficient based on the word appearance in a given corpus
freq_df["frequency"] = freq_df["frequency"].apply(lambda x: float(x) / word_cnt)

# Print the head of the dataset
freq_df.head()

Unnamed: 0,chunk,frequency
0,us,0.010927
1,get,0.008364
2,new,0.006836
3,team,0.00678
4,hi,0.006056


In [105]:
# Get the bigram and trigram count in the given corpus
from collections import Counter
from nltk.util import bigrams, trigrams

bigrams = artist_df.apply(lambda row: list(nltk.ngrams(row, 2)))
trigrams = artist_df.apply(lambda row: list(nltk.ngrams(row, 3)))

bigrams_freq = nltk.FreqDist(gram for text in bigrams for gram in text)
trigrams_freq = nltk.FreqDist(gram for text in trigrams for gram in text)
                                                        
# Sort bi- and trigrams
sorted_bigrams = bigrams_freq.most_common(len(bigrams_freq))
sorted_trigrams = trigrams_freq.most_common(len(trigrams_freq))

In [106]:
# Get the unique n-gram count in the archetype
bi_cnt = 0
for k,v in sorted_bigrams:
    bi_cnt += v
print(f"Bigram count in artist archetype: {bi_cnt}")

tri_cnt = 0
for k,v in sorted_trigrams:
    tri_cnt += v
print(f"Trigram count in artist archetype: {tri_cnt}")

Bigram count in artist archetype: 115081
Trigram count in artist archetype: 104977


In [107]:
# Create coefficients for n-grams
bigram_freq = pd.DataFrame(sorted_bigrams, columns=["chunk", "frequency"])
trigram_freq = pd.DataFrame(sorted_trigrams, columns=["chunk", "frequency"])

# Scale the coefficients
bigram_freq["frequency"] = bigram_freq["frequency"].apply(lambda x: float(x) / bi_cnt)
trigram_freq["frequency"] = trigram_freq["frequency"].apply(lambda x: float(x) / tri_cnt)

# Concatenate the frames
frames = [freq_df, bigram_freq, trigram_freq]
full_freq = pd.concat(frames, ignore_index=True)

In [108]:
# Print the new frequency DataFrame
print(full_freq)

                              chunk  frequency
0                                us   0.010927
1                               get   0.008364
2                               new   0.006836
3                              team   0.006780
4                                hi   0.006056
...                             ...        ...
127825   (seen, lookalikes, celebs)   0.000010
127826  (lookalikes, celebs, great)   0.000010
127827        (celebs, great, love)   0.000010
127828          (great, love, real)   0.000010
127829           (love, real, best)   0.000010

[127830 rows x 2 columns]


In [137]:
# Create full frequency table for all of the archetypes
archetype_list = ['artist',
                 'caregiver',
                 'everyman',
                 'explorer',
                 'guru',
                 'hero',
                 'innocent',
                 'jester',
                 'magician',
                 'rebel',
                 'ruler',
                 'seducer']

full_df = pd.DataFrame(columns=['chunk'] + archetype_list)


for archetype in tqdm(archetype_list):
    bigrams = None
    trigrams = None
    bigrams_freq = None
    trigrams_freq = None
    tmp_df = None
    
    tmp_df = twitter_df.cleaned_text[twitter_df["archetype"] == archetype]

    # Reset the index of the subset
    tmp_df = tmp_df.reset_index(drop=True)
    
    # Extract the most often occuring words in the subset
    fdist = nltk.FreqDist(word for text in tmp_df.tolist() for word in text)

    # Get the word list sorted by frequency
    sorted_words = fdist.most_common(len(fdist))
    
    # Create a dataframe with word frequency
    freq_df = pd.DataFrame(sorted_words, columns=["chunk", archetype])
    
    # Create the coefficient based on the word appearance in a given corpus
    freq_df[archetype] = freq_df[archetype].apply(lambda x: float(x) / word_cnt)
    
    bigrams = tmp_df.apply(lambda row: list(nltk.ngrams(row, 2)))
    trigrams = tmp_df.apply(lambda row: list(nltk.ngrams(row, 3)))

    bigrams_freq = nltk.FreqDist(gram for text in bigrams for gram in text)
    trigrams_freq = nltk.FreqDist(gram for text in trigrams for gram in text)

    # Sort bi- and trigrams
    sorted_bigrams = bigrams_freq.most_common(len(bigrams_freq))
    sorted_trigrams = trigrams_freq.most_common(len(trigrams_freq))
    
    # Get the unique n-gram count in the archetype
    bi_cnt = 0
    for k,v in sorted_bigrams:
        bi_cnt += v
    print(f"Bigram count in {archetype} archetype: {bi_cnt}")

    tri_cnt = 0
    for k,v in sorted_trigrams:
        tri_cnt += v
    print(f"Trigram count in {archetype} archetype: {tri_cnt}")
    
    # Create coefficients for n-grams
    bigram_freq = pd.DataFrame(sorted_bigrams, columns=["chunk", archetype])
    trigram_freq = pd.DataFrame(sorted_trigrams, columns=["chunk", archetype])

    # Scale the coefficients
    bigram_freq[archetype] = bigram_freq[archetype].apply(lambda x: float(x) / bi_cnt)
    trigram_freq[archetype] = trigram_freq[archetype].apply(lambda x: float(x) / tri_cnt)

    # Concatenate the frames
    frames = [freq_df, bigram_freq, trigram_freq]
    full_freq = pd.concat(frames, ignore_index=True)
    
    # Concatenate the new frame and the base
    frames = [full_df, full_freq]
    full_df = pd.concat(frames, ignore_index=True)

  8%|▊         | 1/12 [00:00<00:04,  2.74it/s]

Bigram count in artist archetype: 115081
Trigram count in artist archetype: 104977


 17%|█▋        | 2/12 [00:00<00:03,  3.27it/s]

Bigram count in caregiver archetype: 71474
Trigram count in caregiver archetype: 65950


 25%|██▌       | 3/12 [00:00<00:02,  3.32it/s]

Bigram count in everyman archetype: 60704
Trigram count in everyman archetype: 53405


 33%|███▎      | 4/12 [00:01<00:02,  2.71it/s]

Bigram count in explorer archetype: 113752
Trigram count in explorer archetype: 104160
Bigram count in guru archetype: 128649
Trigram count in guru archetype: 118189


 42%|████▏     | 5/12 [00:02<00:03,  2.15it/s]

Bigram count in hero archetype: 60557
Trigram count in hero archetype: 55815


 50%|█████     | 6/12 [00:02<00:03,  1.78it/s]

Bigram count in innocent archetype: 70759
Trigram count in innocent archetype: 64200


 58%|█████▊    | 7/12 [00:03<00:02,  1.82it/s]

Bigram count in jester archetype: 88435
Trigram count in jester archetype: 79303


 67%|██████▋   | 8/12 [00:03<00:02,  1.77it/s]

Bigram count in magician archetype: 110600
Trigram count in magician archetype: 100338


 75%|███████▌  | 9/12 [00:04<00:01,  1.59it/s]

Bigram count in rebel archetype: 57700
Trigram count in rebel archetype: 52993


 83%|████████▎ | 10/12 [00:05<00:01,  1.54it/s]

Bigram count in ruler archetype: 91743
Trigram count in ruler archetype: 83326


 92%|█████████▏| 11/12 [00:06<00:00,  1.43it/s]

Bigram count in seducer archetype: 46117
Trigram count in seducer archetype: 42757


100%|██████████| 12/12 [00:06<00:00,  1.74it/s]


In [138]:
# Fill all NaN with 0.0
full_df = full_df.fillna(0.0)

# Aggregate the results
aggregate_func = {
    "chunk": "first",
    "artist": "sum",
    "caregiver": "sum",
    "everyman": "sum",
    "explorer": "sum",
    "guru": "sum",
    "hero": "sum",
    "innocent": "sum",
    "jester": "sum",
    "magician": "sum",
    "rebel": "sum",
    "ruler": "sum",
    "seducer": "sum"
}

full_df = full_df.groupby("chunk").aggregate(aggregate_func)
full_df = full_df.reset_index(drop=True)

# Show the aggregated results
full_df

Unnamed: 0,chunk,artist,caregiver,everyman,explorer,guru,hero,innocent,jester,magician,rebel,ruler,seducer
0,"(aa, batteries)",0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000009,0.0,0.0,0.0
1,"(aa, came)",0.0,0.000000,0.000016,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
2,"(aa, came, rescue)",0.0,0.000000,0.000019,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
3,"(aa, gold)",0.0,0.000000,0.000000,0.000009,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
4,"(aa, gold, director)",0.0,0.000000,0.000000,0.000010,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1034655,zyi,0.0,0.000024,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
1034656,zyoom,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000016,0.0,0.0,0.0
1034657,zyra,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000008,0.0,0.0,0.0
1034658,zz,0.0,0.000000,0.000000,0.000000,0.0,0.000008,0.0,0.0,0.000000,0.0,0.0,0.0


In [139]:
# Save the file
full_df.to_csv("word_ngram_frequency.csv")