In [2]:
# Try to get most occurring sequences without SPMF
import pandas as pd
from text_cleaner import *
from tqdm import tqdm
import itertools

archetype_list = ['artist',
                 'caregiver',
                 'everyman',
                 'explorer',
                 'guru',
                 'hero',
                 'innocent',
                 'jester',
                 'magician',
                 'rebel',
                 'ruler',
                 'seducer']

In [2]:
# Load the Twitter dataset
twitter_df = pd.read_csv('tweets_06_03_2021.csv', index_col=0)

# Print the head of the loaded dataset
twitter_df.head()

Unnamed: 0,_id,tweet_text,username,created_at,timestamp,archetype
0,5f9f1c36b38e10f823bf2cdc,"@AndruEdwards The hard work has paid off, this...",LEGO_Group,2020-11-01 19:32:05.000,,artist
1,5f9f1c36b38e10f823bf2cdd,@soosupersam A great way to surprise your love...,LEGO_Group,2020-11-01 19:09:40.000,,artist
2,5f9f1c36b38e10f823bf2cde,"You can now just bring the fun home, and reliv...",LEGO_Group,2020-11-01 14:00:36.000,,artist
3,5f9f1c36b38e10f823bf2cdf,@at_knb Happy birthday to the master builder! ...,LEGO_Group,2020-10-31 17:16:57.000,,artist
4,5f9f1c36b38e10f823bf2ce0,@dizunatsu 😀😀,LEGO_Group,2020-10-31 15:18:50.000,,artist


In [3]:
# Clean-up the texts
twitter_df["cleaned_text"] = twitter_df["tweet_text"].apply(lambda x: clean_up_text(x))

# Tokenize the cleaned texts
twitter_df["cleaned_text"] = twitter_df["cleaned_text"].apply(lambda x: nltk.word_tokenize(x))

# Remove the stopwords
twitter_df["cleaned_text"] = twitter_df["cleaned_text"].apply(lambda x: remove_stopwords(x))

# Drop the rows with empty 'cleaned_text' field
twitter_df = twitter_df.drop(twitter_df[twitter_df['cleaned_text'].map(len) < 1].index)

# Print the new head of the dataset
twitter_df.head()

Unnamed: 0,_id,tweet_text,username,created_at,timestamp,archetype,cleaned_text
0,5f9f1c36b38e10f823bf2cdc,"@AndruEdwards The hard work has paid off, this...",LEGO_Group,2020-11-01 19:32:05.000,,artist,"[hard, work, paid, awesome]"
1,5f9f1c36b38e10f823bf2cdd,@soosupersam A great way to surprise your love...,LEGO_Group,2020-11-01 19:09:40.000,,artist,"[great, way, surprise, loved, one]"
2,5f9f1c36b38e10f823bf2cde,"You can now just bring the fun home, and reliv...",LEGO_Group,2020-11-01 14:00:36.000,,artist,"[bring, fun, home, relive, favorite, childhood..."
3,5f9f1c36b38e10f823bf2cdf,@at_knb Happy birthday to the master builder! ...,LEGO_Group,2020-10-31 17:16:57.000,,artist,"[happy, birthday, master, builder, hope, magic..."
6,5f9f1c36b38e10f823bf2ce2,@Ranchie This is the way! 😀,LEGO_Group,2020-10-31 15:16:26.000,,artist,[way]


In [4]:
# Get all possible phrases, varying in length from 1 to 3
cleaned = twitter_df.cleaned_text
cleaned.to_csv("cleaned_text.csv")

In [15]:
# Read the file and chunkize it
import ast

words = set()
two_word_phrases = set()
three_word_phrases = set()

# Iterate over all the tweets to get all possible combinations
chunks = pd.read_csv("cleaned_text.csv", sep=",", index_col=0, chunksize=1000)

results = []
orphans = pd.DataFrame()

for chunk in tqdm(chunks):
    chunk["cleaned_text"] = chunk["cleaned_text"].apply(lambda x: ast.literal_eval(x))
    
    chunk = pd.concat((orphans, chunk))
    last_val = chunk["cleaned_text"].iloc[-1]
    is_orphan = chunk["cleaned_text"] == tuple(last_val)
    
    chunk, orphans = chunk[~is_orphan], chunk[is_orphan]
    
    for _, value in chunk.iteritems():
        print(value)
        words.update(set(pvalue))
        two_word_phrases.update(set(itertools.permutations(value, 2)))
        three_word_phrases.update(set(itertools.permutations(value, 3)))

if len(orphans):
    for _, value in orphans.iteritems():
        words.update(set(pvalue))
        two_word_phrases.update(set(itertools.permutations(value, 2)))
        three_word_phrases.update(set(itertools.permutations(value, 3)))


0it [00:00, ?it/s]

0                             [hard, work, paid, awesome]
1                      [great, way, surprise, loved, one]
2       [bring, fun, home, relive, favorite, childhood...
3       [happy, birthday, master, builder, hope, magic...
6                                                   [way]
                              ...                        
1040                                           [diligent]
1041          [jobski, appreciation, thread, parttimeufo]
1042              [reminder, pm, midnight, acnhhalloween]
1043    [walking, island, residents, energy, tonight, ...
1044    [ruined, king, league, legends, story, single,...
Name: cleaned_text, Length: 1000, dtype: object





NameError: name 'phrase' is not defined

In [None]:
# Print length of sets
print(f"Number of unique words: {len(words)}")
print(f"Number of unique 2-words: {len(two_word_phrases)}")
print(f"Number of unique 3-words: {len(three_word_phrases)}")