In [1]:
from spmf import Spmf
import pandas as pd
from text_cleaner import *
from tqdm import tqdm
import itertools
from tqdm import tqdm

archetype_list = ['artist',
                 'caregiver',
                 'everyman',
                 'explorer',
                 'guru',
                 'hero',
                 'innocent',
                 'jester',
                 'magician',
                 'rebel',
                 'ruler',
                 'seducer']

parameter_val = [
    0.0007,
    0.001,
    0.0007,
    0.0005,
    0.0004,
    0.001,
    0.0007,
    0.0005,
    0.0004,
    0.001,
    0.0005,
    0.0015
]

In [2]:
# Load the Twitter dataset
twitter_df = pd.read_csv('tweets_06_03_2021.csv', index_col=0)

# Print the head of the loaded dataset
twitter_df.head()

# Clean-up the texts
twitter_df["cleaned_text"] = twitter_df["tweet_text"].apply(lambda x: clean_up_text(x))

# Tokenize the cleaned texts
twitter_df["cleaned_text"] = twitter_df["cleaned_text"].apply(lambda x: nltk.word_tokenize(x))

# Remove the stopwords
twitter_df["cleaned_text"] = twitter_df["cleaned_text"].apply(lambda x: remove_stopwords(x))

# Drop the rows with empty 'cleaned_text' field
twitter_df = twitter_df.drop(twitter_df[twitter_df['cleaned_text'].map(len) < 1].index)

# Print the new head of the dataset
twitter_df.head()

Unnamed: 0,_id,tweet_text,username,created_at,timestamp,archetype,cleaned_text
0,5f9f1c36b38e10f823bf2cdc,"@AndruEdwards The hard work has paid off, this...",LEGO_Group,2020-11-01 19:32:05.000,,artist,"[hard, work, paid, awesome]"
1,5f9f1c36b38e10f823bf2cdd,@soosupersam A great way to surprise your love...,LEGO_Group,2020-11-01 19:09:40.000,,artist,"[great, way, surprise, loved, one]"
2,5f9f1c36b38e10f823bf2cde,"You can now just bring the fun home, and reliv...",LEGO_Group,2020-11-01 14:00:36.000,,artist,"[bring, fun, home, relive, favorite, childhood..."
3,5f9f1c36b38e10f823bf2cdf,@at_knb Happy birthday to the master builder! ...,LEGO_Group,2020-10-31 17:16:57.000,,artist,"[happy, birthday, master, builder, hope, magic..."
6,5f9f1c36b38e10f823bf2ce2,@Ranchie This is the way! 😀,LEGO_Group,2020-10-31 15:16:26.000,,artist,[way]


In [3]:
for archetype, param in zip(archetype_list, parameter_val):
    # Extract all the tweets for the archetype
    tmp_df = twitter_df.cleaned_text[twitter_df["archetype"] == archetype]

    # Reset the index of the subset
    tmp_df = tmp_df.reset_index(drop=True)

    # Print the head of the subset
    tmp_df.head()
    
    # Convert the word lists to full sentences, detokenization
    tmp_df = pd.concat([tmp_df, tmp_df.apply(lambda x: " ".join(x))], axis=1)
    tmp_df.columns.values[1] = "full_sentence"

    # Drop duplicates
    tmp_df.sort_values("full_sentence", inplace = True) 
    tmp_df.drop_duplicates(subset="full_sentence", keep=False, inplace=True)
    
    text_list = tmp_df["full_sentence"].tolist()

    spmf = Spmf("PrefixSpan", input_direct=text_list,
                    output_filename=f"sequence_files_minsup_4/output_{archetype}.txt", arguments=[param, 3], input_type="text")
    spmf.run()

    spmf = spmf.to_pandas_dataframe()

    spmf["sup"] = spmf["sup"] / len(spmf)
    print(spmf)
    spmf.to_csv(f"sequence_files_minsup_4/output_{archetype}.csv")

>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 237 ms
 Frequent sequences count : 28796
 Max memory (mb) : 82.10414123535156
 minsup = 4 sequences.
 Pattern count : 28796

Post-processing to show result in terms of string values.
Post-processing completed.

                         pattern       sup
0                         [able]  0.008091
1                   [able, able]  0.000729
2           [able, able, advise]  0.000139
3             [able, able, look]  0.000139
4      [able, able, information]  0.000139
...                          ...       ...
28791                    [grabs]  0.000208
28792             [grabs, today]  0.000174
28793                 [giveaway]  0.000139
28794                  [surreal]  0.000139
28795                  [venture]  0.000139

[28796 rows x 2 columns]
>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
C

In [5]:
# Now, create an AGDS with the calculated values
import ast
agds_layer = {}

for archetype in tqdm(archetype_list):
    tmp_df = pd.read_csv(f"sequence_files_minsup_4/output_{archetype}.csv", index_col=0)
    tmp_df['pattern'] = tmp_df['pattern'].apply(lambda x: ast.literal_eval(x))
    for _, row in tmp_df.iterrows():
        
        key = tuple(row["pattern"])
        if key not in agds_layer.keys():
            agds_layer[key] = {archetype: row["sup"]}
        else:
            agds_layer[key].update({archetype: row["sup"]})

100%|██████████| 12/12 [00:48<00:00,  4.03s/it]


In [6]:
# Export the model
import pickle
with open("sequence_files_minsup_4/agds_minsup_4.pickle", "wb") as f:
    pickle.dump(agds_layer, f)

In [7]:
# Check the memory usage
import os
from sys import getsizeof
mem_csvs = sum(os.path.getsize(os.path.join('sequence_files_minsup_4', f)) for f in os.listdir('sequence_files_minsup_4') if f.endswith(".csv"))

print("AGDS representation size: ", getsizeof(agds_layer))
print("Matrix representation size: ", mem_csvs)
print("AGDS needs ~", round(mem_csvs/getsizeof(agds_layer), 2), "times less space than standard matrix representation.")

AGDS representation size:  20971608
Matrix representation size:  31407470
AGDS needs ~ 1.5 times less space than standard matrix representation.


In [8]:
# Create class for Tweet end-to-end processing
# From tokenization to class assignment
class SingleTweet(object):
    def __init__(self, text):
        self._raw_text = text
        self._cleaned_text = self._clean_text()
        self._all_terms = self._tokenize_and_permute()
        self._class_description = {
        "artist": 0.0,
        "caregiver": 0.0,
        "everyman": 0.0,
        "explorer": 0.0,
        "guru": 0.0,
        "hero": 0.0,
        "innocent": 0.0,
        "jester": 0.0,
        "magician": 0.0,
        "rebel": 0.0,
        "ruler": 0.0,
        "seducer": 0.0
        } 
        self._key_list = list(self._class_description)
        
    def __str__(self):
        return f"{self._class_description}"
        
    def _clean_text(self):
        return clean_up_text(self._raw_text)
        
    def _tokenize_and_permute(self):
        terms = []
        # Tokenize the Tweet
        words = nltk.word_tokenize(self._cleaned_text)
        terms = np.unique(words).tolist()
        
        # Get all two-word and three-word phrases
        terms.append(itertools.permutations(words, 2))
        terms.append(itertools.permutations(words, 3))
        return terms
    
    def _softmax(self, x):
        return np.exp(x) / np.sum(np.exp(x), axis=0)
    
    def classify_tweet(self, structure):
        for term in self._all_terms:
            tuple_term = tuple(term)
            if tuple_term in structure.keys():
                for k, v in structure[tuple_term].items():
                    self._class_description[k] += v
        val_list = list(self._class_description.values())
        outs = self._softmax(val_list)
        
        return self._key_list[np.argmax(outs)]

In [9]:
# Testing the implementation on Twitter set (accuracy can be verified)
twitter_df = pd.read_csv("tweets_06_03_2021.csv", index_col=0)

In [11]:
# Check the AGDS accuracy
import operator
import numpy as np

acc = 0
total_cnt = len(twitter_df)
dataset = tqdm(twitter_df.iterrows())

for _, row in dataset:
    result = SingleTweet(row.tweet_text).classify_tweet(agds_layer)
    if result == row.archetype:
        acc += 1
    dataset.set_description(f"Current accuracy: {acc / total_cnt}")

# Show the real accuracy
print(f"Accuracy of AGDS: {acc / len(twitter_df)}")

Current accuracy: 0.12117044006125462: : 109053it [07:14, 250.98it/s]

Accuracy of AGDS: 0.12117044006125462



