In [1]:
# Creating an AGDS after mining all the data from the current dataset
import pandas as pd
import numpy as np
from tqdm import tqdm
from text_cleaner import *
import nltk
import itertools

In [2]:
# Read the phrase frequency file
freq_df = pd.read_csv("sequence_files_bak/phrase_frequency_with_df.csv", index_col=0)

# Print the head of the DataFrame
freq_df.head()

Unnamed: 0,pattern,artist,caregiver,everyman,explorer,guru,hero,innocent,jester,magician,rebel,ruler,seducer
0,['aaron'],0.0,0.0,0.0,0.009045,0.0,0.00751,0.0,0.0,0.0,0.0,0.0,0.0
1,['abbie'],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041686,0.0,0.0,0.0
2,['ability'],0.0,0.0,0.0,0.012334,0.009126,0.020027,0.0,0.0,0.0,0.011175,0.0,0.0
3,"['able', 'able']",0.000486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"['able', 'access']",0.0,0.0,0.0,0.0,0.0,0.000136,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Create dictionary that links the terms with a given archetype
term_to_archetype = {}

for i, row in freq_df.iterrows():
    
    term_to_archetype[tuple(row["pattern"])] = {
        "artist": row["artist"],
        "caregiver": row["caregiver"],
        "everyman": row["everyman"],
        "explorer": row["explorer"],
        "guru": row["guru"],
        "hero": row["hero"],
        "innocent": row["innocent"],
        "jester": row["jester"],
        "magician": row["magician"],
        "rebel": row["rebel"],
        "ruler": row["ruler"],
        "seducer": row["seducer"]
    } 

In [4]:
# Check if AGDS is more memory-efficient than a standard DataFrame
from sys import getsizeof
print("AGDS representation size: ", getsizeof(term_to_archetype))
print("Matrix representation size: ", getsizeof(freq_df))
print("AGDS needs ~", round(getsizeof(freq_df)/getsizeof(term_to_archetype), 2), "times less space than standard matrix representation.")

AGDS representation size:  10485856
Matrix representation size:  57380352
AGDS needs ~ 5.47 times less space than standard matrix representation.


In [38]:
# Create class for Tweet end-to-end processing
# From tokenization to class assignment
class SingleTweet(object):
    def __init__(self, text):
        self._raw_text = text
        self._cleaned_text = self._clean_text()
        self._all_terms = self._tokenize_and_permute()
        self._class_description = {
        "artist": 0.0,
        "caregiver": 0.0,
        "everyman": 0.0,
        "explorer": 0.0,
        "guru": 0.0,
        "hero": 0.0,
        "innocent": 0.0,
        "jester": 0.0,
        "magician": 0.0,
        "rebel": 0.0,
        "ruler": 0.0,
        "seducer": 0.0
        } 
        self._key_list = list(self._class_description)
        
    def __str__(self):
        return f"{self._class_description}"
        
    def _clean_text(self):
        return clean_up_text(self._raw_text)
        
    def _tokenize_and_permute(self):
        terms = []
        # Tokenize the Tweet
        words = nltk.word_tokenize(self._cleaned_text)
        terms = np.unique(words).tolist()
        
        # Get all two-word and three-word phrases
        terms.append(itertools.permutations(words, 2))
        terms.append(itertools.permutations(words, 3))
        return terms
    
    def _softmax(self, x):
        return np.exp(x) / np.sum(np.exp(x), axis=0)
    
    def classify_tweet(self, structure):
        for term in self._all_terms:
            tuple_term = tuple(term)
            if tuple_term in structure.keys():
                for k, v in structure[tuple_term]:
                    self._class_description[k] += v
        val_list = list(self._class_description.values())
        outs = self._softmax(val_list)
        
        return self._key_list[np.argmax(outs)]

In [39]:
# Testing the implementation on Twitter set (accuracy can be verified)
twitter_df = pd.read_csv("tweets_06_03_2021.csv", index_col=0)

# Select a sample Tweet
sample_text = twitter_df.iloc[4].tweet_text
print(sample_text)

@dizunatsu 😀😀


In [40]:
# Create instance of SingleTweet
tweet = SingleTweet(sample_text)

# Classify the tweet
res = tweet.classify_tweet(term_to_archetype)
print(res)

TypeError: _softmax() takes 1 positional argument but 2 were given

In [15]:
# Check the AGDS accuracy
import operator

acc = 0
for _, row in tqdm(twitter_df.iterrows()):
    result = SingleTweet(row.tweet_text).classify_tweet(term_to_archetype)
    pred = max(result.items(), key=operator.itemgetter(1))[0]
    if pred == row.archetype:
        acc += 1

# Show the real accuracy
print(f"Accuracy of AGDS: {acc / len(twitter_df)}")

109053it [05:35, 325.33it/s]

Accuracy of AGDS: 0.11295425160243185





In [None]:
# Show the real accuracy
print(f"Accuracy of AGDS: {acc / len(twitter_df)}")