In [2]:
# Different approach - use only document frequency as importance coefficient
# SPMF RETURS DOCUMENT FREQUENCY!!!!!!!!!
from spmf import Spmf
import pandas as pd
from text_cleaner import *
from tqdm import tqdm
import itertools

archetype_list = ['artist',
                 'caregiver',
                 'everyman',
                 'explorer',
                 'guru',
                 'hero',
                 'innocent',
                 'jester',
                 'magician',
                 'rebel',
                 'ruler',
                 'seducer']

In [3]:
# Load the Twitter dataset
twitter_df = pd.read_csv('tweets_06_03_2021.csv', index_col=0)

# Print the head of the loaded dataset
twitter_df.head()

# Clean-up the texts
twitter_df["cleaned_text"] = twitter_df["tweet_text"].apply(lambda x: clean_up_text(x))

# Tokenize the cleaned texts
twitter_df["cleaned_text"] = twitter_df["cleaned_text"].apply(lambda x: nltk.word_tokenize(x))

# Remove the stopwords
twitter_df["cleaned_text"] = twitter_df["cleaned_text"].apply(lambda x: remove_stopwords(x))

# Drop the rows with empty 'cleaned_text' field
twitter_df = twitter_df.drop(twitter_df[twitter_df['cleaned_text'].map(len) < 1].index)

# Print the new head of the dataset
twitter_df.head()

Unnamed: 0,_id,tweet_text,username,created_at,timestamp,archetype,cleaned_text
0,5f9f1c36b38e10f823bf2cdc,"@AndruEdwards The hard work has paid off, this...",LEGO_Group,2020-11-01 19:32:05.000,,artist,"[hard, work, paid, awesome]"
1,5f9f1c36b38e10f823bf2cdd,@soosupersam A great way to surprise your love...,LEGO_Group,2020-11-01 19:09:40.000,,artist,"[great, way, surprise, loved, one]"
2,5f9f1c36b38e10f823bf2cde,"You can now just bring the fun home, and reliv...",LEGO_Group,2020-11-01 14:00:36.000,,artist,"[bring, fun, home, relive, favorite, childhood..."
3,5f9f1c36b38e10f823bf2cdf,@at_knb Happy birthday to the master builder! ...,LEGO_Group,2020-10-31 17:16:57.000,,artist,"[happy, birthday, master, builder, hope, magic..."
6,5f9f1c36b38e10f823bf2ce2,@Ranchie This is the way! 😀,LEGO_Group,2020-10-31 15:16:26.000,,artist,[way]


In [4]:
# Example 1 - try to get the most occurring words in the 'artist' archetype subset
# Extract all the tweets for the 'artist' archetype
artist_df = twitter_df.cleaned_text[twitter_df["archetype"] == "artist"]

# Reset the index of the subset
artist_df = artist_df.reset_index(drop=True)

# Print the head of the subset
artist_df.head()

0                          [hard, work, paid, awesome]
1                   [great, way, surprise, loved, one]
2    [bring, fun, home, relive, favorite, childhood...
3    [happy, birthday, master, builder, hope, magic...
4                                                [way]
Name: cleaned_text, dtype: object

In [5]:
# Convert the word lists to full sentences, detokenization
artist_df = pd.concat([artist_df, artist_df.apply(lambda x: " ".join(x))], axis=1)
artist_df.columns.values[1] = "full_sentence"

# Drop duplicates
artist_df.sort_values("full_sentence", inplace = True) 
artist_df.drop_duplicates(subset="full_sentence", keep=False, inplace=True)

In [254]:
# Archetype vocabulary - get the vocabulary based on SPMF results (it' fast)
text_list = artist_df["full_sentence"].tolist()

spmf = Spmf("PrefixSpan", input_direct=text_list,
                output_filename=f"sequence_files_for_tfidf/output_artist.txt", arguments=[0.0007, 3], input_type="text")
spmf.run()

spmf = spmf.to_pandas_dataframe()

spmf["sup"] = spmf["sup"] / len(spmf)
print(spmf)
spmf.to_csv(f"sequence_files/output_artist.csv")

>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 235 ms
 Frequent sequences count : 28796
 Max memory (mb) : 82.09123229980469
 minsup = 4 sequences.
 Pattern count : 28796

Post-processing to show result in terms of string values.
Post-processing completed.

                         pattern       sup
0                         [able]  0.008091
1                   [able, able]  0.000729
2           [able, able, advise]  0.000139
3             [able, able, look]  0.000139
4      [able, able, information]  0.000139
...                          ...       ...
28791                    [grabs]  0.000208
28792             [grabs, today]  0.000174
28793                 [giveaway]  0.000139
28794                  [surreal]  0.000139
28795                  [venture]  0.000139

[28796 rows x 2 columns]


In [116]:
phrase_list = []
for _, row in spmf.iterrows():
    phrase_list.append(tuple(row["pattern"]))

In [255]:
# Calculate Bag of Words

from collections import Counter
from tqdm import tqdm
import nltk

# Get term sequency per sentence
def get_bow(sen, vocab):

    vector = [0] * len(vocab)
    tokenized_sentence = nltk.word_tokenize(sen)
    combined_sentence = list(itertools.chain.from_iterable([itertools.combinations(tokenized_sentence, 1),
                                                   itertools.combinations(tokenized_sentence, 2),
                                                   itertools.combinations(tokenized_sentence, 3)]))
    for el in combined_sentence:
        if el in vocab:
            cnt = combined_sentence.count(el)
            idx = vocab.index(el)
            vector[idx] = cnt
    return vector

sentence_vectors = []
for sentence in tqdm(text_list):
    sent_vec = get_bow
    sentence_vectors.append(get_bow(sentence, phrase_list))

  0%|          | 0/5612 [00:00<?, ?it/s]


TypeError: 'str' object is not callable

In [211]:
# Save BoW to file
sentence_vectors = np.asarray(sentence_vectors)
pd.DataFrame(sentence_vectors).to_csv("artist_bow.csv")

In [212]:
# Create TF-IDF-like metric out of Bag-of-Words
artist_tfidf = pd.DataFrame(sentence_vectors)

In [213]:
artist_tfidf.columns = phrase_list
artist_tfidf.head()

Unnamed: 0,"(able,)","(able, able)","(able, able, advise)","(able, able, look)","(able, able, information)","(able, able, provide)","(able, add)","(able, printer)","(able, still)","(able, working)",...,"(avez, vous)","(sit,)","(wonderland,)","(splatoween,)","(spotted,)","(grabs,)","(grabs, today)","(giveaway,)","(surreal,)","(venture,)"
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [91]:
# Get sum of all nonzeros in all columns
term_freq = artist_tfidf.astype(int).sum(axis=0)
doc_freq = artist_tfidf.astype(bool).sum(axis=0)

In [92]:
cols = artist_tfidf.columns
print(f"DF for first term: {np.unique(doc_freq[cols[0]])}")

DF for first term: [5379]


In [93]:
print(np.unique(term_freq[cols[0]]))

[5379]


In [94]:
# Combine tf and doc frequencies to get our metrics
import math
metric = lambda tf, docf: (float(tf / len(cols)) * math.log(len(artist_df) / (docf + 1)))
metric_df = term_freq.combine(doc_freq, metric)

In [95]:
metric_df.head()

(able,)                      0.007886
(able, able)                 0.000693
(able, able, advise)         0.000243
(able, able, look)           0.000243
(able, able, information)    0.000243
dtype: float64

In [244]:
# CountVectorizer - get the term frequency
from sklearn.feature_extraction.text import CountVectorizer

class DenseCount(CountVectorizer):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)

    def transform(self, x, y=None) -> pd.DataFrame:
        res = super().transform(x)
        df = pd.DataFrame(res.toarray(), columns=self.get_feature_names())
        return df

    def fit_transform(self, x, y=None) -> pd.DataFrame:
        # run sklearn's fit_transform
        res = super().fit_transform(x, y=y)
        # convert the returned sparse documents-terms matrix into a dataframe to further manipulations
        df = pd.DataFrame(res.toarray(), columns=self.get_feature_names())
        return df

In [252]:
phrase_v2 = [' '.join(el).lstrip(' ').rstrip(' ') for el in phrase_list]
tf_docs_terms = DenseCount(vocabulary=phrase_v2).fit_transform(text_list)

In [253]:
tf_docs_terms.head()

Unnamed: 0,able,able able,able able advise,able able look,able able information,able able provide,able add,able printer,able still,able working,...,avez vous,sit,wonderland,splatoween,spotted,grabs,grabs today,giveaway,surreal,venture
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [125]:
sample_text = text_list[0]
print(sample_text)

able add printer mac still working advise calling support team mon fri excluding bank holidays would able look


In [127]:
sample_text.split(" ").count(('able', 'able'))

0

In [202]:
# Get term sequency per sentence
def get_bow(sen, vocab):

    vector = [0] * len(vocab)
    tokenized_sentence = nltk.word_tokenize(sen)
    combined_sentence = list(itertools.chain.from_iterable([itertools.combinations(tokenized_sentence, 1),
                                                   itertools.combinations(tokenized_sentence, 2),
                                                   itertools.combinations(tokenized_sentence, 3)]))
    for el in combined_sentence:
        if el in vocab:
            cnt = combined_sentence.count(el)
            idx = vocab.index(el)
            vector[idx] += cnt
    return vector

sentence_vectors = []
for sentence in tqdm(text_list):
    sent_vec = get_bow
    sentence_vectors.append(get_bow(sentence, phrase_list))

In [203]:
pats = spmf_test["pattern"].tolist()
pats = [tuple(pat) for pat in pats]
print
ress = get_bow('aa bb cc aa', pats)

[('aa',), ('aa', 'aa'), ('aa', 'bb'), ('aa', 'bb', 'aa'), ('aa', 'bb', 'cc'), ('aa', 'cc'), ('aa', 'cc', 'aa'), ('bb',), ('bb', 'aa'), ('bb', 'cc'), ('bb', 'cc', 'aa'), ('cc',), ('cc', 'aa')]
13
13
Count of ('aa',), with index 0: 2
Count of ('bb',), with index 7: 1
Count of ('cc',), with index 11: 1
Count of ('aa',), with index 0: 2
Count of ('aa', 'bb'), with index 2: 1
Count of ('aa', 'cc'), with index 5: 1
Count of ('aa', 'aa'), with index 1: 1
Count of ('bb', 'cc'), with index 9: 1
Count of ('bb', 'aa'), with index 8: 1
Count of ('cc', 'aa'), with index 12: 1
Count of ('aa', 'bb', 'cc'), with index 4: 1
Count of ('aa', 'bb', 'aa'), with index 3: 1
Count of ('aa', 'cc', 'aa'), with index 6: 1
Count of ('bb', 'cc', 'aa'), with index 10: 1


In [180]:
print(pats)
print(ress)

[('aa',), ('aa', 'aa'), ('aa', 'bb'), ('aa', 'bb', 'aa'), ('aa', 'bb', 'cc'), ('aa', 'cc'), ('aa', 'cc', 'aa'), ('bb',), ('bb', 'aa'), ('bb', 'cc'), ('bb', 'cc', 'aa'), ('cc',), ('cc', 'aa')]
   (aa,)  (aa, aa)  (aa, bb)  (aa, bb, aa)  (aa, bb, cc)  (aa, cc)  \
0      0         0         0             0             0         0   

   (aa, cc, aa)  (bb,)  (bb, aa)  (bb, cc)  (bb, cc, aa)  (cc,)  (cc, aa)  
0             0      0         0         0             0      0         0  


In [267]:
print(phrase_list[:500:100])

[('able',), ('able', 'us', 'software'), ('able', 'back'), ('printer', 'holidays'), ('printer', 'information')]


In [266]:
print(text_list[:50:10])

['able add printer mac still working advise calling support team mon fri excluding bank holidays would able look', 'absolutely one cat coyote peterson', 'accurate customs checks cause delays also causing issues expected delivery dates changing', 'add super mario flair fridge desk supermario dworld bowsersfury magnet set available platinum points shipping costs mynintendo reward get', 'additional information though pass comments team thanks']
