In [2]:
# Try to get most occurring sequences without SPMF
import pandas as pd
from text_cleaner import *
from tqdm import tqdm
import itertools
from spmf import Spmf

archetype_list = ['artist',
                 'caregiver',
                 'everyman',
                 'explorer',
                 'guru',
                 'hero',
                 'innocent',
                 'jester',
                 'magician',
                 'rebel',
                 'ruler',
                 'seducer']

In [3]:
# Load the Twitter dataset
twitter_df = pd.read_csv('tweets_06_03_2021.csv', index_col=0)
print(f"Length before removing duplicates: {len(twitter_df)}")

# Print the head of the loaded dataset
twitter_df.head()

# Drop duplicates
twitter_df.sort_values("tweet_text", inplace = True) 
twitter_df.drop_duplicates(subset="tweet_text", keep=False, inplace=True)
print(f"Length after removing duplicates: {len(twitter_df)}")

Length before removing duplicates: 109053
Length after removing duplicates: 81019


In [4]:
# Clean-up the texts
twitter_df["cleaned_text"] = twitter_df["tweet_text"].apply(lambda x: clean_up_text(x))

# Tokenize the cleaned texts
twitter_df["cleaned_text"] = twitter_df["cleaned_text"].apply(lambda x: nltk.word_tokenize(x))

# Remove the stopwords
twitter_df["cleaned_text"] = twitter_df["cleaned_text"].apply(lambda x: remove_stopwords(x))

# Drop the rows with empty 'cleaned_text' field
twitter_df = twitter_df.drop(twitter_df[twitter_df['cleaned_text'].map(len) < 1].index)

# Print the new head of the dataset
twitter_df.head()

Unnamed: 0,_id,tweet_text,username,created_at,timestamp,archetype,cleaned_text
68217,5fcaa649345223d8055d9d1d,"""#Amazon is using its immense power &amp; reso...",ProgIntl,2020-12-01 10:14:58.000,,jester,"[amazon, using, immense, power, amp, resources..."
37901,6026de945dc094fc123d13a7,"""...we went on an 8-month road trip all across...",subaru_usa,2021-02-08 23:29:02.000,,explorer,"[went, month, road, trip, across, country, vis..."
30830,5fc1581614aa7245901a9ce3,"""3.. 2.. 1.. 0.. and liftoff of Sentinel-6 Mic...",NASA,2020-11-21 17:19:21.000,,explorer,"[liftoff, sentinel, michael, freilich, continu..."
38240,603018a841c201da0e67e5a1,"""@NASAPersevere is still in space right now, a...",NASA,2021-02-18 19:24:45.000,,explorer,"[still, space, right, miles, mars, far, health..."
36002,6013cc55a147999037fb2ab1,"""A Beagle mix called Lisa has brought me compa...",PBS,2021-01-25 18:00:29.000,,explorer,"[beagle, mix, called, lisa, brought, companion..."


In [5]:
# Example 1 - try to get the most occurring words in the 'artist' archetype subset
# Extract all the tweets for the 'artist' archetype
artist_df = twitter_df.cleaned_text[twitter_df["archetype"] == "artist"]

# Reset the index of the subset
artist_df = artist_df.reset_index(drop=True)

# Print the head of the subset
artist_df.head()

0    [banda, music, restrictions, go, direction, us...
1    [black, creatives, everywhere, every, form, cr...
2    [creativity, uniquely, human, trait, algorithm...
3    [think, someone, else, would, think, would, sa...
4    [good, see, cloud, one, winged, angel, sephiro...
Name: cleaned_text, dtype: object

In [6]:
# Create a list of sentences
artist_list = [" ".join(row) for row in artist_df.tolist()]

# SPMF - get the most frequent sequences
spmf = Spmf("PrefixSpan", input_direct=artist_list,
            output_filename="output.txt", arguments=[0.00005, 3], input_type="text")
spmf.run()
print(spmf.to_pandas_dataframe(pickle=True))
spmf.to_csv("output.csv")

>/mnt/HDD_Linux/Praca_magisterska/jupyter_notebooks/data_mining/spmf.jar
Converting TEXT to SPMF format.
Conversion completed.
 Total time ~ 1052 ms
 Frequent sequences count : 2918736
 Max memory (mb) : 188.3369140625
 minsup = 1 sequences.
 Pattern count : 2918736

Post-processing to show result in terms of string values.
Post-processing completed.

                              pattern  sup
0                             [banda]    1
1                      [banda, music]    1
2        [banda, music, restrictions]    1
3                  [banda, music, go]    1
4           [banda, music, direction]    1
...                               ...  ...
2918731     [magnet, shipping, costs]    1
2918732       [magnet, shipping, get]    1
2918733    [magnet, shipping, reward]    1
2918734              [magnet, reward]    1
2918735         [magnet, reward, get]    1

[2918736 rows x 2 columns]


In [21]:
spmf_df = spmf.to_pandas_dataframe()
import numpy as np

three_w = spmf_df.loc[np.array(list(map(len,spmf_df.pattern.values))) == 3]
print(three_w["sup"].sum())

two_w = spmf_df.loc[np.array(list(map(len,spmf_df.pattern.values))) == 2]
print(two_w["sup"].sum())

one_w = spmf_df.loc[np.array(list(map(len,spmf_df.pattern.values))) == 1]
print(one_w["sup"].sum())

407119
257030
115500


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
spmf_df["pattern"] = spmf_df["pattern"].apply(lambda x: tuple(x))

vectorizer = TfidfVectorizer(vocabulary=spmf_df["pattern"])

In [34]:
transformed_list = vectorizer.fit_transform(artist_list)

In [35]:
print(transformed_list)


