In [1]:
# Weighted AGDS
# Maciej Wójcik

# Dependencies
import yaml
import nltk
import gensim
from pymongo import MongoClient
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import pandas as pd
np.random.seed(400)

### FUNCTION DEFINITIONS

# Create a stemmer
stemmer = SnowballStemmer("english")

# Functions for stemming and lemmatization
def stem_and_lemmatize(text:str) -> str:
    """Stems and lemmatizes a given text."""
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess_texts(text_list: pd.DataFrame) -> pd.DataFrame:
    """Processes text to remove all unwanted words and symbols."""

    # Lowercase the tweets
    text_list['processed_tweet'] = text_list['tweet_text'].str.lower()

    # Regex patterns
    url_pattern        = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    user_pattern       = '@[^\s]+'
    alpha_pattern      = "[^a-zA-Z]"
    sequence_pattern   = r"(.)\1\1+"
    seq_replace_pattern = r"\1\1"

    # Remove URLs from the tweet text
    text_list['processed_tweet'] = [re.sub(url_pattern, ' ', str(x))
                                    for x in text_list['processed_tweet']]
    # Remove username from the tweet text
    text_list['processed_tweet'] = [re.sub(user_pattern, ' ', str(x))
                                    for x in text_list['processed_tweet']]
    # Remove all non-alphanumeric symbols
    text_list['processed_tweet'] = [re.sub(alpha_pattern, ' ', str(x))
                                    for x in text_list['processed_tweet']]
    # Replace all 3 or more consecutive letters with 2 letters
    text_list['processed_tweet'] = [re.sub(sequence_pattern, seq_replace_pattern, str(x))
                                    for x in text_list['processed_tweet']]

    
    full_tweet_list = []
    for x in text_list['processed_tweet']:
        full_tweet = ''
        for word in x.split():
            word = stem_and_lemmatize(word)
            full_tweet += (word + ' ')
        full_tweet_list.append(full_tweet)

    text_list['processed_tweet'] = full_tweet_list

    return text_list

def preprocess_single_tweet(text: str) -> list:
    # Lowercase the tweets
    lc_text = text.lower()

    # Regex patterns
    url_pattern        = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    user_pattern       = '@[^\s]+'
    alpha_pattern      = "[^a-zA-Z]"
    sequence_pattern   = r"(.)\1\1+"
    seq_replace_pattern = r"\1\1"

    # Remove URLs from the tweet text
    lc_text = re.sub(url_pattern, ' ', lc_text)

    # Remove username from the tweet text
    lc_text = re.sub(user_pattern, ' ', lc_text)

    # Remove all non-alphanumeric symbols
    lc_text = re.sub(alpha_pattern, ' ', lc_text)

    # Replace all 3 or more consecutive letters with 2 letters
    lc_text = re.sub(sequence_pattern, seq_replace_pattern, lc_text)

    processed_text = []
    for word in lc_text.split():
        if word not in gensim.parsing.preprocessing.STOPWORDS and len(word) > 2:
            word = stem_and_lemmatize(word)
            processed_text.append(word)
    return processed_text

# DB connector
def mongo_connect(server_name: str) -> MongoClient:
    """Creates connection to the MongoDB database with given server name."""
    client = MongoClient(server_name)
    db = client.twitter_db
    return db

# Credential loader
def load_db_credentials(file_path: str) -> (str, str):
    """Loads username and password from YAML file."""
    with open(file_path) as f:
        key_data = yaml.safe_load(f)
        username = key_data['mongo-db']['username']
        passwd = key_data['mongo-db']['passwd']
    return (username, passwd)

In [2]:
# Extract data from MongoDB
# Load credentials
username, passwd = load_db_credentials('../../auth/read_only.yaml')

# Connect user to MongoDB database
db = mongo_connect(f"mongodb+srv://{username}:{passwd}@tweetdb.kpcmn.mongodb.net/twitter_db?retryWrites=true&w=majority")

# Dataframe for all Tweets
df_tweets = pd.DataFrame(columns=['_id',
                                  'tweet_text',
                                  'username',
                                  'created_at'])

# List of archetypes
#TODO: Migrate list to single file
archetype_list = ['artist',
                 'caregiver',
                 'everyman',
                 'explorer',
                 'guru',
                 'hero',
                 'innocent',
                 'jester',
                 'magician',
                 'rebel',
                 'ruler',
                 'seducer']

# Get all tweets from the database
for archetype in archetype_list:
    # Create a cursor for acquiring all posts from the collection
    cursor = db[archetype].find()
    
    df_archetype = pd.DataFrame(list(cursor))
    df_archetype['archetype'] = archetype
    df_tweets = df_tweets.append(df_archetype, ignore_index=True)

print(df_tweets.head())

                        _id  \
0  5f9f1c36b38e10f823bf2cdc   
1  5f9f1c36b38e10f823bf2cdd   
2  5f9f1c36b38e10f823bf2cde   
3  5f9f1c36b38e10f823bf2cdf   
4  5f9f1c36b38e10f823bf2ce0   

                                          tweet_text    username  \
0  @AndruEdwards The hard work has paid off, this...  LEGO_Group   
1  @soosupersam A great way to surprise your love...  LEGO_Group   
2  You can now just bring the fun home, and reliv...  LEGO_Group   
3  @at_knb Happy birthday to the master builder! ...  LEGO_Group   
4                                      @dizunatsu 😀😀  LEGO_Group   

           created_at timestamp archetype  
0 2020-11-01 19:32:05       NaT    artist  
1 2020-11-01 19:09:40       NaT    artist  
2 2020-11-01 14:00:36       NaT    artist  
3 2020-10-31 17:16:57       NaT    artist  
4 2020-10-31 15:18:50       NaT    artist  


In [3]:
# Preprocess tweets
df_tweets = preprocess_texts(df_tweets)

In [4]:
# Tokenize the words of each Tweet
df_tweets['processed_tweet'] = df_tweets['processed_tweet'].apply(lambda x: nltk.word_tokenize(x))

# Print the processed dataframe
print(df_tweets.head())

                        _id  \
0  5f9f1c36b38e10f823bf2cdc   
1  5f9f1c36b38e10f823bf2cdd   
2  5f9f1c36b38e10f823bf2cde   
3  5f9f1c36b38e10f823bf2cdf   
4  5f9f1c36b38e10f823bf2ce0   

                                          tweet_text    username  \
0  @AndruEdwards The hard work has paid off, this...  LEGO_Group   
1  @soosupersam A great way to surprise your love...  LEGO_Group   
2  You can now just bring the fun home, and reliv...  LEGO_Group   
3  @at_knb Happy birthday to the master builder! ...  LEGO_Group   
4                                      @dizunatsu 😀😀  LEGO_Group   

           created_at timestamp archetype  \
0 2020-11-01 19:32:05       NaT    artist   
1 2020-11-01 19:09:40       NaT    artist   
2 2020-11-01 14:00:36       NaT    artist   
3 2020-10-31 17:16:57       NaT    artist   
4 2020-10-31 15:18:50       NaT    artist   

                                     processed_tweet  
0  [the, hard, work, have, pay, off, this, be, aw...  
1      [a, great, way, 

In [5]:
# Drop tweets that have no words after processing
df_tweets = df_tweets.drop(df_tweets[df_tweets['processed_tweet'].map(len) < 2].index)

In [6]:
# Print the processed dataframe (after dropping empty tweets)
df_tweets = df_tweets.reset_index(drop=True)
print(df_tweets.head())

                        _id  \
0  5f9f1c36b38e10f823bf2cdc   
1  5f9f1c36b38e10f823bf2cdd   
2  5f9f1c36b38e10f823bf2cde   
3  5f9f1c36b38e10f823bf2cdf   
4  5f9f1c36b38e10f823bf2ce2   

                                          tweet_text    username  \
0  @AndruEdwards The hard work has paid off, this...  LEGO_Group   
1  @soosupersam A great way to surprise your love...  LEGO_Group   
2  You can now just bring the fun home, and reliv...  LEGO_Group   
3  @at_knb Happy birthday to the master builder! ...  LEGO_Group   
4                        @Ranchie This is the way! 😀  LEGO_Group   

           created_at timestamp archetype  \
0 2020-11-01 19:32:05       NaT    artist   
1 2020-11-01 19:09:40       NaT    artist   
2 2020-11-01 14:00:36       NaT    artist   
3 2020-10-31 17:16:57       NaT    artist   
4 2020-10-31 15:16:26       NaT    artist   

                                     processed_tweet  
0  [the, hard, work, have, pay, off, this, be, aw...  
1      [a, great, way, 

In [7]:
# TF-IDF Vectorizer - trying to assign weights
from sklearn.feature_extraction.text import TfidfVectorizer

def fcn_stub(stub):
    return stub

vectorizer = TfidfVectorizer(input="content", max_features=None, tokenizer=fcn_stub, preprocessor=fcn_stub)
vectorizer.fit(df_tweets['processed_tweet'])
df_tfidf = vectorizer.transform(df_tweets['processed_tweet'])
df_tfidf = df_tfidf.toarray()



In [8]:
# Get feature names
vocab = vectorizer.get_feature_names()
vocab = np.array(vocab)

df_tfidf = pd.DataFrame(df_tfidf, columns=vocab)
df_tfidf.head()

Unnamed: 0,a,aa,aadarsha,aah,aan,aanhoudt,aankoop,aankoopbewij,aankopen,aarchi,...,zv,zwanesh,zwart,zwei,zwift,zy,zyciora,zyoom,zyra,zz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.177691,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.115838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Create dictionaries - each layer with weighted edges
AGDS_word_tweet = {}
AGDS_tweet_class = {}

# Populate layers
for idx, tweet in df_tweets.iterrows():
    if tweet['archetype'] not in AGDS_tweet_class:
        AGDS_tweet_class[tweet['archetype']] = [idx]
    else:
        AGDS_tweet_class[tweet['archetype']].append(idx)
    
    unique_words = np.unique(tweet['processed_tweet'])
    for word in unique_words:
        if word not in AGDS_word_tweet.keys():
            AGDS_word_tweet[word] = {idx: df_tfidf.iloc[idx][word]}
        else:
            try:
                AGDS_word_tweet[word].update({idx: df_tfidf.iloc[idx][word]})
            except:
                print(f"Error on index {idx}")

In [26]:
from sys import getsizeof
print("AGDS representation size: ", getsizeof(AGDS_word_tweet)+getsizeof(AGDS_tweet_class))
print("Matrix representation size: ", getsizeof(df_tweets['processed_tweet']))
print("AGDS needs ~", round(getsizeof(
    df_tweets['processed_tweet'])/(getsizeof(AGDS_word_tweet)+getsizeof(AGDS_tweet_class)), 2), 
      "times less space than standard matrix representation.")

AGDS representation size:  1311448
Matrix representation size:  19459968
AGDS needs ~ 14.84 times less space than standard matrix representation.


In [34]:
# TF-IDF as a measurement of similarity
def get_similarity(tweet_text):
    # Create dictionary with a sum of weights for every archetype
    archetype_weights = {}
    for arch in archetype_list:
        archetype_weights[arch] = 0.0
    weight_sum = 0.0
    
    # Process and tokenize the text
    processed_text = preprocess_single_tweet(tweet_text)
    
    # Iterate over graph
    for word in processed_text:
        if word in AGDS_word_tweet.keys():
            for k, v in AGDS_word_tweet[word].items():
                for key_d, val_d in AGDS_tweet_class.items():
                    if k in val_d:
                        archetype_weights[key_d] += v
                        weight_sum += v
                        break
    
    # Normalize weights
    for arch in archetype_list:
        archetype_weights[arch] /= weight_sum
    
    import operator
    max_arch = max(archetype_weights.items(), key=operator.itemgetter(1))[0]
    
    return (max_arch, archetype_weights[max_arch])

In [35]:
print(f"Real archetype: {df_tweets.iloc[1]['archetype']} - classification: {get_similarity(df_tweets.iloc[1]['tweet_text'])}")

Real archetype: artist - classification: ('artist', 0.16662292317551186)
