In [15]:
# AGDS - version 1

# Dependencies
import yaml
import nltk
import gensim
from pymongo import MongoClient
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import pandas as pd
np.random.seed(400)

In [16]:
### FUNCTION DEFINITIONS

# Create a stemmer
stemmer = SnowballStemmer("english")

# Functions for stemming and lemmatization
def stem_and_lemmatize(text:str) -> str:
    """Stems and lemmatizes a given text."""
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess_texts(text_list: pd.DataFrame) -> pd.DataFrame:
    """Processes text to remove all unwanted words and symbols."""

    # Lowercase the tweets
    text_list['processed_tweet'] = text_list['tweet_text'].str.lower()

    # Regex patterns
    url_pattern        = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    user_pattern       = '@[^\s]+'
    alpha_pattern      = "[^a-zA-Z]"
    sequence_pattern   = r"(.)\1\1+"
    seq_replace_pattern = r"\1\1"

    # Remove URLs from the tweet text
    text_list['processed_tweet'] = [re.sub(url_pattern, ' ', str(x))
                                    for x in text_list['processed_tweet']]
    # Remove username from the tweet text
    text_list['processed_tweet'] = [re.sub(user_pattern, ' ', str(x))
                                    for x in text_list['processed_tweet']]
    # Remove all non-alphanumeric symbols
    text_list['processed_tweet'] = [re.sub(alpha_pattern, ' ', str(x))
                                    for x in text_list['processed_tweet']]
    # Replace all 3 or more consecutive letters with 2 letters
    text_list['processed_tweet'] = [re.sub(sequence_pattern, seq_replace_pattern, str(x))
                                    for x in text_list['processed_tweet']]

    
    full_tweet_list = []
    for x in text_list['processed_tweet']:
        full_tweet = ''
        for word in x.split():
            word = stem_and_lemmatize(word)
            full_tweet += (word + ' ')
        full_tweet_list.append(full_tweet)

    text_list['processed_tweet'] = full_tweet_list

    return text_list

def preprocess_single_tweet(text: str) -> list:
    # Lowercase the tweets
    lc_text = text.lower()

    # Regex patterns
    url_pattern        = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    user_pattern       = '@[^\s]+'
    alpha_pattern      = "[^a-zA-Z]"
    sequence_pattern   = r"(.)\1\1+"
    seq_replace_pattern = r"\1\1"

    # Remove URLs from the tweet text
    lc_text = re.sub(url_pattern, ' ', lc_text)

    # Remove username from the tweet text
    lc_text = re.sub(user_pattern, ' ', lc_text)

    # Remove all non-alphanumeric symbols
    lc_text = re.sub(alpha_pattern, ' ', lc_text)

    # Replace all 3 or more consecutive letters with 2 letters
    lc_text = re.sub(sequence_pattern, seq_replace_pattern, lc_text)


    
    processed_text = []
    for word in lc_text.split():
        if word not in gensim.parsing.preprocessing.STOPWORDS and len(word) > 3:
            word = stem_and_lemmatize(word)
            processed_text.append(word)

    return processed_text


# DB connector
def mongo_connect(server_name: str) -> MongoClient:
    """Creates connection to the MongoDB database with given server name."""
    client = MongoClient(server_name)
    db = client.twitter_db
    return db

# Credential loader
def load_db_credentials(file_path: str) -> (str, str):
    """Loads username and password from YAML file."""
    with open(file_path) as f:
        key_data = yaml.safe_load(f)
        username = key_data['mongo-db']['username']
        passwd = key_data['mongo-db']['passwd']
    return (username, passwd)


In [17]:
# Import NLTK and download wordnet
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/maelstro/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
# Extract data from MongoDB
# Load credentials
username, passwd = load_db_credentials('read_only.yaml')

# Connect user to MongoDB database
db = mongo_connect(f"mongodb+srv://{username}:{passwd}@tweetdb.kpcmn.mongodb.net/<dbname>?retryWrites=true&w=majority")

# Dataframe for all Tweets
df_tweets = pd.DataFrame(columns=['_id',
                                  'tweet_text',
                                  'username',
                                  'created_at'])

# List of archetypes
#TODO: Migrate list to single file
archetype_list = ['artist',
                 'caregiver',
                 'everyman',
                 'explorer',
                 'guru',
                 'hero',
                 'innocent',
                 'jester',
                 'magician',
                 'rebel',
                 'ruler',
                 'seducer']

# Get all tweets from the database
for archetype in archetype_list:
    # Create a cursor for acquiring all posts from the collection
    cursor = db[archetype].find()
    
    df_archetype = pd.DataFrame(list(cursor))
    df_archetype['archetype'] = archetype
    df_tweets = df_tweets.append(df_archetype, ignore_index=True)

print(df_tweets)

                            _id  \
0      5f9f1c36b38e10f823bf2cdc   
1      5f9f1c36b38e10f823bf2cdd   
2      5f9f1c36b38e10f823bf2cde   
3      5f9f1c36b38e10f823bf2cdf   
4      5f9f1c36b38e10f823bf2ce0   
...                         ...   
41109  5fd4fa09c4b04e4a1cf8c6a5   
41110  5fd4fa09c4b04e4a1cf8c6a6   
41111  5fd4fa09c4b04e4a1cf8c6a7   
41112  5fd4fa09c4b04e4a1cf8c6a8   
41113  5fd4fa09c4b04e4a1cf8c6a9   

                                              tweet_text      username  \
0      @AndruEdwards The hard work has paid off, this...    LEGO_Group   
1      @soosupersam A great way to surprise your love...    LEGO_Group   
2      You can now just bring the fun home, and reliv...    LEGO_Group   
3      @at_knb Happy birthday to the master builder! ...    LEGO_Group   
4                                          @dizunatsu 😀😀    LEGO_Group   
...                                                  ...           ...   
41109  Join me in making the holidays a little bit sw...  Sof

In [20]:
# Preprocess texts
texts = df_tweets['tweet_text']

processed_texts = []
for tweet in texts:
    processed_texts.append(preprocess_single_tweet(tweet))


In [21]:
df_tweets = df_tweets.assign(processed_tweet = processed_texts)

print(df_tweets.head())

                        _id  \
0  5f9f1c36b38e10f823bf2cdc   
1  5f9f1c36b38e10f823bf2cdd   
2  5f9f1c36b38e10f823bf2cde   
3  5f9f1c36b38e10f823bf2cdf   
4  5f9f1c36b38e10f823bf2ce0   

                                          tweet_text    username  \
0  @AndruEdwards The hard work has paid off, this...  LEGO_Group   
1  @soosupersam A great way to surprise your love...  LEGO_Group   
2  You can now just bring the fun home, and reliv...  LEGO_Group   
3  @at_knb Happy birthday to the master builder! ...  LEGO_Group   
4                                      @dizunatsu 😀😀  LEGO_Group   

           created_at timestamp archetype  \
0 2020-11-01 19:32:05       NaT    artist   
1 2020-11-01 19:09:40       NaT    artist   
2 2020-11-01 14:00:36       NaT    artist   
3 2020-10-31 17:16:57       NaT    artist   
4 2020-10-31 15:18:50       NaT    artist   

                                     processed_tweet  
0                          [hard, work, pay, awesom]  
1                      

In [22]:
AGDS_word_to_tweet = {}
AGDS_tweet_to_archetype = {}

for row in df_tweets.itertuples(index=True):
    # Tweet to archetype layer 
    if row.archetype not in AGDS_tweet_to_archetype:
        AGDS_tweet_to_archetype[archetype] = [int(row.Index)]
    else:
        AGDS_tweet_to_archetype[archetype].append(int(row.Index))

    # Word to tweet layer
    for word in row.processed_tweet:
        if word not in AGDS_word_to_tweet:
            AGDS_word_to_tweet[word] = [int(row.Index)]
        else:
            AGDS_word_to_tweet[word].append(int(row.Index))

In [23]:
from sys import getsizeof
print("AGDS representation size: ", getsizeof(AGDS_word_to_tweet))
print("Matrix representation size: ", getsizeof(df_tweets))
print("AGDS needs ~", round(getsizeof(df_tweets)/getsizeof(AGDS_word_to_tweet), 2), "times less space than standard matrix representation.")

AGDS representation size:  589920
Matrix representation size:  26591793
AGDS needs ~ 45.08 times less space than standard matrix representation.


In [24]:
print(AGDS_tweet_to_archetype)
print(AGDS_word_to_tweet)

9810, 39812, 40275, 40561], 'puglian': [39651, 39807, 39812, 39813, 39814, 39817, 39820, 40045, 40287, 40288, 40289, 40291, 40550], 'laurissa': [39657, 39891], 'strappi': [39659, 39893, 40359], 'bras': [39660, 39873, 39894, 40308, 40323, 40359, 40359, 40646, 40661, 40881], 'kailey': [39662, 39896], 'kayden': [39667, 39901], 'jalen': [39673, 39907], 'gourmand': [39675], 'carolinian': [39678], 'botan': [39680], 'asha': [39681], 'aayush': [39682], 'foldov': [39685], 'flare': [39685], 'succul': [39691], 'unto': [39693], 'inga': [39697], 'orchid': [39707], 'orri': [39707], 'patchouli': [39707], 'earthi': [39707], 'panti': [39714, 39720, 39722, 39723, 40100, 40103, 40103], 'tasha': [39719], 'agen': [39736], 'daz': [39736], 'welcomebackimola': [39739], 'kimi': [39739], 'imolagp': [39739], 'kimir': [39740], 'alfaromeo': [39740, 39932, 40158, 40405, 40682, 40988, 40989, 40990], 'myalfa': [39741, 39938, 40162, 40410, 40690, 40996], 'nescaf': [39746, 39946, 39949, 40168, 40170, 40171, 40425, 4042

#### Defined AGDS
Now we have 2 layer AGDS structure - one assigns word input to tweet, the second one assigns tweet to archetype

In [75]:
import itertools
import operator

def get_jaccard_similarity(text_A, text_B) -> float:
    intersection = len(list(set(text_A).intersection(text_B)))
    union = (len(text_A) + len(text_B)) - intersection
    return float(intersection) / union

def find_nearest_neighbors(row, k):
    ### PART 1 - associate words with Tweets and get the Jaccard similarity
    similar_tweets = []

    for word in row:
        try:
            similar_tweets.append(AGDS_word_to_tweet[word])
        except:
            pass
    
    similar_tweets = set(itertools.chain.from_iterable(similar_tweets))

    tweet_similarity = []
    for s_tweet in similar_tweets:
        tweet_similarity.append((s_tweet, get_jaccard_similarity(row, df_tweets.iloc[s_tweet].processed_tweet)))
    tweet_similarity = sorted(tweet_similarity, key=lambda x: x[1])
    tweet_similarity = tweet_similarity[-k:]

    ### PART 2 - associate Tweets with archetypes
    similar_archetypes = {}
    similarity_sum = 0

    for pair in tweet_similarity:
        idx, jacc = pair
        similarity_sum += jacc

        if df_tweets.iloc[idx].archetype in similar_archetypes:
            similar_archetypes[df_tweets.iloc[idx].archetype] += jacc
        else:
            similar_archetypes[df_tweets.iloc[idx].archetype] = jacc

    for key in similar_archetypes.keys():
        similar_archetypes[key] /= similarity_sum

    return similar_archetypes

## K-Nearest Neighbors
To check how does the AGDS work with multi-class multi-label classification, I tried to apply K-Nearest Neighbors to my Twitter dataset and look on the results. It's probably a place for an intelligent classificator, ie. neural networks.

In [80]:
print(f'Result of K-NN on the dataset for tweet "{df_tweets.iloc[0].tweet_text}":\n{find_nearest_neighbors(processed_texts[0], 1000)}')

Result of K-NN on the dataset for tweet "@AndruEdwards The hard work has paid off, this is awesome! 😎✔":
{'everyman': 0.08671402460802659, 'jester': 0.07605064013881802, 'guru': 0.061722364705922794, 'rebel': 0.04784560898788217, 'artist': 0.18831259508332182, 'ruler': 0.18688210507737912, 'hero': 0.06611778079677323, 'explorer': 0.0651994549948477, 'caregiver': 0.025277094631883604, 'magician': 0.09511478630744857, 'innocent': 0.08988717515115686, 'seducer': 0.010876369516530914}


In [79]:
# Random Tweet from Twitter, not present in dataset
example_tweet = "You know those moments when friends ask you if you’ve “seen this movie?” and you say “no” and they say “you should watch it!” and you say “ok” and you add it to a list you never get to?"
print(f'Result of K-NN on the dataset for tweet "{example_tweet}":\n{find_nearest_neighbors(preprocess_single_tweet(example_tweet), 100)}')

Result of K-NN on the dataset for tweet "You know those moments when friends ask you if you’ve “seen this movie?” and you say “no” and they say “you should watch it!” and you say “ok” and you add it to a list you never get to?":
{'ruler': 0.14447829173745494, 'artist': 0.17528074360829676, 'magician': 0.10447687495518036, 'guru': 0.05247817378858575, 'everyman': 0.13024277052820685, 'jester': 0.1294366702685274, 'rebel': 0.03906969050809967, 'explorer': 0.07885387897364247, 'innocent': 0.051398487986227225, 'hero': 0.03253713775433381, 'caregiver': 0.01934640623230659, 'seducer': 0.04240087365913861}
