In [1]:
# Twitter topic extraction
# First approach to LDA

# Dependencies
import re
from typing import Tuple

import yaml
import nltk
import gensim
from pymongo import MongoClient
from pymongo.database import Database
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import pandas as pd
np.random.seed(400)

In [2]:
# Download the WordNet corpus used by WordNetLemmatizer in the preprocessing helpers.
# `nltk` is already imported in the first cell, so the repeated import below is a no-op.
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/maelstro/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
### FUNCTION DEFINITIONS

# Shared English Snowball stemmer, reused by every call below
stemmer = SnowballStemmer("english")

# Word normalisation helper
def stem_and_lemmatize(text:str) -> str:
    """Reduce a single word to its stem.

    The word is first lemmatized as a verb (``pos='v'``) with WordNet; the
    resulting lemma is then handed to the module-level Snowball ``stemmer``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess_texts(text_list: pd.DataFrame) -> pd.DataFrame:
    """Add a cleaned and stemmed ``processed_tweet`` column to a tweet frame.

    Each value of ``text_list['tweet_text']`` is lowercased, stripped of URLs
    and @-mentions, reduced to the letters a-z (digits are removed as well),
    has runs of three or more identical characters collapsed to two, and is
    finally split into words that are passed through ``stem_and_lemmatize``.

    Args:
        text_list: DataFrame with a string column named ``tweet_text``.

    Returns:
        The same DataFrame object that was passed in. It is modified in
        place: a ``processed_tweet`` column is added (or overwritten) whose
        values are the processed words, each followed by a single space.

    Note:
        Values are passed through ``str()`` before cleaning, so a missing
        value is processed as its string representation.
    """
    # Regex patterns. All are raw strings so that sequences such as "\s" reach
    # the regex engine untouched instead of being (invalid) string escapes.
    url_pattern = re.compile(r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)")
    user_pattern = re.compile(r"@[^\s]+")
    # NOTE(review): this drops digits too, whereas preprocess_single_tweet
    # keeps them ("[^a-zA-Z0-9]") - confirm which behaviour is intended.
    alpha_pattern = re.compile(r"[^a-zA-Z]")
    sequence_pattern = re.compile(r"(.)\1\1+")
    seq_replace_pattern = r"\1\1"

    def _process(raw_value) -> str:
        """Clean one lowercased tweet and return its stemmed words."""
        text = str(raw_value)
        text = url_pattern.sub(' ', text)                        # remove URLs
        text = user_pattern.sub(' ', text)                       # remove @usernames
        text = alpha_pattern.sub(' ', text)                      # keep letters only
        text = sequence_pattern.sub(seq_replace_pattern, text)   # "cooool" -> "cool"
        # Every word is followed by one space (trailing space included), which
        # is the format callers have always received from this function.
        return ''.join(stem_and_lemmatize(word) + ' ' for word in text.split())

    # Lowercase the tweets, then clean and stem them in a single pass
    lowered_tweets = text_list['tweet_text'].str.lower()
    text_list['processed_tweet'] = [_process(tweet) for tweet in lowered_tweets]

    return text_list

def preprocess_single_tweet(text: str) -> list:
    """Turn one raw tweet into a list of stemmed tokens.

    The tweet is lowercased, URLs and @-mentions are removed, every character
    that is not a letter or a digit is replaced by a space, and runs of three
    or more identical characters are collapsed to two. The remaining words are
    kept only if they are longer than three characters and are not gensim
    stop words; each kept word is passed through ``stem_and_lemmatize``.

    Args:
        text: The raw tweet text.

    Returns:
        The processed tokens in their original order. The list is empty when
        no word survives the filtering.
    """
    # Lowercase the tweet
    lc_text = text.lower()

    # Regex patterns. Raw strings keep sequences such as "\s" intact for the
    # regex engine instead of treating them as (invalid) string escapes.
    url_pattern        = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    user_pattern       = r"@[^\s]+"
    alpha_pattern      = r"[^a-zA-Z0-9]"
    sequence_pattern   = r"(.)\1\1+"
    seq_replace_pattern = r"\1\1"

    # Remove URLs from the tweet text
    lc_text = re.sub(url_pattern, ' ', lc_text)

    # Remove usernames from the tweet text
    lc_text = re.sub(user_pattern, ' ', lc_text)

    # Remove all non-alphanumeric symbols
    lc_text = re.sub(alpha_pattern, ' ', lc_text)

    # Replace 3 or more consecutive identical characters with 2
    lc_text = re.sub(sequence_pattern, seq_replace_pattern, lc_text)

    # Drop short words and stop words, then stem what is left. The cheap
    # length check runs first so the stop-word lookup is skipped for short words.
    return [
        stem_and_lemmatize(word)
        for word in lc_text.split()
        if len(word) > 3 and word not in gensim.parsing.preprocessing.STOPWORDS
    ]


# DB connector
def mongo_connect(server_name: str, db_name: str = "twitter_db") -> Database:
    """Open a MongoDB client and return a handle to one of its databases.

    Args:
        server_name: Connection string passed unchanged to ``MongoClient``.
        db_name: Name of the database to select. Defaults to ``"twitter_db"``,
            the database this notebook reads from.

    Returns:
        The selected database of the new client (not the client itself).
    """
    client = MongoClient(server_name)
    return client[db_name]

# Credential loader
def load_db_credentials(file_path: str) -> Tuple[str, str]:
    """Load the MongoDB username and password from a YAML file.

    The file must contain a top-level ``mongo-db`` mapping with the keys
    ``username`` and ``passwd``.

    Args:
        file_path: Path of the YAML credentials file.

    Returns:
        A ``(username, passwd)`` tuple.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If ``mongo-db``, ``username`` or ``passwd`` is missing.
    """
    # Explicit encoding so the result does not depend on the platform default
    with open(file_path, encoding="utf-8") as f:
        key_data = yaml.safe_load(f)

    mongo_section = key_data['mongo-db']
    return (mongo_section['username'], mongo_section['passwd'])


In [4]:
# Extract data from MongoDB
# Load credentials
username, passwd = load_db_credentials('read_only.yaml')

# Connect user to MongoDB database
# NOTE(review): the credentials are interpolated as-is. If they can contain
# reserved URI characters they must be percent-escaped - confirm how they are
# stored in the YAML file. The "<dbname>" placeholder is also still literal.
db = mongo_connect(f"mongodb+srv://{username}:{passwd}@tweetdb.kpcmn.mongodb.net/<dbname>?retryWrites=true&w=majority")

# Columns that always come first in the combined frame (and always exist,
# even if no collection returned any document)
base_columns = ['_id',
                'tweet_text',
                'username',
                'created_at']

# List of archetypes
#TODO: Migrate list to single file
archetype_list = ['artist',
                 'caregiver',
                 'everyman',
                 'explorer',
                 'guru',
                 'hero',
                 'innocent',
                 'jester',
                 'magician',
                 'rebel',
                 'ruler',
                 'seducer']

# Get all tweets from the database: one frame per archetype collection.
# The frames are collected and concatenated once, because growing a DataFrame
# inside the loop is quadratic and DataFrame.append no longer exists in pandas 2.
archetype_frames = []
for archetype in archetype_list:
    # Create a cursor for acquiring all posts from the collection
    cursor = db[archetype].find()

    df_archetype = pd.DataFrame(list(cursor))
    df_archetype['archetype'] = archetype
    archetype_frames.append(df_archetype)

# Dataframe for all Tweets
df_tweets = pd.concat(archetype_frames, ignore_index=True)
df_tweets = df_tweets.reindex(
    columns=base_columns + [col for col in df_tweets.columns if col not in base_columns]
)

In [5]:
# Tokenise, filter and stem every tweet with the single-tweet pipeline
texts = df_tweets['tweet_text']

processed_texts = [preprocess_single_tweet(tweet) for tweet in texts]

# Peek at the first few token lists
print(processed_texts[:5])

[['hard', 'work', 'pay', 'awesom'], ['great', 'surpris', 'love'], ['bring', 'home', 'reliv', 'favorit', 'childhood', 'memori', 'sesam', 'street'], ['happi', 'birthday', 'master', 'builder', 'hope', 'magic'], []]


In [6]:
# Build the gensim Dictionary (token <-> integer id mapping) from the tokenised tweets
dictionary = gensim.corpora.Dictionary(processed_texts)

# Convert every tokenised tweet into its bag-of-words form using that dictionary
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_texts]

In [7]:
# Fit an LDA topic model on the bag-of-words corpus
lda_model = gensim.models.LdaMulticore(
    corpus=bow_corpus,
    num_topics=12,
    id2word=dictionary,
    passes=10,
    workers=2,
)

# Print the words occurring in each topic
for idx, topic in lda_model.print_topics(-1):
    print(f'Topic: {idx}, \nWords: {topic}\n')

Topic: 0, 
Words: 0.028*"right" + 0.018*"peopl" + 0.015*"0800" + 0.013*"work" + 0.012*"human" + 0.009*"covid19" + 0.009*"world" + 0.009*"worker" + 0.008*"global" + 0.008*"chang"

Topic: 1, 
Words: 0.026*"great" + 0.020*"thanksgiv" + 0.016*"friend" + 0.014*"today" + 0.014*"game" + 0.013*"watch" + 0.013*"happi" + 0.011*"beauti" + 0.011*"discov" + 0.011*"famili"

Topic: 2, 
Words: 0.049*"sorri" + 0.044*"order" + 0.043*"send" + 0.039*"look" + 0.038*"thank" + 0.038*"address" + 0.035*"email" + 0.033*"hear" + 0.030*"number" + 0.026*"help"

Topic: 3, 
Words: 0.086*"thank" + 0.051*"team" + 0.048*"share" + 0.046*"know" + 0.021*"help" + 0.020*"custom" + 0.018*"love" + 0.018*"sure" + 0.017*"reach" + 0.015*"care"

Topic: 4, 
Words: 0.014*"merced" + 0.011*"pack" + 0.009*"class" + 0.009*"insid" + 0.009*"chang" + 0.008*"level" + 0.008*"photo" + 0.008*"maybach" + 0.006*"learn" + 0.006*"congrat"

Topic: 5, 
Words: 0.025*"book" + 0.021*"thing" + 0.017*"jone" + 0.013*"leav" + 0.011*"tiffani" + 0.010*"dior

In [13]:
%matplotlib inline
import pyLDAvis
import pyLDAvis.gensim
# Prepare the pyLDAvis data for the fitted model, corpus and dictionary
# NOTE(review): newer pyLDAvis releases expose this module as `pyLDAvis.gensim_models` - confirm against the installed version
vis = pyLDAvis.gensim.prepare(topic_model=lda_model, corpus=bow_corpus, dictionary=dictionary)
# Switch on notebook rendering, then show the visualisation as the cell output
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

  and should_run_async(code)


In [17]:
# Test predictions

# Score one tweet from the training corpus (index 1) against the fitted topics
bow_vector = bow_corpus[1]
print(df_tweets['tweet_text'][1])

# Highest-scoring topic first; the sort is stable, so ties keep the model's order
ranked_topics = sorted(lda_model[bow_vector], key=lambda tup: tup[1], reverse=True)
for index, score in ranked_topics:
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

@soosupersam A great way to surprise your loved one! 🎁🥰
Score: 0.7708227038383484	 Topic: 0.056*"love" + 0.033*"like" + 0.032*"good" + 0.020*"enjoy" + 0.019*"glad"
Score: 0.020836254581809044	 Topic: 0.026*"great" + 0.020*"thanksgiv" + 0.016*"friend" + 0.014*"today" + 0.014*"game"
Score: 0.020834656432271004	 Topic: 0.025*"book" + 0.021*"thing" + 0.017*"jone" + 0.013*"leav" + 0.011*"tiffani"
Score: 0.020834222435951233	 Topic: 0.086*"thank" + 0.051*"team" + 0.048*"share" + 0.046*"know" + 0.021*"help"
Score: 0.020834054797887802	 Topic: 0.036*"free" + 0.026*"happi" + 0.024*"feel" + 0.024*"need" + 0.021*"help"
Score: 0.020834054797887802	 Topic: 0.015*"year" + 0.013*"time" + 0.013*"launch" + 0.012*"learn" + 0.009*"make"
Score: 0.02083403244614601	 Topic: 0.040*"holiday" + 0.027*"shop" + 0.026*"season" + 0.020*"gift" + 0.016*"chanel"
Score: 0.020834024995565414	 Topic: 0.013*"elect" + 0.012*"join" + 0.011*"presid" + 0.011*"decemb" + 0.010*"say"
Score: 0.020834019407629967	 Topic: 0.049*"s

  and should_run_async(code)


In [18]:
# Topic distribution of every document in the corpus
topics = [lda_model[doc_bow] for doc_bow in bow_corpus]

  and should_run_async(code)
  self._context.run(self._callback, *self._args)


In [23]:
# Formatted top-20 words of every topic, indexed by topic id
topics = lda_model.show_topics(formatted=True, num_topics=12, num_words=20)

# One row per topic assigned to document 1: id, rounded weight, topic words
doc_topic_rows = [
    (topic_id, round(weight, 2), topics[topic_id][1])
    for topic_id, weight in lda_model[bow_corpus[1]]
]
pd.DataFrame(doc_topic_rows, columns=['topic #', 'weight', 'words in topic'])

  and should_run_async(code)
  self._context.run(self._callback, *self._args)


Unnamed: 0,topic #,weight,words in topic
0,0,0.02,"0.028*""right"" + 0.018*""peopl"" + 0.015*""0800"" +..."
1,1,0.02,"0.026*""great"" + 0.020*""thanksgiv"" + 0.016*""fri..."
2,2,0.02,"0.049*""sorri"" + 0.044*""order"" + 0.043*""send"" +..."
3,3,0.02,"0.086*""thank"" + 0.051*""team"" + 0.048*""share"" +..."
4,4,0.02,"0.014*""merced"" + 0.011*""pack"" + 0.009*""class"" ..."
5,5,0.02,"0.025*""book"" + 0.021*""thing"" + 0.017*""jone"" + ..."
6,6,0.77,"0.056*""love"" + 0.033*""like"" + 0.032*""good"" + 0..."
7,7,0.02,"0.036*""free"" + 0.026*""happi"" + 0.024*""feel"" + ..."
8,8,0.02,"0.015*""year"" + 0.013*""time"" + 0.013*""launch"" +..."
9,9,0.02,"0.040*""holiday"" + 0.027*""shop"" + 0.026*""season..."


In [24]:
def topics_document_to_dataframe(topics_document, num_topics):
    """Spread one document's topic weights over a single-row DataFrame.

    Args:
        topics_document: Iterable of ``(topic_id, weight)`` pairs. If a
            topic id occurs more than once, the last weight wins.
        num_topics: Number of topics; columns ``0 .. num_topics - 1`` are
            always present.

    Returns:
        A float DataFrame with one row (index ``0``) holding the weight of
        every listed topic and NaN for the others. Topic ids outside
        ``range(num_topics)`` are appended as extra columns. If
        ``topics_document`` is empty, the frame has the columns but no rows.
    """
    weights = dict(topics_document)

    columns = list(range(num_topics))
    known_ids = set(columns)
    # Keep ids beyond num_topics instead of silently dropping them
    columns.extend(topic_id for topic_id in weights if topic_id not in known_ids)

    if not weights:
        return pd.DataFrame(columns=columns, dtype=float)

    # Build the row in one go rather than enlarging an empty frame cell by cell
    return pd.DataFrame([weights], columns=columns, dtype=float)

# Example call: 20 topic columns, with weights given only for topics 9, 15 and 18
topics_document_to_dataframe([(9, 0.03853655432967504), (15, 0.09130117862212643), (18, 0.8692868808484044)], 20)

  and should_run_async(code)
