In [50]:
# Twitter - tweet analysis (date: 8.11.2020)
import re
from pymongo import MongoClient
from pprint import pprint
import pandas as pd
from wordcloud import WordCloud
import collections
import pickle

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans, DBSCAN

In [112]:
# Functions used in the notebook
def mongo_connect(server_name: str) -> MongoClient:
    """Creates connection to the MongoDB database with given server name."""
    client = MongoClient(server_name)
    db = client.twitter_db
    return db

word_lemm = WordNetLemmatizer()
stopwords_eng = stopwords.words('english')

# Tweet preprocessing
def preprocess_texts(text_list: pd.DataFrame):
    """Processes text to remove all unwanted words and symbols."""

    # Lowercase the tweets
    text_list['processed_tweet'] = text_list['tweet_text'].str.lower()

    # Regex patterns
    url_pattern        = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    user_pattern       = '@[^\s]+'
    alpha_pattern      = "[^a-zA-Z0-9]"
    sequence_pattern   = r"(.)\1\1+"
    seq_replace_pattern = r"\1\1"

    # Remove URLs from the tweet text
    text_list['processed_tweet'] = [re.sub(url_pattern, ' ', str(x))
                                    for x in text_list['processed_tweet']]
    # Remove username from the tweet text
    text_list['processed_tweet'] = [re.sub(user_pattern, ' ', str(x))
                                    for x in text_list['processed_tweet']]
    # Remove all non-alphanumeric symbols
    text_list['processed_tweet'] = [re.sub(alpha_pattern, ' ', str(x))
                                    for x in text_list['processed_tweet']]
    # Replace all 3 or more consecutive letters with 2 letters
    text_list['processed_tweet'] = [re.sub(sequence_pattern, seq_replace_pattern, str(x))
                                    for x in text_list['processed_tweet']]

    
    full_tweet_list = []
    for x in text_list['processed_tweet']:
        full_tweet = ''
        for word in x.split():
            word = word_lemm.lemmatize(word)
            full_tweet += (word + ' ')
        full_tweet_list.append(full_tweet)

    text_list['processed_tweet'] = full_tweet_list

    return text_list

def get_most_frequent_words(docs: list, most_frequent_count: int = 10) -> dict:
    """Gets the most frequent words from the dataset"""
    word_count = {}
    
    for doc in docs:
        for word in doc.lower().split():
            if word not in word_count:
                word_count[word] = 1
            elif word in word_count:
                word_count[word] += 1
            
    word_counter = collections.Counter(word_count)
    
    final_word_count = {}
    
    for word, count in word_counter.most_common(most_frequent_count):
        final_word_count[word] = count
    
    final_word_count = pd.DataFrame({'Word': list(final_word_count.keys()), 'Word usage count': list(final_word_count.values())}, columns=['Word', 'Word usage count'])
    return final_word_count

def get_most_frequent_ngrams(docs: list, most_frequent_count: int = 10, ngram_count: int = 2) -> dict:
    """Gets the most frequent words from the dataset""" 
    final_ngram_count = {}
    
    for doc in docs:
        ngram_counter = collections.Counter(nltk.ngrams(doc.split(), ngram_count))
        for ngram, count in ngram_counter.most_common(most_frequent_count):
            if ngram not in final_ngram_count:
                final_ngram_count[ngram] = 1
            else:
                final_ngram_count[ngram] += int(count)
    
    final_ngram_count = pd.DataFrame({'Term': list(final_ngram_count.keys())[:most_frequent_count], 
                                      'Term usage count': sorted(list(final_ngram_count.values())[:most_frequent_count])[::-1]}, 
                                     columns=['Term', 'Term usage count'])
    return final_ngram_count

In [113]:
# Connect to local database
db = mongo_connect('localhost')

# Cursor for acquiring all posts
cursor = db['artist'].find()

artist_list = pd.DataFrame(list(cursor))


In [114]:
# Show the array 
artist_list.head()

Unnamed: 0,_id,tweet_text,username,created_at
0,5f9f1c36b38e10f823bf2cdc,"@AndruEdwards The hard work has paid off, this...",LEGO_Group,2020-11-01 19:32:05
1,5f9f1c36b38e10f823bf2cdd,@soosupersam A great way to surprise your love...,LEGO_Group,2020-11-01 19:09:40
2,5f9f1c36b38e10f823bf2cde,"You can now just bring the fun home, and reliv...",LEGO_Group,2020-11-01 14:00:36
3,5f9f1c36b38e10f823bf2cdf,@at_knb Happy birthday to the master builder! ...,LEGO_Group,2020-10-31 17:16:57
4,5f9f1c36b38e10f823bf2ce0,@dizunatsu 😀😀,LEGO_Group,2020-10-31 15:18:50


In [115]:
# Preprocess the texts
artist_list_processed = preprocess_texts(artist_list)
docs = artist_list_processed['processed_tweet'].tolist()

print(artist_list['tweet_text'][1])
print(docs[:100])

@soosupersam A great way to surprise your loved one! 🎁🥰
['the hard work ha paid off this is awesome ', 'a great way to surprise your loved one ', 'you can now just bring the fun home and relive your favorite childhood memory on sesame street ', 'happy birthday to the master builder we hope she had a magical day ', '', '', 'this is the way ', 'time to add a bit of legodots hocus pocus to your animal crossing island scan these code with your nooklink app to add to your collection acnh dotyourworld ', 'this is how you make all the ninja in the neighborhood jealous they look ninja tastic ', 'boo ', 'what a spooky ride ', 'these brick o lantern are certainly all treat and no trick get building this halloween rebuildtheworld ', '', 'u when we first saw the child set thisistheway legostarwars themandalorien thechild ', 'wow what a cool lego tower keep building and it ll be over your head in no time we can t wait to see what you use your imagination to create next rebuildtheworld ', 'the best 

In [116]:
freq_list = get_most_frequent_words(docs, 100)
print(freq_list)

ngram_list = get_most_frequent_ngrams(docs, 100, 2)
print(ngram_list)

      Word  Word usage count
0      the              1694
1       to              1075
2      you               961
3        a               801
4      and               710
..     ...               ...
95     way                51
96    just                51
97   sorry                51
98  repair                51
99   these                50

[100 rows x 2 columns]
                Term  Term usage count
0        (the, hard)               128
1       (hard, work)               110
2         (work, ha)                89
3         (ha, paid)                36
4        (paid, off)                32
..               ...               ...
95         (and, no)                 1
96       (no, trick)                 1
97      (trick, get)                 1
98   (get, building)                 1
99  (building, this)                 1

[100 rows x 2 columns]


In [117]:
# Create pipeline for getting most frequent words from the collection
archetype_list = ['artist',
                 'caregiver',
                 'everyman',
                 'explorer',
                 'guru',
                 'hero',
                 'innocent',
                 'jester',
                 'magician',
                 'rebel',
                 'ruler',
                 'seducer']

for archetype in archetype_list:
    # Create a cursor for acquiring all posts from the collection
    cursor = db[archetype].find()
    
    # Create a DataFrame with all the tweet text
    df_name = archetype + "_list"
    globals()[df_name] = pd.DataFrame(list(cursor))
    
    # Preprocess the texts
    globals()[df_name+"_processed"] = preprocess_texts(globals()[df_name])
    
    # Create a list out of processed tweets
    docs = globals()[df_name+"_processed"]['processed_tweet'].tolist()
    
    # Get the list of the most frequent words
    freq_list = get_most_frequent_words(docs, 1000)
    ngram_list = get_most_frequent_ngrams(docs, 100, 2)
    print(f'The most frequent words and bigrams for "{archetype}" archetype:')
    print(freq_list)
    print(ngram_list)
    
    # Save the most frequent word list to a Pickle
    file_path = f'archetype_freq_15112020/{archetype}_single.pickle'
    with open(file_path, "wb") as out_file:
        pickle.dump(freq_list, out_file)
        
    file_path = f'archetype_freq_15112020/{archetype}_ngrams.pickle'
    with open(file_path, "wb") as out_file:
        pickle.dump(ngram_list, out_file)   

The most frequent words and bigrams for "artist" archetype:
             Word  Word usage count
0             the              1694
1              to              1075
2             you               961
3               a               801
4             and               710
..            ...               ...
995         ninja                 4
996  legostarwars                 4
997      thechild                 4
998         proud                 4
999       teacher                 4

[1000 rows x 2 columns]
                Term  Term usage count
0        (the, hard)               128
1       (hard, work)               110
2         (work, ha)                89
3         (ha, paid)                36
4        (paid, off)                32
..               ...               ...
95         (and, no)                 1
96       (no, trick)                 1
97      (trick, get)                 1
98   (get, building)                 1
99  (building, this)                 1

[100 rows x 2 