In [17]:
# Script to check out the acquired text
# and try to associate the texts with 
# the brand archetypes

import re
from pymongo import MongoClient
from pprint import pprint
import pandas as pd
from wordcloud import WordCloud

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans, DBSCAN

In [18]:
# Functions used in the notebook
def mongo_connect(server_name: str) -> MongoClient:
    """Creates connection to the MongoDB database with given server name."""
    client = MongoClient(server_name)
    db = client.twitter_db
    return db

word_lemm = WordNetLemmatizer()

# Tweet preprocessing
def preprocess_texts(text_list: pd.DataFrame):
    """Processes text to remove all unwanted words and symbols."""

    # Lowercase the tweets
    text_list['processed_tweet'] = text_list['tweet_text'].str.lower()

    # Regex patterns
    url_pattern        = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    user_pattern       = '@[^\s]+'
    alpha_pattern      = "[^a-zA-Z0-9]"
    sequence_pattern   = r"(.)\1\1+"
    seq_replace_pattern = r"\1\1"

    # Remove URLs from the tweet text
    text_list['processed_tweet'] = [re.sub(url_pattern, ' ', str(x))
                                    for x in text_list['processed_tweet']]
    # Remove username from the tweet text
    text_list['processed_tweet'] = [re.sub(user_pattern, ' ', str(x))
                                    for x in text_list['processed_tweet']]
    # Remove all non-alphanumeric symbols
    text_list['processed_tweet'] = [re.sub(alpha_pattern, ' ', str(x))
                                    for x in text_list['processed_tweet']]
    # Replace all 3 or more consecutive letters with 2 letters
    text_list['processed_tweet'] = [re.sub(sequence_pattern, seq_replace_pattern, str(x))
                                    for x in text_list['processed_tweet']]

    full_tweet = ''
    full_tweet_list = []
    for x in text_list['processed_tweet']:
        for word in x.split():
            if word not in stopwords.words('english'):
                if len(word) > 1:
                    word = word_lemm.lemmatize(word)
                    full_tweet += (word + ' ')
        full_tweet_list.append(full_tweet)

    text_list['processed_tweet'] = full_tweet_list

    return text_list

In [19]:
# Connect to local database
db = mongo_connect('localhost')

# Cursor for acquiring all posts
cursor = db.twitter_posts.find()

df = pd.DataFrame(list(cursor))


In [20]:
df.head()

Unnamed: 0,_id,tweet_text,username,created_at
0,5f8972f59848a997cb675eb9,They worked with youth climate activists in th...,Virgin,2020-10-16 09:57:21
1,5f8972f59848a997cb675eba,.@EnvisionVirgin signing the framework further...,Virgin,2020-10-16 09:57:21
2,5f8972f59848a997cb675ebb,The framework calls on sporting organisations ...,Virgin,2020-10-16 09:57:19
3,5f8972f59848a997cb675ebc,.@HollyBranson and CEO of @BransonCentreCA Lau...,Virgin,2020-10-15 18:30:29
4,5f8972f59848a997cb675ebd,“50 years eh? Who’d have thought........Actual...,Virgin,2020-10-15 16:42:17


In [21]:
import time

df_processed = preprocess_texts(df)

# TF-IDF Vectorization - CountVectorize (Bag of Words), and then apply IF-IDF Transformer
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=800000)
start = time.time()
vectorizer.fit(df_processed['processed_tweet'])
print(f'Vectorizer fitting ended in {round(time.time()-start)} seconds')
print(f'Number of feature_words: {len(vectorizer.get_feature_names())}')

Vectorizer fitting ended in 10 seconds
Number of feature_words: 16197


In [22]:
NUM_CLUSTERS = 12

transformed_tweets = vectorizer.transform(df_processed['processed_tweet'])

kmpp_model = KMeans(n_clusters=NUM_CLUSTERS, init='k-means++', n_init=1, max_iter=1000, tol=1e-5,
                    random_state=2200)
kmpp_model.fit(transformed_tweets)

def show_training_results(model, n_clusters):
    order_centroids = model.cluster_centers_.argsort()[:,::-1]
    terms = vectorizer.get_feature_names()
    
    for i in range(n_clusters):
        term_list = terms[]
        print("Cluster %d:" % i)
        for ind in order_centroids[i, :20]:
            print(' %s' % terms[ind])

show_training_results(kmpp_model, NUM_CLUSTERS)

Cluster 0:
 dm
 please
 hi
 thanks
 done without
 davidson apolitical
 image projected
 consent harley
 without consent
 harley
 harley davidson
 apolitical
 projected
 projected museum
 davidson
 museum done
 consent
 help
 address
 number
Cluster 1:
 team
 help
 chanel
 thanks
 please
 hi
 dm
 new
 look
 time
 hear
 year
 like
 one
 amp
 sorry
 experience
 see
 latest
 collection
Cluster 2:
 hi
 done without
 without consent
 image projected
 consent
 apolitical
 davidson
 projected
 projected museum
 harley davidson
 consent harley
 harley
 davidson apolitical
 museum done
 image
 without
 museum
 done
 apolitical image
 thanks
Cluster 3:
 team
 year
 disneyplus
 please
 number
 first
 dm
 dyson
 change
 hi
 streaming
 new
 look
 real
 know
 great
 thanks
 hey
 20
 one
Cluster 4:
 chanel
 team
 dm
 please
 experience
 help
 year
 sorry
 learn
 new
 thanks
 look
 hear
 number
 gabrielle
 gabrielle chanel
 great
 latest
 time
 life
Cluster 5:
 please
 dm
 hi
 thanks
 help
 sorry
 team