Below we compare the top topics in the tweet library across groups of tweet authors, where the groups are defined by each author's influence (the number of "Likes" within the dataset).  The first group is the Top 100 authors by number of "Likes" within the dataset. 

In [104]:
import pandas as pd

import cleantext  
from emoji import demojize
import re
import nltk
from nltk.tokenize import word_tokenize
#nltk.download('wordnet')
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

from bertopic import BERTopic

import os
# Make the C:\Data folder the working directory, so the relative CSV
# filenames used by the later cells are written to and read from there.
# NOTE(review): hard-coded absolute Windows path; adjust per machine.
os.chdir('C:\\Data\\')

import pyodbc
# Server and database that hold the tweet tables queried below.
sServer = 'localhost'
sDB = 'CUNY'
# The adjacent string pieces are concatenated into one ODBC connection string.
# Trusted_Connection=yes asks for Windows integrated authentication, so no
# user name or password is stored in the notebook.
cnxn = pyodbc.connect("Driver={SQL Server Native Client 11.0};"
                      "Server=" + sServer + ";"
                      "Database=" + sDB + ";"
                      "Trusted_Connection=yes;") 


In [88]:
def clean_text(x):
    """Normalise one raw tweet string before it is tokenized.

    Steps, in order:
      1. ``demojize`` (with ``language='alias'``) swaps emoji for their
         colon-delimited textual names.
      2. Every run of colons, together with any spaces directly after it,
         becomes a single space, so adjacent emoji names do not fuse.
      3. ``cleantext.clean`` is called with the flags for lower-casing,
         number removal, punctuation removal, stop-word removal and
         extra-space removal, plus a regex for e-mail-like strings that
         are replaced by a space.

    Returns whatever ``cleantext.clean`` returns for the prepared text.
    """
    with_emoji_names = demojize(x, language='alias')
    # Colons are the emoji delimiters; turn them into word separators.
    colon_free = re.sub(r"[:]+\ *", " ", with_emoji_names)
    cleaning_options = dict(
        extra_spaces=True,
        lowercase=True,
        numbers=True,
        punct=True,
        stopwords=True,
        reg=r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+",
        reg_replace=' ',
    )
    return cleantext.clean(colon_free, **cleaning_options)

In [89]:
def lemmatize_word(text):
    """Lemmatize every token of an already-tokenized text.

    ``text`` is iterated item by item (in this notebook it is the token
    list produced by ``word_tokenize``) and each item is passed to
    ``WordNetLemmatizer.lemmatize`` with that method's default settings.

    Returns a new list with the lemmatized tokens in their original order.
    """
    to_root = WordNetLemmatizer().lemmatize
    return list(map(to_root, text))

In [90]:
def rationalize_text(txt):
    """Clean, tokenize and lemmatize every entry of a pandas Series.

    Each value is passed through ``clean_text``, then ``word_tokenize``,
    then ``lemmatize_word``. The resulting tokens are glued back into one
    string in which every token is followed by a single space, so any
    non-empty result ends with a trailing space.

    Returns a new Series carrying the same index as ``txt``.
    """
    cleaned = txt.apply(clean_text)
    tokenized = cleaned.apply(word_tokenize)
    lemmatized = tokenized.apply(lemmatize_word)

    def _rejoin(tokens):
        # Token + one space, repeated; deliberately keeps the trailing space.
        return ''.join(token + ' ' for token in tokens)

    return lemmatized.apply(_rejoin)


The first group consists of the Top 100 authors by number of "Likes" within the dataset. 

In [91]:
# How the text file is created: fetch the English tweets of authors whose
# LikeCount in tbl_Musk is above 70,000 and cache them as a CSV file.
filename = "SourcesTop.csv"
sSQL = """SELECT UserContent 
          FROM elonmusktwitter_tweets a 
          INNER JOIN tbl_Musk b ON a.UserName = b.UserName 
          WHERE a.UserLanguage = 'en' AND b.LikeCount > 70000 """
df = pd.read_sql_query(sSQL, cnxn)
df.columns = ['UserContent']
# index=False: the default integer index carries no information and would
# otherwise come back as a spurious "Unnamed: 0" column when the file is
# re-read with pd.read_csv.
df.to_csv(filename, encoding='utf-8', index=False)

In [94]:
# Create the topic model for this author group and save its topic summary
# (the bar chart of the top topics is drawn in the next cell).
tweets = pd.read_csv(filename)
# read_csv loads empty fields as NaN, which the string-cleaning functions
# cannot process, so rows without tweet text are dropped first.
tweets = tweets.dropna(subset=['UserContent'])
docs = rationalize_text(tweets['UserContent']).tolist()
# NOTE(review): no random seed is set, so the topics may differ between
# runs; consider a seeded UMAP model if reproducibility matters.
model = BERTopic(verbose=True)
topics, probabilities = model.fit_transform(docs)
# One row per topic, as returned by BERTopic's get_topic_info().
df = model.get_topic_info()
df.to_csv('topic_info_'+filename, encoding='utf-8')


Batches:   0%|          | 0/84 [00:00<?, ?it/s]

2022-05-23 18:55:23,011 - BERTopic - Transformed documents to Embeddings
2022-05-23 18:55:31,114 - BERTopic - Reduced dimensionality
2022-05-23 18:55:31,235 - BERTopic - Clustered reduced embeddings


In [None]:
# Bar charts of the highest-scoring words for the model's top topics
# (BERTopic's visualize_barchart shows the top 8 topics by default).
model.visualize_barchart()

The second group consists of authors who garnered between 1,000 and 70,000 Likes (Ranked 101 - 2779). 

In [101]:
# How the text file is created: fetch the English tweets of authors whose
# LikeCount in tbl_Musk lies between 1,000 and 70,000 (BETWEEN is inclusive)
# and cache them as a CSV file.
filename = "SourcesMid.csv"
sSQL = """SELECT UserContent 
          FROM elonmusktwitter_tweets a 
          INNER JOIN tbl_Musk b ON a.UserName = b.UserName 
          WHERE a.UserLanguage = 'en' AND b.LikeCount BETWEEN 1000 AND 70000 """
df = pd.read_sql_query(sSQL, cnxn)
df.columns = ['UserContent']
# index=False: the default integer index carries no information and would
# otherwise come back as a spurious "Unnamed: 0" column when the file is
# re-read with pd.read_csv.
df.to_csv(filename, encoding='utf-8', index=False)

In [102]:
# Create the topic model for this author group and save its topic summary
# (the bar chart of the top topics is drawn in the next cell).
tweets = pd.read_csv(filename)
# read_csv loads empty fields as NaN, which the string-cleaning functions
# cannot process, so rows without tweet text are dropped first.
tweets = tweets.dropna(subset=['UserContent'])
docs = rationalize_text(tweets['UserContent']).tolist()
# NOTE(review): no random seed is set, so the topics may differ between
# runs; consider a seeded UMAP model if reproducibility matters.
model = BERTopic(verbose=True)
topics, probabilities = model.fit_transform(docs)
# One row per topic, as returned by BERTopic's get_topic_info().
df = model.get_topic_info()
df.to_csv('topic_info_'+filename, encoding='utf-8')


Batches:   0%|          | 0/511 [00:00<?, ?it/s]

2022-05-23 19:15:51,888 - BERTopic - Transformed documents to Embeddings
2022-05-23 19:15:58,028 - BERTopic - Reduced dimensionality
2022-05-23 19:15:58,798 - BERTopic - Clustered reduced embeddings


In [103]:
# Bar charts of the highest-scoring words for the model's top topics
# (BERTopic's visualize_barchart shows the top 8 topics by default).
model.visualize_barchart()

The third group consists of authors who garnered between 100 and 999 Likes (Ranked 2780 - 12884). 


In [98]:
# How the text file is created: fetch the English tweets of authors whose
# LikeCount in tbl_Musk lies between 100 and 999 (BETWEEN is inclusive)
# and cache them as a CSV file.
filename = "SourcesLow.csv"
sSQL = """SELECT UserContent 
          FROM elonmusktwitter_tweets a 
          INNER JOIN tbl_Musk b ON a.UserName = b.UserName 
          WHERE a.UserLanguage = 'en' AND b.LikeCount BETWEEN 100 AND 999 """
df = pd.read_sql_query(sSQL, cnxn)
df.columns = ['UserContent']
# index=False: the default integer index carries no information and would
# otherwise come back as a spurious "Unnamed: 0" column when the file is
# re-read with pd.read_csv.
df.to_csv(filename, encoding='utf-8', index=False)

In [99]:
# Create the topic model for this author group and save its topic summary
# (the bar chart of the top topics is drawn in the next cell).
tweets = pd.read_csv(filename)
# read_csv loads empty fields as NaN, which the string-cleaning functions
# cannot process, so rows without tweet text are dropped first.
tweets = tweets.dropna(subset=['UserContent'])
docs = rationalize_text(tweets['UserContent']).tolist()
# NOTE(review): no random seed is set, so the topics may differ between
# runs; consider a seeded UMAP model if reproducibility matters.
model = BERTopic(verbose=True)
topics, probabilities = model.fit_transform(docs)
# One row per topic, as returned by BERTopic's get_topic_info().
df = model.get_topic_info()
df.to_csv('topic_info_'+filename, encoding='utf-8')


Batches:   0%|          | 0/1379 [00:00<?, ?it/s]

2022-05-23 19:11:36,952 - BERTopic - Transformed documents to Embeddings
2022-05-23 19:11:52,745 - BERTopic - Reduced dimensionality
2022-05-23 19:11:56,344 - BERTopic - Clustered reduced embeddings


In [100]:
# Bar charts of the highest-scoring words for the model's top topics
# (BERTopic's visualize_barchart shows the top 8 topics by default).
model.visualize_barchart()