In [1]:
import tweepy 
import configparser
import requests     # For making HTTP requests
import os           # For saving access tokens and for file management when creating and adding to the dataset
import json         # For dealing with json responses we receive from the API
import pandas as pd # For displaying the data after
import csv          # For saving the response data in CSV format
import datetime     # For parsing the dates received from twitter in readable formats
import dateutil.parser
import unicodedata
import time         # To add wait time between requests
import sqlite3
import re
import twitter
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance,PartOfSpeech
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN
from nltk.corpus import stopwords
from sklearn.model_selection import RandomizedSearchCV
from flair.embeddings import TransformerDocumentEmbeddings
import numpy as np
from twitter import *
from functools import partial
from collections import Counter
import nltk
import string
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [10]:
# Restore the saved BERTopic model and keep a handle on its topic assignments.
model = BERTopic.load(r"../model/localBERT")
topics = model.topics_

# Read the pickled tweet and user tables from ../data.
tweets_df = pd.read_pickle('../data/tweets_df.pkl')
users_df = pd.read_pickle('../data/users_df.pkl')
df_tweets_preprocessed = pd.read_pickle('../data/df_tweets_preprocessed.pkl')

In [12]:
# Reassign outlier documents with BERTopic's "c-tf-idf" strategy. No
# `threshold` argument is passed, so the library's default threshold applies.
#
# The documents are handed over as a plain list: the BERTopic API is documented
# for lists of strings, and a pandas Series would be looked up by index label
# rather than by position if its index is not the default 0..n-1 range.
preprocessed_docs = df_tweets_preprocessed['text_preprocessed'].tolist()
new_topics = model.reduce_outliers(preprocessed_docs, topics, strategy="c-tf-idf")

# Update the model with the new assignments so its topic info reflects them.
model.update_topics(preprocessed_docs, topics=new_topics)

In [14]:
# Fetch the per-topic summary table from the model.
freq = model.get_topic_info()

# Store the outlier-reduced topic id of every tweet next to its text.
df_tweets_preprocessed['updated_topics'] = new_topics

# Show the topic summary as the cell output.
freq

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,4354,0_viisit_official_giveaway_tesla,"[viisit, official, giveaway, tesla, url, elonm...",[TonyadeVitti elonmusk chance Dont miss itThe ...
1,1,1766,1_twitch_sponsored_stream_game,"[twitch, sponsored, stream, game, live, ad, pl...",[Exploring early game experience RAI Shadow Le...
2,2,2298,2_nike_sneakerscouts_gtgtgt_available,"[nike, sneakerscouts, gtgtgt, available, air, ...",[SneakerScouts Nike Womens Air Max Cobalt Blis...
3,3,1462,3_channel_youtube_do_video,"[channel, youtube, do, video, something, conte...",[ckayyo pabloalboran LeoRizziMusic Get YouTube...
4,4,1516,4_climate_energy_learn_report,"[climate, energy, learn, report, unep, sustain...",[Exciting news 🎉 Energy ’ Bioenergy Tech Offic...
5,5,1261,5_opti_optimus_rely_prepared,"[opti, optimus, rely, prepared, advance, knowl...",[AbbanB CryptoLordH Are prepared use AI advanc...
6,6,1698,6_blockchain_crypto_future_web3,"[blockchain, crypto, future, web3, defi, nft, ...",[CryptoNewsShark digitalassetbuy thecryptobasi...
7,7,1259,7_please_send_dm_like,"[please, send, dm, like, details, wed, hi, hea...",[bplaj63 Good morning Please DM additional det...
8,8,1135,8_eth_ive_bots_mev,"[eth, ive, bots, mev, sandwiching, tutorial, c...",[DRic2hard YouTube Ive made decent amount ETH ...
9,9,1770,9_learn_amp_join_june,"[learn, amp, join, june, register, women, tech...",[Today attending WITWomens Annual Leadership A...


# Which topics are you interested in?

1_twitch_sponsored_stream_game: STREAMING, GAMING - 2
  4_climate_energy_learn_unep	: ENVIRONMENT & ENERGY SUSTAINABILITY - 4
  9_health_research_cancer_dr: HEALTH AND CANCER RESEARCH & INNOVATION - 10
  17_ai_machinelearning_future_cc: ARTIFICIAL INTELLIGENCE RESEARCH & INNOVATION - 17
  22_women_un_amp_learn	: HUMAN RIGHTS, EQUALITY, AND WOMEN EMPOWERMENT - 22
 # Human-readable label for each numeric key (1-22).
 custom_name_list = {
     1: "Tesla Giveaway",
     2: "Streaming & Gaming",
     3: "Nike Ads",
     4: "Youtube Education",
     5: "Environment & Energy Sustainability",
     6: "Optimus",
     7: "Crypto",
     8: "Spam",
     9: "Blockchain Spam",
     10: "Human Rights & Equality",
     11: "Health and Cancer Research & Innovation",
     12: "Online Deals",
     13: "Amazon Deals",
     14: "Artificial Intelligence Research & Innovation",
     15: "Iphone Network",
     16: "Ford Car Deals",
     17: "LinkedIn Events",
     18: "Memes",
     19: "Media",
     20: "News",
     21: "Ebay Sports Trading Cards",
     22: "Shopping",
 }

In [None]:
import sys  # not used by this cell any more; kept in case other cells expect `sys` to be imported

# File that receives the per-topic tweet samples
output_file_path = 'output.txt'

# Topic ids whose tweets are written out
best_topics = [1, 5, 10, 17, 21]

# Write with print(..., file=...) instead of swapping sys.stdout: a manual swap
# is not undone if the loop raises, and resetting to sys.__stdout__ does not
# restore whatever stream was active before the cell ran.
# utf-8 is set explicitly because tweet text can contain emoji that a
# platform-default encoding may be unable to write.
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for i in best_topics:
        # Tweets whose outlier-reduced topic equals i, joined with the topic summary row
        df_tweets_topic_i = df_tweets_preprocessed[df_tweets_preprocessed['updated_topics'] == i]
        df_tweets_topic_i = pd.merge(
            df_tweets_topic_i,
            freq,
            how="left",
            left_on=['updated_topics'],
            right_on=['Topic']
        )

        # Take the scalar topic name; formatting the filtered Series directly
        # would print its index and dtype footer as well. Fall back to the
        # topic id when `freq` has no row for it.
        matching_names = freq.loc[freq['Topic'] == i, 'Name']
        name = matching_names.iloc[0] if not matching_names.empty else i

        print(f"Topic: {name}", file=output_file)
        print(df_tweets_topic_i['text_original'].head(100), file=output_file)

In [None]:
# Append the model's topic assignments as an extra column. The Series is left
# unnamed on purpose, so the new column gets the integer label 0 that the
# following cells use.
# The Series takes the DataFrame's own index so rows are paired by position;
# with a fresh 0..n-1 index, concat(axis=1) would align on labels and misalign
# whenever the DataFrame index is not the default range. A length mismatch now
# raises a ValueError.
topic_assignments = pd.Series(model.topics_, index=df_tweets_preprocessed.index)
df_tweets_pp_temp = pd.concat([df_tweets_preprocessed, topic_assignments], axis=1)

# info() prints its summary itself and returns None, so it is not wrapped in
# print() (that would add a stray "None" line).
df_tweets_pp_temp.info()

In [None]:
# Left-join the topic summary (`freq`) onto each tweet: the tweet's topic id is
# in the integer-labelled column 0 and is matched against freq['Topic'].
df_tweets_pp_final = pd.merge(
    df_tweets_pp_temp,
    freq,
    left_on=0,
    right_on='Topic',
    how='left',
)

In [None]:
# Keep only the tweets whose topic id (column 0) is one of the selected topics.
best_topics = [1, 5, 10, 17, 21]
df_tweets_pp_final = df_tweets_pp_final.loc[df_tweets_pp_final[0].isin(best_topics)]

In [None]:
# Attach the author's row from users_df to every tweet (left join on author_id,
# so tweets with no matching user row are kept).
df_tweets_pp_final = pd.merge(
    df_tweets_pp_final,
    users_df,
    how='left',
    on='author_id',
)

In [None]:
# Give the topic-id column a descriptive name. The column is addressed above as
# the integer 0 (df_tweets_pp_final[0]), so renaming only the string '0' would
# silently do nothing. The string key is still mapped for backward
# compatibility; rename() ignores keys that are not present.
df_tweets_pp_final = df_tweets_pp_final.rename(columns={0: 'topic', '0': 'topic'})

In [None]:
import ast

# Turn string-encoded metric dicts into real dicts. Only strings are parsed:
# ast.literal_eval raises on values that are already dicts, which made this
# cell fail when run a second time.
df_tweets_pp_final['tweet_metrics'] = df_tweets_pp_final['tweet_metrics'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [None]:
# Pull each engagement counter out of the tweet_metrics dict into its own
# column; a key missing from a dict counts as 0.
for metric_name in ('retweet_count', 'reply_count', 'like_count', 'impression_count'):
    df_tweets_pp_final[metric_name] = df_tweets_pp_final['tweet_metrics'].apply(
        lambda metrics, key=metric_name: metrics.get(key, 0)
    )

In [None]:
# One output row per distinct combination of these columns
# (presumably one row per user and topic label; verify against the data).
group_keys = ['author_id', 'username', 'followers_count', 'following_count', 'tweet_count', 'CustomName']

# Count the tweets and sum the engagement counters within each group.
df_tweets_grouped = (
    df_tweets_pp_final
    .groupby(group_keys)
    .agg(
        total_tweets=('tweet_id', 'count'),
        total_retweets=('retweet_count', 'sum'),
        total_replies=('reply_count', 'sum'),
        total_likes=('like_count', 'sum'),
        total_impressions=('impression_count', 'sum'),
    )
    .reset_index()
)

# Per-tweet averages: each total divided by the number of tweets in the group.
for metric in ('likes', 'impressions', 'replies'):
    df_tweets_grouped[f'avg_{metric}_per_tweet'] = (
        df_tweets_grouped[f'total_{metric}'] / df_tweets_grouped['total_tweets']
    )

In [None]:
# Relative weight of each scoring criterion
weights = {
    'Followers': 0.2,
    'Tweets': 0.2,
    'Avg_Likes_Per_Tweet': 0.25,
    'Avg_Impressions_Per_Tweet': 0.25,
    'Avg_Replies_Per_Tweet': 0.10
}

# Column of df_tweets_grouped that feeds each criterion
criterion_columns = {
    'Followers': 'followers_count',
    'Tweets': 'total_tweets',
    'Avg_Likes_Per_Tweet': 'avg_likes_per_tweet',
    'Avg_Impressions_Per_Tweet': 'avg_impressions_per_tweet',
    'Avg_Replies_Per_Tweet': 'avg_replies_per_tweet',
}

# user_score is the weighted sum of the raw column values.
# NOTE(review): the columns are combined on their raw scales without any
# normalisation, so a column with much larger values may dominate the score
# regardless of its weight. Confirm this is intended.
df_tweets_grouped['user_score'] = sum(
    weights[criterion] * df_tweets_grouped[column]
    for criterion, column in criterion_columns.items()
)

# Order rows by topic label (ascending), then by score (highest first).
sorted_grouped = df_tweets_grouped.sort_values(
    by=['CustomName', 'user_score'],
    ascending=[True, False],
)

# The first five rows of each topic label are its five highest-scoring users.
top_users_per_topic = sorted_grouped.groupby('CustomName').head(5)

In [None]:
# Display only the rows with at least 3 tweets in the group.
sorted_grouped.loc[sorted_grouped['total_tweets'] >= 3]