# Preprocessing
1. Convert text to lowercase 
2. Remove URLs, mentions, and special characters (besides hashtags and emojis)
3. Remove stop words
4. Perform stemming/lemmatization

Download the spaCy model
Can be installed via `python -m spacy download en_core_web_sm`

Download Pre-Trained-Language Model: `wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin`

In [1]:
import pandas as pd
import re
import spacy
import fasttext
from googletrans import Translator

In [2]:
# Load the JSON file
df_posts = pd.read_json('../data/dataset.json')
df_posts.head()

Unnamed: 0,timestamp,text,text_id,user,user_id
0,2024-10-31,Running a business means juggling countless ad...,2018569761,danielwoodard,1077866112
1,2024-10-31,Liz Truss is walking in the lingering shadow o...,2092717718,nelsonjacqueline,1089670430
2,2024-10-31,The UK is bracing for war as government buildi...,2059143248,ihooper,1007478642
3,2024-10-31,Marrying a second or third cousin once removed...,2008209828,wrightnicholas,1039258480
4,2024-10-31,It's truly disgraceful how the Indian National...,2001239278,michael51,1021455936


In [21]:
# Info this can take over 30m!!
# spacy.prefer_gpu()  # Prefers GPU but doesn't crash if unavailable
# nlp = spacy.load("en_core_web_trf")
# def extract_entities(text):
#     """
#     Extracts named entities from text using SpaCy's NER model.

#     Args:
#     text (str): The text from which to extract named entities.

#     Returns:
#     list: A list of tuples where each tuple contains (entity_text, entity_label).
#     """
#     if not text or pd.isna(text):
#         return []  # Return an empty list if text is missing

#     # Process text with SpaCy
#     doc = nlp(text)

#     # Extract entity text and labels
#     entities = [(ent.text, ent.label_) for ent in doc.ents]

#     return entities

# def preprocess_text(text):
#     """
#     Preprocesses text by removing URLs and emojis while keeping mentions and hashtags intact.

#     Args:
#     text (str): The original text.

#     Returns:
#     str: Preprocessed text.
#     """
#     if not text or pd.isna(text):
#         return ""  # Return empty string if text is missing

#     # Remove URLs
#     text = re.sub(r"http\S+|www\S+|https\S+", '', text)
#     # Remove emojis
#     text = emoji.replace_emoji(text, replace="")
    
#     return text.strip()

# # Create a preprocessed text column
# df_posts['preprocessed_text'] = df_posts['text'].apply(preprocess_text)

# # Apply NER extraction on the preprocessed text
# df_posts['entities'] = df_posts['preprocessed_text'].apply(extract_entities)

Check for rows with no text

In [3]:
# Display rows where 'text' is missing (NaN)
missing_text_rows = df_posts[df_posts['text'].isnull()]
missing_text_rows

Unnamed: 0,timestamp,text,text_id,user,user_id


### Convert variables

Move hashtags to a new column 'hashtags'

In [4]:
# Apply re.findall() to each row in the 'text' column to extract hashtags
df_posts['hashtags'] = df_posts['text'].apply(lambda x: re.findall(r'#\w+', x) if isinstance(x, str) else [])

Move mentions to a new column 'mentions'

In [5]:
# Extract mentions from the 'text' column, remove the '@' symbol, and create a new column 'mentions'
df_posts['mentions'] = df_posts['text'].apply(lambda x: [mention[1:] for mention in re.findall(r'@\w+', x)] if isinstance(x, str) else [])

Convert text to lowercase

In [6]:
df_posts['text'] = df_posts['text'].str.lower()

Remove Date from Timestamp

In [7]:
# All Dates are the same
unique_dates = df_posts['timestamp'].dt.date.unique()
unique_dates

array([datetime.date(2024, 10, 31)], dtype=object)

In [8]:
df_posts['timestamp'] = df_posts['timestamp'].dt.time
df_posts.head()

Unnamed: 0,timestamp,text,text_id,user,user_id,hashtags,mentions
0,00:00:00,running a business means juggling countless ad...,2018569761,danielwoodard,1077866112,"[#HRtech, #businessmanagement]",[]
1,00:00:00,liz truss is walking in the lingering shadow o...,2092717718,nelsonjacqueline,1089670430,[#politics],[]
2,00:00:00,the uk is bracing for war as government buildi...,2059143248,ihooper,1007478642,"[#Ukrainewashed, #WarPreparedness]",[]
3,00:00:00,marrying a second or third cousin once removed...,2008209828,wrightnicholas,1039258480,"[#FamilyTree, #GeneticFacts]",[]
4,00:00:00,it's truly disgraceful how the indian national...,2001239278,michael51,1021455936,[#RationChorCongress],[]


### Remove URLs, Mentions, and Special Characters

Leave apostrophes in here for better lemmatization performance

In [9]:
# Pre-compile regex patterns
emoji_pattern = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags
    u"\U00002700-\U000027BF"  # dingbats
    u"\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    u"\U00002600-\U000026FF"  # miscellaneous symbols
    u"\U00002B50-\U00002B55"  # stars
    "]+", flags=re.UNICODE)

url_pattern = re.compile(r'http\S+|www\S+|https\S+')  # Removes URLs
mention_pattern = re.compile(r'@\w+')  # Removes mentions
punctuation_pattern = re.compile(r"[^\w\s'’]")  # Removes punctuation but keeps apostrophes
number_pattern = re.compile(r'\d+')  # Removes numbers
whitespace_pattern = re.compile(r'\s+')  # Removes excessive whitespace
hashtag_pattern = re.compile(r'#\w+')  # Removes hashtags and all text after them

# Removes spaces between letters in a single word
letter_spacing_pattern = re.compile(r'(\b\w)(?:\s+)(\w\b)')

# List of terms to remove
remove_tw_terms = ["cc", "cx", "ct", "dm", "ht", "mt", "prt", "rt", "followback", "follow back", "fb", "retweet", "retweets"]

# Compile regex to match terms (case insensitive and whole word)
remove_terms_pattern = re.compile(r'\b(' + '|'.join(remove_tw_terms) + r')\b')

# Updated regex for matching spaced-out letters (e.g., "s h a r e")
letter_spacing_pattern = re.compile(r'(\b(?:\w\s)+\w\b)')

# Function to merge spaced-out letters
def merge_spaced_letters(match):
    # Remove spaces within the matched group
    return match.group(0).replace(' ', '')

def normalize_full_width(text):
    # Convert full-width characters to half-width
    return ''.join(
        chr(ord(char) - 0xFEE0) if 0xFF01 <= ord(char) <= 0xFF5E else char
        for char in text
    )

# Updated preprocessing function
def preprocess_text(text):
    if pd.isna(text):
        return "", []  # Handle missing values gracefully
    
    # Normalize full-width characters
    text = normalize_full_width(text)

    # Extract emojis
    emojis = emoji_pattern.findall(text)  # List of emojis

    # Remove hashtags and text following them
    text = hashtag_pattern.sub('', text)

    # Remove emojis, URLs, mentions, punctuation, and numbers
    text = emoji_pattern.sub('', text)  # Remove emojis
    text = url_pattern.sub('', text)  # Remove URLs
    text = mention_pattern.sub('', text)  # Remove mentions
    text = punctuation_pattern.sub('', text)  # Remove punctuation
    text = number_pattern.sub('', text)  # Remove numbers

    # Remove specific terms (CC, CX, CT, DM, etc.)
    text = remove_terms_pattern.sub('', text)

    # Normalize letter spacing (e.g., "s h a r e" -> "share")
    text = letter_spacing_pattern.sub(merge_spaced_letters, text)

    # Remove excessive whitespace and trim
    text = whitespace_pattern.sub(' ', text).strip()

    return text, emojis

# Apply preprocessing to create new columns
df_posts[['text', 'emojis']] = df_posts['text'].apply(lambda x: pd.Series(preprocess_text(x)))

# Display head to check the results
df_posts[['text', 'emojis']].head()

Unnamed: 0,text,emojis
0,running a business means juggling countless ad...,[]
1,liz truss is walking in the lingering shadow o...,[]
2,the uk is bracing for war as government buildi...,[🇺🇦]
3,marrying a second or third cousin once removed...,[🧬]
4,it's truly disgraceful how the indian national...,"[🤦, ♂]"


Test output

In [11]:
df_posts.to_csv('../output/testing.csv', index=False)

## Check for duplicates

In [None]:
# Total duplicate rows
total_duplicate_rows = df_posts['text'].duplicated(keep=False).sum()

# Number of unique duplicate tweets
unique_duplicate_tweets = (df_posts['text'].value_counts() > 1).sum()

print(f"Total duplicate rows: {total_duplicate_rows}")
print(f"Unique duplicate tweets: {unique_duplicate_tweets}")

# Duplicate stats

In [None]:
# Get the frequency distribution of tweets
frequency_distribution = df_posts['text'].value_counts()

# Filter for only duplicates (frequency > 1)
duplicate_tweet_frequencies = frequency_distribution[frequency_distribution > 1]

# Summary statistics
print(duplicate_tweet_frequencies.describe())

# How many tweets are repeated more than 5 times?
highly_duplicated = (duplicate_tweet_frequencies > 5).sum()
print(f"Number of tweets repeated more than 5 times: {highly_duplicated}")

Remove duplicate tweets only if the same person posted the same tweet (spam) but extract frequency

In [14]:
# Step 1: Group by 'user' and 'text' and calculate the frequency of each combination
df_posts['frequency'] = df_posts.groupby(['user', 'text'])['text'].transform('count')

# Step 2: Drop duplicates based on 'user' and 'text' (keeping the first occurrence)
df_posts = df_posts.drop_duplicates(subset=['user', 'text'], keep='first').reset_index(drop=True)

In [None]:
df_posts[df_posts['user'] == 'reginabarnes']

Indentify language

In [None]:
# Load the pre-trained language identification model
model = fasttext.load_model('../models/lid.176.bin')

# Function to detect language using FastText
def detect_language_fasttext(text, threshold=0.8):
    # Skip tweets with fewer than 3 words
    if len(text.split()) < 3:
        return None  # Skip these tweets entirely
    try:
        predictions = model.predict(text)  # Predict the language
        lang_code = predictions[0][0].replace("__label__", "")  # Extract language code
        return lang_code
    except Exception as e:
        return None  # Skip on exception

# Apply language detection to the 'text' column
df_posts['language'] = df_posts['text'].apply(detect_language_fasttext)

# Filter non-English tweets (ignoring None values)
non_english_tweets = df_posts[(df_posts['language'].notna()) & (df_posts['language'] != 'en')]

# Filter to keep English tweets AND tweets with None language
df_posts = df_posts[(df_posts['language'] == 'en') | (df_posts['language'].isna())].copy()

# Print diagnostic information
print(f"Total tweets after filtering: {len(df_posts)}")
print(f"English tweets: {len(df_posts[df_posts['language'] == 'en'])}")
print(f"Tweets with None language: {df_posts['language'].isna().sum()}")
print(f"Number of non-English tweets: {len(non_english_tweets)}")

In [17]:
# Save the non-English tweets to a CSV file
non_english_tweets.to_csv('../output/non_english_tweets.csv', index=False)

Count the number of appearances of each Language

In [None]:
language_counts = non_english_tweets['language'].value_counts()

print("Number of tweets per non-english language:")
print(language_counts)

Translate the tweets

In [None]:
# Initialize translator
translator = Translator()

languages_to_translate = ['pt', 'es', 'th', 'id']  # List of language codes
translated_tweets = non_english_tweets[non_english_tweets['language'].isin(languages_to_translate)]

# Function to translate text
def translate_text(text):
    try:
        return translator.translate(text, dest='en').text
    except Exception as e:
        return text  # Return original text if translation fails

# Translate the tweets in the filtered DataFrame
translated_tweets['translated_text'] = translated_tweets['text'].apply(translate_text)

# Save the translated tweets to a CSV file
translated_tweets.to_csv('../output/translated_tweets.csv', index=False)

# Display the number of translated tweets
print(f"Number of tweets translated: {len(translated_tweets)}")

Append translated tweets to english tweets dataframe

In [None]:
df_posts = pd.concat([df_posts, translated_tweets], ignore_index=True)

print(f"Final number of tweets: {len(df_posts)}")

## Use spacy

Load en_core_web_sm for spacy

In [None]:
nlp = spacy.load("en_core_web_sm")

### Lemmatization

Use lemmatization since stemming can lead to less accurate results (even non-words)

In [None]:
# Info this takes about 2.5m!!
df_posts['text'] = df_posts['text'].apply(
        lambda text: ' '.join([token.lemma_ for token in nlp(text)])
)

### Create output for sentiment analysis

Keep stopwords

In [19]:
df_posts.to_csv('../output/preprocessed_for_SA.csv', index=False)

### Remove stopwords

In [20]:
df_posts['text'] = df_posts['text'].fillna("").apply(
    lambda text: ' '.join([token.text for token in nlp.make_doc(text) if not token.is_stop])
)

### Remove Apostrophes

In [21]:
df_posts['text'] = df_posts['text'].str.replace(r"[’']", "", regex=True)

### Check again for empty rows

In [None]:
# Display rows where 'text' is missing (NaN)
missing_text_rows = df_posts[df_posts['text'].isnull()]
missing_text_rows

### Write the output to csv file

In [23]:
df_posts.to_csv('../output/preprocessed.csv', index=False)

In [None]:
df_posts[df_posts['frequency'] > 85]

Topic classification with BERTopic

Install transformers 4.41.0 for compatability with spacy and BERTopic

`pip install transformers==4.41.0` 

Careful when running this, very hardware intensive

In [16]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import psutil
import gc
from tqdm import tqdm

def monitor_memory():
    """Monitor memory usage"""
    process = psutil.Process()
    memory_gb = process.memory_info().rss / 1024 / 1024 / 1024
    return f"Memory Usage: {memory_gb:.2f} GB"

def create_multifeature_embeddings(df_posts, sentence_model, batch_size=64):
    """
    Create combined embeddings with memory monitoring and larger batches
    """
    print(f"\nStarting embedding generation for {len(df_posts)} documents")
    print(monitor_memory())
    
    def safe_join(items):
        if not items or (isinstance(items, list) and len(items) == 0):
            return ""
        return " ".join(str(item) for item in items)
    
    print("\nGenerating text embeddings...")
    text_embeddings = sentence_model.encode(
        df_posts['text'].fillna("").tolist(),
        batch_size=batch_size,
        show_progress_bar=True
    )
    print(monitor_memory())
    
    print("\nGenerating hashtag embeddings...")
    hashtag_embeddings = sentence_model.encode(
        [safe_join(tags) for tags in df_posts['hashtags']],
        batch_size=batch_size,
        show_progress_bar=True
    )
    print(monitor_memory())
    
    print("\nGenerating emoji embeddings...")
    emoji_embeddings = sentence_model.encode(
        [safe_join(emojis) for emojis in df_posts['emojis']],
        batch_size=batch_size,
        show_progress_bar=True
    )
    print(monitor_memory())
    
    print("\nCombining embeddings...")
    combined_embeddings = (
        0.97 * text_embeddings + 
        0.02 * hashtag_embeddings + 
        0.01 * emoji_embeddings
    )
    
    # Clean up to free memory
    del text_embeddings, hashtag_embeddings, emoji_embeddings
    gc.collect()
    
    print(monitor_memory())
    return combined_embeddings

def setup_bertopic_model(df_posts, batch_size=64):
    """
    Set up and train BERTopic model with memory optimization
    """
    print(f"\nDataset size: {len(df_posts)} documents")
    print(f"DataFrame memory usage: {df_posts.memory_usage().sum() / 1024**2:.2f} MB")
    print(monitor_memory())
    
    print("\nInitializing models...")
    sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
    
    # Modified parameters for large dataset
    vectorizer_model = CountVectorizer(
        stop_words="english",
        #min_df=5,  # Increased from 10 for larger dataset
        #max_df=0.9,  # Ignore terms that appear in >80% of docs
        ngram_range=(1, 2)
    )
    
    # Optimized UMAP settings for large dataset
    umap_model = UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric='cosine',
        low_memory=True,
        random_state=42
    )
    
    # Create embeddings
    embeddings = create_multifeature_embeddings(df_posts, sentence_model, batch_size)
    
    # Initialize BERTopic with optimized settings
    topic_model = BERTopic(
        embedding_model=sentence_model,
        vectorizer_model=vectorizer_model,
        umap_model=umap_model,
        min_topic_size=30,  # Increased for larger dataset
        nr_topics='auto',
        calculate_probabilities=True,
        verbose=True
    )
    
    print("\nFitting BERTopic model...")
    topics, probs = topic_model.fit_transform(
        documents=df_posts['text'].fillna("").tolist(),
        embeddings=embeddings
    )
    
    return topic_model, topics, probs

def analyze_topics(topic_model, topics, df_posts):
    """
    Analyze topics with memory considerations
    """
    print("\nAnalyzing topics...")
    print(monitor_memory())
    
    topic_info = topic_model.get_topic_info()
    
    # More memory-efficient way to store topics
    df_posts['topic'] = topics
    
    # Get sample documents for each topic (limited to save memory)
    topic_docs = {}
    unique_topics = set(topics)
    print(f"\nFound {len(unique_topics)-1} topics (excluding -1)")
    
    for topic in tqdm(unique_topics):
        if topic != -1:
            topic_docs[topic] = df_posts[df_posts['topic'] == topic]['text'].head(3).tolist()
    
    try:
        print("\nGenerating visualizations...")
        topic_model.visualize_topics()
        topic_model.visualize_hierarchy()
    except Exception as e:
        print(f"Warning: Visualization error: {e}")
    
    return topic_info, topic_docs

def run_topic_analysis(df_posts, batch_size=64):
    """
    Run the complete pipeline with memory monitoring
    """
    print(f"Starting analysis with batch size: {batch_size}")
    print(monitor_memory())
    
    required_columns = ['text', 'hashtags', 'emojis']
    if not all(col in df_posts.columns for col in required_columns):
        raise ValueError(f"Missing columns. Required: {required_columns}")
    
    # Optionally sample for testing
    df_posts = df_posts.sample(n=10000, random_state=42)  # Uncomment to test with sample
    
    topic_model, topics, probs = setup_bertopic_model(df_posts, batch_size)
    topic_info, topic_docs = analyze_topics(topic_model, topics, df_posts)
    
    # Create memory-efficient summary
    summary = {
        'num_topics': len(set(topics)) - 1,
        'topic_sizes': topic_info['Count'].tolist(),
        'top_topics': topic_info.head(10).to_dict('records')
    }
    
    return topic_model, summary

In [17]:
# Then run:
try:
    # You can adjust batch size based on your memory
    topic_model, summary = run_topic_analysis(df_posts, batch_size=64)
    
    print(f"\nAnalysis complete!")
    print(f"Found {summary['num_topics']} topics")
    print("\nTop 10 topics:")
    for topic in summary['top_topics']:
        print(f"Topic {topic['Topic']}: Size {topic['Count']}")
    
    # Save model if needed
    topic_model.save("bertopic_model_large")
    
except Exception as e:
    print(f"Error during analysis: {e}")

Starting analysis with batch size: 64
Memory Usage: 0.42 GB

Dataset size: 10000 documents
DataFrame memory usage: 0.69 MB
Memory Usage: 0.48 GB

Initializing models...

Starting embedding generation for 10000 documents
Memory Usage: 0.54 GB

Generating text embeddings...


Batches: 100%|██████████| 157/157 [00:07<00:00, 20.94it/s]


Memory Usage: 0.89 GB

Generating hashtag embeddings...


Batches: 100%|██████████| 157/157 [00:03<00:00, 44.05it/s]


Memory Usage: 0.98 GB

Generating emoji embeddings...


Batches: 100%|██████████| 157/157 [00:01<00:00, 104.92it/s]


Memory Usage: 1.05 GB

Combining embeddings...


2025-01-21 23:51:42,469 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


Memory Usage: 1.27 GB

Fitting BERTopic model...


2025-01-21 23:51:52,253 - BERTopic - Dimensionality - Completed ✓
2025-01-21 23:51:52,253 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-01-21 23:51:52,833 - BERTopic - Cluster - Completed ✓
2025-01-21 23:51:52,833 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-01-21 23:51:53,065 - BERTopic - Representation - Completed ✓
2025-01-21 23:51:53,066 - BERTopic - Topic reduction - Reducing number of topics
2025-01-21 23:51:53,290 - BERTopic - Topic reduction - Reduced number of topics from 43 to 27



Analyzing topics...
Memory Usage: 1.39 GB

Found 26 topics (excluding -1)


100%|██████████| 27/27 [00:00<00:00, 7150.28it/s]



Generating visualizations...

Analysis complete!
Found 26 topics

Top 10 topics:
Topic -1: Size 3535
Topic 0: Size 1553
Topic 1: Size 930
Topic 2: Size 799
Topic 3: Size 768
Topic 4: Size 257
Topic 5: Size 235
Topic 6: Size 230
Topic 7: Size 218
Topic 8: Size 196


In [18]:
# 1. Save visualizations to HTML files
fig = topic_model.visualize_barchart(top_n_topics=10)
fig.write_html("../output/topic_barchart.html")

topic_model.visualize_topics().write_html("../output/topic_clusters.html")
topic_model.visualize_hierarchy().write_html("../output/topic_hierarchy.html")

# 2. Print text-based summary
topics_info = topic_model.get_topic_info()
print("\nMost frequent topics with their terms:")
for _, row in topics_info.head(10).iterrows():
    topic_id = row['Topic']
    size = row['Count']
    if topic_id != -1:
        terms = topic_model.get_topic(topic_id)
        print(f"\nTopic {topic_id} (Size: {size}):")
        # Print top 10 terms for each topic with their weights
        for term, weight in terms[:10]:
            print(f"  - {term}: {weight:.3f}")


Most frequent topics with their terms:

Topic 0 (Size: 1553):
  - business: 0.050
  - check: 0.016
  - learn: 0.011
  - marketing: 0.010
  - success: 0.010
  - looking: 0.010
  - don: 0.010
  - online: 0.010
  - new: 0.009
  - join: 0.009

Topic 1 (Size: 930):
  - politics: 0.032
  - political: 0.018
  - truss: 0.014
  - people: 0.012
  - time: 0.012
  - liz truss: 0.011
  - liz: 0.011
  - let: 0.010
  - like: 0.010
  - just: 0.010

Topic 2 (Size: 799):
  - vote: 0.079
  - artist: 0.067
  - social artist: 0.057
  - social: 0.053
  - voting: 0.049
  - let: 0.046
  - army: 0.034
  - love: 0.033
  - make: 0.032
  - counts: 0.030

Topic 3 (Size: 768):
  - health: 0.040
  - mental: 0.038
  - mental health: 0.036
  - care: 0.013
  - healthcare: 0.013
  - medical: 0.011
  - need: 0.011
  - support: 0.011
  - impact: 0.010
  - energy: 0.010

Topic 4 (Size: 257):
  - ready: 0.032
  - wait: 0.030
  - album: 0.027
  - days: 0.025
  - new: 0.025
  - music: 0.023
  - new album: 0.021
  - amp: 0.02