# Preprocessing
1. Convert text to lowercase 
2. Remove URLs, mentions, and special characters (besides hashtags and emojis)
3. Remove stop words
4. Perform stemming/lemmatization

Download the spaCy model
Can be installed via `python -m spacy download en_core_web_sm`

In [83]:
import pandas as pd
import re
import spacy
from googletrans import Translator
from lingua import LanguageDetectorBuilder
from lingua import IsoCode639_1
import nest_asyncio
import asyncio
from lingua import IsoCode639_1

In [84]:
# Load the JSON file
df_posts = pd.read_json('../data/dataset.json')
df_posts.head()

Unnamed: 0,timestamp,text,text_id,user,user_id
0,2024-10-31,Running a business means juggling countless ad...,2018569761,danielwoodard,1077866112
1,2024-10-31,Liz Truss is walking in the lingering shadow o...,2092717718,nelsonjacqueline,1089670430
2,2024-10-31,The UK is bracing for war as government buildi...,2059143248,ihooper,1007478642
3,2024-10-31,Marrying a second or third cousin once removed...,2008209828,wrightnicholas,1039258480
4,2024-10-31,It's truly disgraceful how the Indian National...,2001239278,michael51,1021455936


In [85]:
# Info this can take over 30m!!
# spacy.prefer_gpu()  # Prefers GPU but doesn't crash if unavailable
# nlp = spacy.load("en_core_web_trf")
# def extract_entities(text):
#     """
#     Extracts named entities from text using SpaCy's NER model.

#     Args:
#     text (str): The text from which to extract named entities.

#     Returns:
#     list: A list of tuples where each tuple contains (entity_text, entity_label).
#     """
#     if not text or pd.isna(text):
#         return []  # Return an empty list if text is missing

#     # Process text with SpaCy
#     doc = nlp(text)

#     # Extract entity text and labels
#     entities = [(ent.text, ent.label_) for ent in doc.ents]

#     return entities

# def preprocess_text(text):
#     """
#     Preprocesses text by removing URLs and emojis while keeping mentions and hashtags intact.

#     Args:
#     text (str): The original text.

#     Returns:
#     str: Preprocessed text.
#     """
#     if not text or pd.isna(text):
#         return ""  # Return empty string if text is missing

#     # Remove URLs
#     text = re.sub(r"http\S+|www\S+|https\S+", '', text)
#     # Remove emojis
#     text = emoji.replace_emoji(text, replace="")
    
#     return text.strip()

# # Create a preprocessed text column
# df_posts['preprocessed_text'] = df_posts['text'].apply(preprocess_text)

# # Apply NER extraction on the preprocessed text
# df_posts['entities'] = df_posts['preprocessed_text'].apply(extract_entities)

Check for rows with no text

In [86]:
# Display rows where 'text' is missing (NaN)
missing_text_rows = df_posts[df_posts['text'].isnull()]
missing_text_rows

Unnamed: 0,timestamp,text,text_id,user,user_id


### Convert variables

Move hashtags to a new column 'hashtags'

In [87]:
# Apply re.findall() to each row in the 'text' column to extract hashtags
df_posts['hashtags'] = df_posts['text'].apply(lambda x: re.findall(r'#\w+', x) if isinstance(x, str) else [])

Move mentions to a new column 'mentions'

In [88]:
# Extract mentions from the 'text' column, remove the '@' symbol, and create a new column 'mentions'
df_posts['mentions'] = df_posts['text'].apply(lambda x: [mention[1:] for mention in re.findall(r'@\w+', x)] if isinstance(x, str) else [])

Convert text to lowercase

In [89]:
df_posts['text'] = df_posts['text'].str.lower()

Remove Date from Timestamp

In [90]:
# All Dates are the same
unique_dates = df_posts['timestamp'].dt.date.unique()
unique_dates

array([datetime.date(2024, 10, 31)], dtype=object)

In [91]:
df_posts['timestamp'] = df_posts['timestamp'].dt.time
df_posts.head()

Unnamed: 0,timestamp,text,text_id,user,user_id,hashtags,mentions
0,00:00:00,running a business means juggling countless ad...,2018569761,danielwoodard,1077866112,"[#HRtech, #businessmanagement]",[]
1,00:00:00,liz truss is walking in the lingering shadow o...,2092717718,nelsonjacqueline,1089670430,[#politics],[]
2,00:00:00,the uk is bracing for war as government buildi...,2059143248,ihooper,1007478642,"[#Ukrainewashed, #WarPreparedness]",[]
3,00:00:00,marrying a second or third cousin once removed...,2008209828,wrightnicholas,1039258480,"[#FamilyTree, #GeneticFacts]",[]
4,00:00:00,it's truly disgraceful how the indian national...,2001239278,michael51,1021455936,[#RationChorCongress],[]


### Remove URLs, Mentions, and Special Characters

Leave apostrophes in here for better lemmatization performance

In [92]:
# Pre-compile regex patterns
emoji_pattern = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags
    u"\U00002700-\U000027BF"  # dingbats
    u"\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    u"\U00002600-\U000026FF"  # miscellaneous symbols
    u"\U00002B50-\U00002B55"  # stars
    "]+", flags=re.UNICODE)

url_pattern = re.compile(r'http\S+|www\S+|https\S+')  # Removes URLs
mention_pattern = re.compile(r'@\w+')  # Removes mentions
punctuation_pattern = re.compile(r"[^\w\s'’]")  # Removes punctuation but keeps apostrophes
number_pattern = re.compile(r'\d+')  # Removes numbers
whitespace_pattern = re.compile(r'\s+')  # Removes excessive whitespace
hashtag_pattern = re.compile(r'#\w+')  # Removes hashtags and all text after them

# Removes spaces between letters in a single word
letter_spacing_pattern = re.compile(r'(\b\w)(?:\s+)(\w\b)')

# List of terms to remove
remove_tw_terms = ["cc", "cx", "ct", "dm", "ht", "mt", "prt", "rt", "followback", "follow back", "fb", "retweet", "retweets"]

# Compile regex to match terms (case insensitive and whole word)
remove_terms_pattern = re.compile(r'\b(' + '|'.join(remove_tw_terms) + r')\b')

# Updated regex for matching spaced-out letters (e.g., "s h a r e")
letter_spacing_pattern = re.compile(r'(\b(?:\w\s)+\w\b)')

# Function to merge spaced-out letters
def merge_spaced_letters(match):
    # Remove spaces within the matched group
    return match.group(0).replace(' ', '')

def normalize_full_width(text):
    # Convert full-width characters to half-width
    return ''.join(
        chr(ord(char) - 0xFEE0) if 0xFF01 <= ord(char) <= 0xFF5E else char
        for char in text
    )

# Updated preprocessing function
def preprocess_text(text):
    if pd.isna(text):
        return "", []  # Handle missing values gracefully
    
    # Normalize full-width characters
    text = normalize_full_width(text)

    # Extract emojis
    emojis = emoji_pattern.findall(text)  # List of emojis

    # Remove hashtags and text following them
    text = hashtag_pattern.sub('', text)

    # Remove emojis, URLs, mentions, punctuation, and numbers
    text = emoji_pattern.sub('', text)  # Remove emojis
    text = url_pattern.sub('', text)  # Remove URLs
    text = mention_pattern.sub('', text)  # Remove mentions
    text = punctuation_pattern.sub('', text)  # Remove punctuation
    text = number_pattern.sub('', text)  # Remove numbers

    # Remove specific terms (CC, CX, CT, DM, etc.)
    text = remove_terms_pattern.sub('', text)

    # Normalize letter spacing (e.g., "s h a r e" -> "share")
    text = letter_spacing_pattern.sub(merge_spaced_letters, text)

    # Remove excessive whitespace and trim
    text = whitespace_pattern.sub(' ', text).strip()

    return text, emojis

# Apply preprocessing to create new columns
df_posts[['text', 'emojis']] = df_posts['text'].apply(lambda x: pd.Series(preprocess_text(x)))

# Display head to check the results
df_posts[['text', 'emojis']].head()


coroutine 'translate_all_tweets' was never awaited



Unnamed: 0,text,emojis
0,running a business means juggling countless ad...,[]
1,liz truss is walking in the lingering shadow o...,[]
2,the uk is bracing for war as government buildi...,[🇺🇦]
3,marrying a second or third cousin once removed...,[🧬]
4,it's truly disgraceful how the indian national...,"[🤦, ♂]"


## Check for duplicates

In [93]:
# Total duplicate rows
total_duplicate_rows = df_posts['text'].duplicated(keep=False).sum()

# Number of unique duplicate tweets
unique_duplicate_tweets = (df_posts['text'].value_counts() > 1).sum()

print(f"Total duplicate rows: {total_duplicate_rows}")
print(f"Unique duplicate tweets: {unique_duplicate_tweets}")

Total duplicate rows: 40239
Unique duplicate tweets: 16405


# Duplicate stats

In [94]:
# Get the frequency distribution of tweets
frequency_distribution = df_posts['text'].value_counts()

# Filter for only duplicates (frequency > 1)
duplicate_tweet_frequencies = frequency_distribution[frequency_distribution > 1]

# Summary statistics
print(duplicate_tweet_frequencies.describe())

# How many tweets are repeated more than 5 times?
highly_duplicated = (duplicate_tweet_frequencies > 5).sum()
print(f"Number of tweets repeated more than 5 times: {highly_duplicated}")

count    16405.000000
mean         2.452850
std          2.188017
min          2.000000
25%          2.000000
50%          2.000000
75%          3.000000
max        130.000000
Name: count, dtype: float64
Number of tweets repeated more than 5 times: 150


Remove duplicate tweets only if the same person posted the same tweet (spam) but extract frequency

In [95]:
# Step 1: Group by 'user' and 'text' and calculate the frequency of each combination
df_posts['frequency'] = df_posts.groupby(['user', 'text'])['text'].transform('count')

# Step 2: Drop duplicates based on 'user' and 'text' (keeping the first occurrence)
df_posts = df_posts.drop_duplicates(subset=['user', 'text'], keep='first').reset_index(drop=True)

In [96]:
df_posts[df_posts['user'] == 'reginabarnes']

Unnamed: 0,timestamp,text,text_id,user,user_id,hashtags,mentions,emojis,frequency
48,00:00:19,wwf no mercy usa variant nintendo n cart only ...,2053226516,reginabarnes,1068051022,[],[],[],86


Indentify language

In [97]:
# Initialize the Lingua language detector for all languages
detector = LanguageDetectorBuilder.from_all_languages().build()

# Function to detect language using Lingua
def detect_language_lingua(text):
    # Skip tweets with fewer than 3 words
    if len(text.split()) < 3:
        return None  # Skip these tweets entirely
    try:
        language = detector.detect_language_of(text)
        return language.iso_code_639_1 if language else None  # Return language code or None
    except Exception as e:
        return None  # Skip on exception

# Apply language detection to the 'text' column
df_posts['language'] = df_posts['text'].apply(detect_language_lingua)

# Convert language column to string representation (e.g., "EN" instead of IsoCode639_1.EN)
df_posts['language'] = df_posts['language'].apply(lambda lang: lang.name if lang is not None else None)

# Filter non-English tweets (ignoring None values)
non_english_tweets = df_posts[
    (df_posts['language'].notna()) & (df_posts['language'] != 'EN')
]

# Filter to keep English tweets AND tweets with None language
df_posts = df_posts[
    (df_posts['language'] == 'EN') | (df_posts['language'].isna())
].copy()

# Display diagnostic information
print(f"Total tweets after filtering: {len(df_posts)}")
print(f"English tweets: {len(df_posts[df_posts['language'] == 'EN'])}")
print(f"Tweets with None language: {df_posts['language'].isna().sum()}")
print(f"Number of non-English tweets: {len(non_english_tweets)}")

Total tweets after filtering: 67405
English tweets: 67127
Tweets with None language: 278
Number of non-English tweets: 1131


In [98]:
# Save the non-English tweets to a CSV file
non_english_tweets.to_csv('../output/non_english_tweets.csv', index=False)

In [99]:
print(df_posts['language'].unique())

['EN' None]


Count the number of appearances of each Language

In [100]:
language_counts = non_english_tweets['language'].value_counts()
print("Number of tweets per non-english language:")
language_counts

Number of tweets per non-english language:


language
PT    522
ES    139
TL    100
SW     40
NL     32
YO     23
TH     22
DE     21
LA     20
ID     20
CA     13
CY     12
DA     11
SO     11
NB     10
AF     10
FI     10
SQ      9
SV      9
IT      8
FR      8
ET      8
TS      7
NN      6
ST      6
EU      5
XH      5
EO      5
SN      4
TR      4
TN      4
AR      3
KO      3
RO      3
CS      3
HI      2
MS      2
SK      1
LT      1
ZU      1
BN      1
LG      1
TA      1
JA      1
SL      1
GA      1
MR      1
MI      1
Name: count, dtype: int64

Translate the tweets

In [102]:
import nest_asyncio
nest_asyncio.apply()

# Initialize translator
translator = Translator()

# Define languages to translate
languages_to_translate = ['PT', 'ES', 'TL']

# Filter tweets in the selected languages
translated_tweets = non_english_tweets[non_english_tweets['language'].isin(languages_to_translate)].copy()

# Function to translate text
async def translate_text(text):
    try:
        result = await translator.translate(text, dest='en')
        return result.text
    except Exception as e:
        return text  # Return original text if translation fails

# Async function to translate all tweets
async def translate_all_tweets():
    translated_tweets['text'] = await asyncio.gather(
        *[translate_text(text) for text in translated_tweets['text']]
    )
    
    # Save to CSV
    translated_tweets.to_csv('../output/translated_tweets.csv', index=False)
    print(f"Number of tweets translated: {len(translated_tweets)}")

# Get the current event loop
loop = asyncio.get_event_loop()

# Now you can use asyncio.run()
asyncio.run(translate_all_tweets())

Number of tweets translated: 761


In [103]:
translated_tweets

Unnamed: 0,timestamp,text,text_id,user,user_id,hashtags,mentions,emojis,frequency,language
8,00:00:00,sunshine sunshine sunshine sunshine sunshine s...,2055945627,andrea21,1080017323,"[#GoodVibes, #SummerFeels, #EndlessJoy]",[jonathanreynolds],"[☀, ☀, ☀, 💛]",1,TL
187,00:02:01,Always vote for the tag and let it decide the ...,2020582097,alvin24,1016376595,"[#VoteAriana, #BBMAs2024, #MusicLovers]","[kaitlinlee, randy35]","[😍🔥, 🎤✨]",1,PT
240,00:02:58,Sapang Dalaga MPPS PNP personnel led by Office...,2022877822,leah53,1036411239,"[#CommunityEngagement, #PNP, #SafetyFirst]",[],[],1,TL
252,00:03:10,"What's the news friends, special surprise is c...",2035146367,vriddle,1061691677,[#KabirSingh],"[uallison, wcarter, matthew45, wayne59]",[🎉],1,TL
256,00:03:16,although I can no longer vote on other platfor...,2075428124,amyweaver,1020527420,"[#VMAs, #BLACKPINK, #GirlPower]",[lauren57],"[💖, 🎉✨]",1,ES
...,...,...,...,...,...,...,...,...,...,...
68269,23:52:50,triple vibes i am voting for bts for top socia...,2016655749,kbrown,1029276895,"[#BBMAs, #BTSBBMAs]",[dsmith],"[✨, ❤]",1,PT
68273,23:52:56,thousand and jungkook will sing on your birthd...,2076418073,james39,1039700711,"[#AMAs, #BTS, #ArmyForever]",[jocelynjones],"[🎉, 💜]",1,PT
68387,23:56:04,If this tweet reaches Jungkook he will do a da...,2029973523,erikwhite,1087487919,"[#AMAs, #BTSAMAs]","[yschwartz, yhawkins]",[💜],1,PT
68451,23:57:38,"Voting is on, vote for the hashtag and listen ...",2005853368,kellygabriel,1053558936,"[#StrayKids, #MAMAs2023]","[brauncarrie, renee24]",[💖🔥],1,PT


Append translated tweets to english tweets dataframe

In [75]:
df_posts = pd.concat([df_posts, translated_tweets], ignore_index=True)

print(f"Final number of tweets: {len(df_posts)}")

Final number of tweets: 68166


Export for Topic Classification

In [76]:
df_posts.to_csv('../output/export_for_topic_classification.csv', index=False)

## Use spacy

Load en_core_web_sm for spacy

In [104]:
nlp = spacy.load("en_core_web_sm")

### Lemmatization

Use lemmatization since stemming can lead to less accurate results (even non-words)

In [105]:
# Info this takes about 4.5m!!
df_posts['text'] = df_posts['text'].apply(
        lambda text: ' '.join([token.lemma_ for token in nlp(text)])
)

### Create output for sentiment analysis

Keep stopwords

In [106]:
df_posts.to_csv('../output/preprocessed_for_SA.csv', index=False)

### Remove stopwords

In [107]:
df_posts['text'] = df_posts['text'].fillna("").apply(
    lambda text: ' '.join([token.text for token in nlp.make_doc(text) if not token.is_stop])
)

### Remove Apostrophes

In [108]:
df_posts['text'] = df_posts['text'].str.replace(r"[’']", "", regex=True)

### Check again for empty rows

In [109]:
# Display rows where 'text' is missing (NaN)
missing_text_rows = df_posts[df_posts['text'].isnull()]
missing_text_rows

Unnamed: 0,timestamp,text,text_id,user,user_id,hashtags,mentions,emojis,frequency,language


### Write the output to csv file

In [110]:
df_posts.to_csv('../output/preprocessed.csv', index=False)