# Preprocessing
1. Convert text to lowercase 
2. Remove URLs, mentions, and special characters (besides hashtags and emojis)
3. Remove stop words
4. Perform stemming/lemmatization
5. Tokenize text

In [5]:
import pandas as pd
import re
import spacy
nlp = spacy.load('en_core_web_lg')

In [6]:
# Load the JSON file
df_posts = pd.read_json('../dataset.json')

df_posts

Unnamed: 0,timestamp,text,text_id,user,user_id
0,2024-10-31 00:00:00,Running a business means juggling countless ad...,2018569761,danielwoodard,1077866112
1,2024-10-31 00:00:00,Liz Truss is walking in the lingering shadow o...,2092717718,nelsonjacqueline,1089670430
2,2024-10-31 00:00:00,The UK is bracing for war as government buildi...,2059143248,ihooper,1007478642
3,2024-10-31 00:00:00,Marrying a second or third cousin once removed...,2008209828,wrightnicholas,1039258480
4,2024-10-31 00:00:00,It's truly disgraceful how the Indian National...,2001239278,michael51,1021455936
...,...,...,...,...,...
70255,2024-10-31 23:59:52,"@bakerjulie: ""Saranghae, I’m your resident K-P...",2086649509,valdezjennifer,1094330726
70256,2024-10-31 23:59:52,Soaring to new heights with @sweeneyanthony! T...,2039889186,nashshaun,1015245531
70257,2024-10-31 23:59:54,"Hey @james20 @paul47, did you see the latest f...",2020468196,brownregina,1029384492
70258,2024-10-31 23:59:58,Check out the fundraiser exhibition by @joanna...,2037744299,dkey,1046050046


Check for rows with no text

In [7]:
# Display rows where 'text' is missing (NaN)
missing_text_rows = df_posts[df_posts['text'].isnull()]

# Display the DataFrame with missing text
print(missing_text_rows)

Empty DataFrame
Columns: [timestamp, text, text_id, user, user_id]
Index: []


## Remove Hashtags to preserve casings

In [8]:
# Apply re.findall() to each row in the 'text' column to extract hashtags
df_posts['hashtags'] = df_posts['text'].apply(lambda x: re.findall(r'#\w+', x) if isinstance(x, str) else [])

## Convert text to lowercase

In [9]:
df_posts['text'] = df_posts['text'].str.lower()
df_posts

Unnamed: 0,timestamp,text,text_id,user,user_id,hashtags
0,2024-10-31 00:00:00,running a business means juggling countless ad...,2018569761,danielwoodard,1077866112,"[#HRtech, #businessmanagement]"
1,2024-10-31 00:00:00,liz truss is walking in the lingering shadow o...,2092717718,nelsonjacqueline,1089670430,[#politics]
2,2024-10-31 00:00:00,the uk is bracing for war as government buildi...,2059143248,ihooper,1007478642,"[#Ukrainewashed, #WarPreparedness]"
3,2024-10-31 00:00:00,marrying a second or third cousin once removed...,2008209828,wrightnicholas,1039258480,"[#FamilyTree, #GeneticFacts]"
4,2024-10-31 00:00:00,it's truly disgraceful how the indian national...,2001239278,michael51,1021455936,[#RationChorCongress]
...,...,...,...,...,...,...
70255,2024-10-31 23:59:52,"@bakerjulie: ""saranghae, i’m your resident k-p...",2086649509,valdezjennifer,1094330726,"[#KPopArt, #StayCreative]"
70256,2024-10-31 23:59:52,soaring to new heights with @sweeneyanthony! t...,2039889186,nashshaun,1015245531,"[#caesarstonesa, #designinspiration, #modernin..."
70257,2024-10-31 23:59:54,"hey @james20 @paul47, did you see the latest f...",2020468196,brownregina,1029384492,"[#AXA, #truthprevails]"
70258,2024-10-31 23:59:58,check out the fundraiser exhibition by @joanna...,2037744299,dkey,1046050046,"[#artandmentalhealth, #hauserwirth, #hospitalr..."


## Remove URLs, Mentions, and Special Characters

In [10]:
# Pre-compile regex patterns
emoji_pattern = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags
    u"\U00002700-\U000027BF"  # dingbats
    u"\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    u"\U00002600-\U000026FF"  # miscellaneous symbols
    u"\U00002B50-\U00002B55"  # stars
    "]+", flags=re.UNICODE)

url_pattern = re.compile(r'http\S+|www\S+|https\S+')
mention_pattern = re.compile(r'@(\w+)')  # Removes "@" but keeps the mention
punctuation_pattern = re.compile(r'[^\w\s]')  # Removes punctuation
number_pattern = re.compile(r'\d+')  # Removes numbers
whitespace_pattern = re.compile(r'\s+')  # Removes excessive whitespace
hashtag_pattern = re.compile(r'#\w+.*')  # Removes hashtags and all text after them

# Optimized function
def preprocess_text(text):
    if pd.isna(text):
        return "", []  # Handle missing values gracefully

    # Extract emojis
    emojis = emoji_pattern.findall(text)  # List of emojis

    # Remove hashtags and text following them
    text = hashtag_pattern.sub('', text)

    # Remove emojis, URLs, mentions, punctuation, and numbers
    text = emoji_pattern.sub('', text)  # Remove emojis
    text = url_pattern.sub('', text)  # Remove URLs
    text = mention_pattern.sub(r'\1', text)  # Remove "@" but keep mention names
    text = punctuation_pattern.sub('', text)  # Remove punctuation
    text = number_pattern.sub('', text)  # Remove numbers

    # Remove excessive whitespace and trim
    text = whitespace_pattern.sub(' ', text).strip()

    return text, emojis

# Apply preprocessing to create new columns
df_posts[['text', 'emojis']] = df_posts['text'].apply(lambda x: pd.Series(preprocess_text(x)))

# Display a few rows to check the results
print(df_posts[['text', 'emojis']].head())

                                                text  emojis
0  running a business means juggling countless ad...      []
1  liz truss is walking in the lingering shadow o...      []
2  the uk is bracing for war as government buildi...    [🇺🇦]
3  marrying a second or third cousin once removed...     [🧬]
4  its truly disgraceful how the indian national ...  [🤦, ♂]


In [11]:
df_posts

Unnamed: 0,timestamp,text,text_id,user,user_id,hashtags,emojis
0,2024-10-31 00:00:00,running a business means juggling countless ad...,2018569761,danielwoodard,1077866112,"[#HRtech, #businessmanagement]",[]
1,2024-10-31 00:00:00,liz truss is walking in the lingering shadow o...,2092717718,nelsonjacqueline,1089670430,[#politics],[]
2,2024-10-31 00:00:00,the uk is bracing for war as government buildi...,2059143248,ihooper,1007478642,"[#Ukrainewashed, #WarPreparedness]",[🇺🇦]
3,2024-10-31 00:00:00,marrying a second or third cousin once removed...,2008209828,wrightnicholas,1039258480,"[#FamilyTree, #GeneticFacts]",[🧬]
4,2024-10-31 00:00:00,its truly disgraceful how the indian national ...,2001239278,michael51,1021455936,[#RationChorCongress],"[🤦, ♂]"
...,...,...,...,...,...,...,...
70255,2024-10-31 23:59:52,bakerjulie saranghae im your resident kpop lov...,2086649509,valdezjennifer,1094330726,"[#KPopArt, #StayCreative]","[🌸✨, 💖🎨]"
70256,2024-10-31 23:59:52,soaring to new heights with sweeneyanthony the...,2039889186,nashshaun,1015245531,"[#caesarstonesa, #designinspiration, #modernin...","[✨, 🚀]"
70257,2024-10-31 23:59:54,hey james paul did you see the latest from axa...,2020468196,brownregina,1029384492,"[#AXA, #truthprevails]",[]
70258,2024-10-31 23:59:58,check out the fundraiser exhibition by joanna ...,2037744299,dkey,1046050046,"[#artandmentalhealth, #hauserwirth, #hospitalr...",[]


## Remove stopwords

In [15]:
import spacy
import pandas as pd

# Load the spaCy language model
nlp = spacy.load("en_core_web_lg")

# Function to remove stopwords from a single row of text
def remove_stopwords_spacy(text):
    if pd.isna(text):  # Handle missing values gracefully
        return ""
    doc = nlp(text)  # Process the text using spaCy
    filtered_text = ' '.join([token.text for token in doc if not token.is_stop])
    return filtered_text

# Apply the function to the 'text' column of the DataFrame
df_posts['text'] = df_posts['text'].apply(remove_stopwords_spacy)

# Lemmatization

Use lemmatization since stemming can less to less accurate results (even non-words)