# Preprocessing
1. Convert text to lowercase 
2. Remove URLs, mentions, and special characters (besides hashtags and emojis)
3. Remove stop words
4. Tokenize text
5. Perform stemming or lemmatization

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

# Download stop words (only needed once)
#nltk.download('stopwords')

# Get English stop words list
stop_words = set(stopwords.words('english'))

In [2]:
# Load the JSON file
df_posts = pd.read_json('../dataset.json')

df_posts

Unnamed: 0,timestamp,text,text_id,user,user_id
0,2024-10-31 00:00:00,Running a business means juggling countless ad...,2018569761,danielwoodard,1077866112
1,2024-10-31 00:00:00,Liz Truss is walking in the lingering shadow o...,2092717718,nelsonjacqueline,1089670430
2,2024-10-31 00:00:00,The UK is bracing for war as government buildi...,2059143248,ihooper,1007478642
3,2024-10-31 00:00:00,Marrying a second or third cousin once removed...,2008209828,wrightnicholas,1039258480
4,2024-10-31 00:00:00,It's truly disgraceful how the Indian National...,2001239278,michael51,1021455936
...,...,...,...,...,...
70255,2024-10-31 23:59:52,"@bakerjulie: ""Saranghae, I’m your resident K-P...",2086649509,valdezjennifer,1094330726
70256,2024-10-31 23:59:52,Soaring to new heights with @sweeneyanthony! T...,2039889186,nashshaun,1015245531
70257,2024-10-31 23:59:54,"Hey @james20 @paul47, did you see the latest f...",2020468196,brownregina,1029384492
70258,2024-10-31 23:59:58,Check out the fundraiser exhibition by @joanna...,2037744299,dkey,1046050046


Check for rows with no text

In [3]:
# Display rows where 'text' is missing (NaN)
missing_text_rows = df_posts[df_posts['text'].isnull()]

# Display the DataFrame with missing text
print(missing_text_rows)

Empty DataFrame
Columns: [timestamp, text, text_id, user, user_id]
Index: []


## Convert text to lowercase

In [4]:
df_posts['text'] = df_posts['text'].str.lower()
df_posts

Unnamed: 0,timestamp,text,text_id,user,user_id
0,2024-10-31 00:00:00,running a business means juggling countless ad...,2018569761,danielwoodard,1077866112
1,2024-10-31 00:00:00,liz truss is walking in the lingering shadow o...,2092717718,nelsonjacqueline,1089670430
2,2024-10-31 00:00:00,the uk is bracing for war as government buildi...,2059143248,ihooper,1007478642
3,2024-10-31 00:00:00,marrying a second or third cousin once removed...,2008209828,wrightnicholas,1039258480
4,2024-10-31 00:00:00,it's truly disgraceful how the indian national...,2001239278,michael51,1021455936
...,...,...,...,...,...
70255,2024-10-31 23:59:52,"@bakerjulie: ""saranghae, i’m your resident k-p...",2086649509,valdezjennifer,1094330726
70256,2024-10-31 23:59:52,soaring to new heights with @sweeneyanthony! t...,2039889186,nashshaun,1015245531
70257,2024-10-31 23:59:54,"hey @james20 @paul47, did you see the latest f...",2020468196,brownregina,1029384492
70258,2024-10-31 23:59:58,check out the fundraiser exhibition by @joanna...,2037744299,dkey,1046050046


## Remove URLs, Mentions, and Special Characters

In [5]:
# Function to extract hashtags and emojis, clean text
def preprocess_text(text):
    if pd.isna(text):
        return "", [], []  # Handle missing values gracefully

    # Extract hashtags
    hashtags = re.findall(r'#\w+', text)
    
    # Extract emojis
    emojis = re.findall(r'[^\w\s,]', text)  # This captures emojis by keeping non-word, non-space symbols

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    
    # Remove special characters (but not emojis)
    text = re.sub(r'[^a-zA-Z\s#]', '', text)  # Keeps hashtags, letters, and spaces, removes other special characters
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text, hashtags, emojis

# Apply preprocessing to overwrite 'text' and create 'hashtags' and 'emojis' columns
df_posts[['text', 'hashtags', 'emojis']] = df_posts['text'].apply(lambda x: pd.Series(preprocess_text(x)))

# Display a few rows to check the results
print(df_posts[['text', 'hashtags', 'emojis']].head())

                                                text  \
0  running a business means juggling countless ad...   
1  liz truss is walking in the lingering shadow o...   
2  the uk is bracing for war as government buildi...   
3  marrying a second or third cousin once removed...   
4  its truly disgraceful how the indian national ...   

                             hashtags                                   emojis  
0      [#hrtech, #businessmanagement]  [., :, /, /, ., /, :, /, /, ., /, #, #]  
1                         [#politics]                                [., ., #]  
2  [#ukrainewashed, #warpreparedness]                       [., 🇺, 🇦, #, ., #]  
3        [#familytree, #geneticfacts]                       [', !, ., 🧬, #, #]  
4               [#rationchorcongress]              [', ., #, 🤦, ‍, ♂, ️, ', !]  


Bug: emoji columns doesn't display emojis that good yet

## Remove stopwords

In [6]:
# Function to remove stop words
def remove_stop_words(text):
    words = text.split()  # Split text into individual words
    filtered_words = [word for word in words if word not in stop_words]  # Keep only non-stop words
    return ' '.join(filtered_words)  # Join words back into a cleaned sentence

# Apply stop word removal to the 'text' column
df_posts['text'] = df_posts['text'].apply(remove_stop_words)

# Display a few rows to check the results
print(df_posts[['text']].head())

                                                text
0  running business means juggling countless admi...
1  liz truss walking lingering shadow predecessor...
2  uk bracing war government buildings london rai...
3  marrying second third cousin removed isnt tabo...
4  truly disgraceful indian national congress sto...
