# Preprocessing
1. Convert text to lowercase 
2. Remove URLs, mentions, and special characters (besides hashtags and emojis)
3. Remove stop words
4. Perform stemming/lemmatization

Download the spaCy model
Can be installed via `python -m spacy download en_core_web_sm`

In [44]:
import pandas as pd
import re
import spacy

In [45]:
# Load the JSON file
df_posts = pd.read_json('../dataset.json')
df_posts.head()

Unnamed: 0,timestamp,text,text_id,user,user_id
0,2024-10-31,Running a business means juggling countless ad...,2018569761,danielwoodard,1077866112
1,2024-10-31,Liz Truss is walking in the lingering shadow o...,2092717718,nelsonjacqueline,1089670430
2,2024-10-31,The UK is bracing for war as government buildi...,2059143248,ihooper,1007478642
3,2024-10-31,Marrying a second or third cousin once removed...,2008209828,wrightnicholas,1039258480
4,2024-10-31,It's truly disgraceful how the Indian National...,2001239278,michael51,1021455936


In [46]:
#nlp = spacy.load("en_core_web_trf")
# def extract_entities(text):
#     """
#     Extracts named entities from text using SpaCy's NER model.
    
#     Args:
#     text (str): The text from which to extract named entities.

#     Returns:
#     list: A list of tuples where each tuple contains (entity_text, entity_label).
#     """
#     if not text or pd.isna(text):
#         return []  # Return an empty list if text is missing
    
#     # Process text with SpaCy
#     doc = nlp(text)
    
#     # Extract entity text and labels
#     entities = [(ent.text, ent.label_) for ent in doc.ents]
    
#     return entities

# df_posts['entities'] = df_posts['text'].apply(extract_entities)

Check for rows with no text

In [47]:
# Display rows where 'text' is missing (NaN)
missing_text_rows = df_posts[df_posts['text'].isnull()]
missing_text_rows

Unnamed: 0,timestamp,text,text_id,user,user_id


### Convert variables

Remove Hashtags to preserve casings

In [48]:
# Apply re.findall() to each row in the 'text' column to extract hashtags
df_posts['hashtags'] = df_posts['text'].apply(lambda x: re.findall(r'#\w+', x) if isinstance(x, str) else [])

Convert text to lowercase

In [49]:
df_posts['text'] = df_posts['text'].str.lower()

Remove Date from Timestamp

In [50]:
# All Dates are the same
unique_dates = df_posts['timestamp'].dt.date.unique()
unique_dates

array([datetime.date(2024, 10, 31)], dtype=object)

In [51]:
df_posts['timestamp'] = df_posts['timestamp'].dt.time
df_posts.head()

Unnamed: 0,timestamp,text,text_id,user,user_id,hashtags
0,00:00:00,running a business means juggling countless ad...,2018569761,danielwoodard,1077866112,"[#HRtech, #businessmanagement]"
1,00:00:00,liz truss is walking in the lingering shadow o...,2092717718,nelsonjacqueline,1089670430,[#politics]
2,00:00:00,the uk is bracing for war as government buildi...,2059143248,ihooper,1007478642,"[#Ukrainewashed, #WarPreparedness]"
3,00:00:00,marrying a second or third cousin once removed...,2008209828,wrightnicholas,1039258480,"[#FamilyTree, #GeneticFacts]"
4,00:00:00,it's truly disgraceful how the indian national...,2001239278,michael51,1021455936,[#RationChorCongress]


### Remove URLs, Mentions, and Special Characters

In [52]:
# Pre-compile regex patterns
emoji_pattern = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags
    u"\U00002700-\U000027BF"  # dingbats
    u"\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    u"\U00002600-\U000026FF"  # miscellaneous symbols
    u"\U00002B50-\U00002B55"  # stars
    "]+", flags=re.UNICODE)

url_pattern = re.compile(r'http\S+|www\S+|https\S+')
mention_pattern = re.compile(r'@(\w+)')  # Removes "@" but keeps the mention
punctuation_pattern = re.compile(r'[^\w\s]')  # Removes punctuation
number_pattern = re.compile(r'\d+')  # Removes numbers
whitespace_pattern = re.compile(r'\s+')  # Removes excessive whitespace
hashtag_pattern = re.compile(r'#\w+.*')  # Removes hashtags and all text after them

# Optimized function
def preprocess_text(text):
    if pd.isna(text):
        return "", []  # Handle missing values gracefully

    # Extract emojis
    emojis = emoji_pattern.findall(text)  # List of emojis

    # Remove hashtags and text following them
    text = hashtag_pattern.sub('', text)

    # Remove emojis, URLs, mentions, punctuation, and numbers
    text = emoji_pattern.sub('', text)  # Remove emojis
    text = url_pattern.sub('', text)  # Remove URLs
    text = mention_pattern.sub(r'\1', text)  # Remove "@" but keep mention names
    text = punctuation_pattern.sub('', text)  # Remove punctuation
    text = number_pattern.sub('', text)  # Remove numbers

    # Remove excessive whitespace and trim
    text = whitespace_pattern.sub(' ', text).strip()

    return text, emojis

# Apply preprocessing to create new columns
df_posts[['text', 'emojis']] = df_posts['text'].apply(lambda x: pd.Series(preprocess_text(x)))

# Display head to check the results
df_posts[['text', 'emojis']].head()

Unnamed: 0,text,emojis
0,running a business means juggling countless ad...,[]
1,liz truss is walking in the lingering shadow o...,[]
2,the uk is bracing for war as government buildi...,[🇺🇦]
3,marrying a second or third cousin once removed...,[🧬]
4,its truly disgraceful how the indian national ...,"[🤦, ♂]"


## Remove stopwords

In [53]:
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"]) # disable ner and parser as these are not needed for stop word removal and lemmatizer

In [54]:
def remove_stopwords_fast(text):
    if pd.isna(text):
        return ""
    doc = nlp.make_doc(text)  # Only tokenization
    filtered_text = ' '.join([token.text for token in doc if not token.is_stop])
    return filtered_text

# Apply the function to the 'text' column of the DataFrame
df_posts['text'] = df_posts['text'].apply(remove_stopwords_fast)

# Lemmatization

Use lemmatization since stemming can lead to less accurate results (even non-words)

In [55]:
def lemmatize_text_spacy(text):
    if pd.isna(text):  # Handle missing values
        return ""
    doc = nlp(text)  # Process the text using spaCy
    lemmatized_text = ' '.join([token.lemma_ for token in doc if not token.is_stop])  # Lemmatize each word
    return lemmatized_text

# Apply lemmatization to the 'text' column
df_posts['text'] = df_posts['text'].apply(lemmatize_text_spacy)

KeyboardInterrupt: 

### Write the output to csv file

In [56]:
df_posts.to_csv('../output/preprocessed.csv', index=False)