# Preprocessing
1. Convert text to lowercase 
2. Remove URLs, mentions, numbers, and special characters (hashtags, mentions, and emojis are first extracted into their own columns, then removed from the text)
3. Remove stop words
4. Perform stemming/lemmatization

Download the spaCy model
Can be installed via `python -m spacy download en_core_web_sm`

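Download the pre-trained fastText language identification model (`lid.176.bin`). The notebook loads it by the relative path `lid.176.bin`, so place it in the notebook's working directory: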

Can be installed via wget: `wget -P /path/to/your/directory https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin`

or via curl: `curl -O https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin`

In [1]:
import pandas as pd
import re
import spacy
import fasttext

In [2]:
# Load the posts from the JSON dataset (path is relative to the notebook's working directory)
df_posts = pd.read_json('../data/dataset.json')
# Preview the first rows to check the available columns
df_posts.head()

Unnamed: 0,timestamp,text,text_id,user,user_id
0,2024-10-31,Running a business means juggling countless ad...,2018569761,danielwoodard,1077866112
1,2024-10-31,Liz Truss is walking in the lingering shadow o...,2092717718,nelsonjacqueline,1089670430
2,2024-10-31,The UK is bracing for war as government buildi...,2059143248,ihooper,1007478642
3,2024-10-31,Marrying a second or third cousin once removed...,2008209828,wrightnicholas,1039258480
4,2024-10-31,It's truly disgraceful how the Indian National...,2001239278,michael51,1021455936


In [16]:
# Info this can take over 30m!!
# spacy.prefer_gpu()  # Prefers GPU but doesn't crash if unavailable
# nlp = spacy.load("en_core_web_trf")
# def extract_entities(text):
#     """
#     Extracts named entities from text using SpaCy's NER model.

#     Args:
#     text (str): The text from which to extract named entities.

#     Returns:
#     list: A list of tuples where each tuple contains (entity_text, entity_label).
#     """
#     if not text or pd.isna(text):
#         return []  # Return an empty list if text is missing

#     # Process text with SpaCy
#     doc = nlp(text)

#     # Extract entity text and labels
#     entities = [(ent.text, ent.label_) for ent in doc.ents]

#     return entities

# def preprocess_text(text):
#     """
#     Preprocesses text by removing URLs and emojis while keeping mentions and hashtags intact.

#     Args:
#     text (str): The original text.

#     Returns:
#     str: Preprocessed text.
#     """
#     if not text or pd.isna(text):
#         return ""  # Return empty string if text is missing

#     # Remove URLs
#     text = re.sub(r"http\S+|www\S+|https\S+", '', text)
#     # Remove emojis
#     text = emoji.replace_emoji(text, replace="")
    
#     return text.strip()

# # Create a preprocessed text column
# df_posts['preprocessed_text'] = df_posts['text'].apply(preprocess_text)

# # Apply NER extraction on the preprocessed text
# df_posts['entities'] = df_posts['preprocessed_text'].apply(extract_entities)

Check for rows with no text

In [3]:
# Select every row whose 'text' value is null so missing posts can be inspected
missing_text_rows = df_posts.loc[df_posts['text'].isna()]
missing_text_rows

Unnamed: 0,timestamp,text,text_id,user,user_id


### Convert variables

Move hashtags to a new column 'hashtags'

In [21]:
# Collect all '#tag' tokens of each post into a list; rows that are not strings get an empty list
df_posts['hashtags'] = df_posts['text'].apply(
    lambda post: [] if not isinstance(post, str) else re.findall(r'#\w+', post)
)

Move mentions to a new column 'mentions'

In [22]:
# Collect all '@user' tokens of each post (leading '@' dropped) into a new 'mentions' column;
# rows that are not strings get an empty list
df_posts['mentions'] = df_posts['text'].apply(
    lambda post: [] if not isinstance(post, str)
    else [handle[1:] for handle in re.findall(r'@\w+', post)]
)

Convert text to lowercase

In [23]:
# Lowercase the text in place. The 'hashtags' and 'mentions' columns are extracted
# in the cells above, so they keep their original casing.
df_posts['text'] = df_posts['text'].str.lower()

Remove Date from Timestamp

In [None]:
# List the distinct calendar dates in the timestamps (all posts share the same date)
unique_dates = pd.unique(df_posts['timestamp'].dt.date)
unique_dates

In [None]:
# Replace the full timestamp with its time-of-day component.
# NOTE(review): this overwrites the column in place, so the cell is presumably not
# re-runnable (the .dt accessor needs datetime-like values) — reload df_posts before re-running.
df_posts['timestamp'] = df_posts['timestamp'].dt.time
df_posts.head()

### Remove URLs, Mentions, and Special Characters

Leave apostrophes in here for better lemmatization performance

In [None]:
import re
import pandas as pd

# Pre-compiled regex patterns shared by preprocess_text().
# The emoji character class is followed by "+", so a run of adjacent emojis is
# matched as ONE string rather than one match per emoji.
emoji_pattern = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags
    u"\U00002700-\U000027BF"  # dingbats
    u"\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    u"\U00002600-\U000026FF"  # miscellaneous symbols
    u"\U00002B50-\U00002B55"  # stars
    "]+", flags=re.UNICODE)

# URLs: `http\S+` already covers https. A bare `www` only counts as a URL when it
# starts a word and is followed by a dot, so words such as "awwww" are left alone.
url_pattern = re.compile(r'http\S+|\bwww\.\S+')
mention_pattern = re.compile(r'@\w+')  # Removes mentions
punctuation_pattern = re.compile(r"[^\w\s'’]")  # Removes punctuation but keeps apostrophes
number_pattern = re.compile(r'\d+')  # Removes numbers
whitespace_pattern = re.compile(r'\s+')  # Matches runs of whitespace (collapsed to one space)
hashtag_pattern = re.compile(r'#\w+')  # Matches the hashtag token only; text after it is untouched

# List of terms to remove
remove_tw_terms = ["cc", "cx", "ct", "dm", "ht", "mt", "prt", "rt", "followback", "follow back", "fb", "retweet", "retweets"]

# Match the terms as whole words, case-insensitively. Terms are escaped so that any
# regex metacharacter added to the list later is treated literally.
remove_terms_pattern = re.compile(
    r'\b(' + '|'.join(re.escape(term) for term in remove_tw_terms) + r')\b',
    flags=re.IGNORECASE,
)

# Text cleaning entry point used on every post
def preprocess_text(text):
    """Clean one post and return ``(cleaned_text, emojis)``.

    Missing values (``pd.isna``) yield ``("", [])``. Otherwise the emoji matches
    are collected first (adjacent emojis come back as one string because the
    emoji pattern matches runs), and then hashtags, emojis, URLs, mentions,
    punctuation, digits and the Twitter-specific terms are deleted in that
    order. Finally whitespace runs are collapsed to a single space and the
    result is stripped.
    """
    if pd.isna(text):
        return "", []

    found_emojis = emoji_pattern.findall(text)

    # Order matters: e.g. punctuation is stripped only after URLs and mentions
    # have been removed as whole tokens.
    removal_order = (
        hashtag_pattern,
        emoji_pattern,
        url_pattern,
        mention_pattern,
        punctuation_pattern,
        number_pattern,
        remove_terms_pattern,
    )
    cleaned = text
    for pattern in removal_order:
        cleaned = pattern.sub('', cleaned)

    cleaned = whitespace_pattern.sub(' ', cleaned).strip()
    return cleaned, found_emojis

# Run preprocess_text once per post and split the (text, emojis) pairs into two columns.
# Collecting the pairs in a plain list avoids building a pd.Series object for every
# row, which is what made the previous apply(...pd.Series(...)) form slow.
processed = [preprocess_text(raw_text) for raw_text in df_posts['text']]
df_posts['emojis'] = pd.Series([found for _, found in processed], index=df_posts.index, dtype=object)
df_posts['text'] = pd.Series([cleaned for cleaned, _ in processed], index=df_posts.index, dtype=object)

# Display head to check the results
df_posts[['text', 'emojis']].head()

Test output

In [27]:
# Write a test snapshot of the current frame to CSV (row index omitted)
df_posts.to_csv('../output/testing.csv', index=False)

## Check for duplicates

In [None]:
# Rows whose text also occurs in at least one other row (every copy is counted)
total_duplicate_rows = df_posts.duplicated(subset='text', keep=False).sum()

# Distinct texts that occur more than once
unique_duplicate_tweets = df_posts['text'].value_counts().gt(1).sum()

print(f"Total duplicate rows: {total_duplicate_rows}")
print(f"Unique duplicate tweets: {unique_duplicate_tweets}")

### Duplicate stats

In [None]:
# Occurrence count of every distinct text
frequency_distribution = df_posts['text'].value_counts()

# Restrict to texts that appear more than once
duplicate_tweet_frequencies = frequency_distribution.loc[frequency_distribution.gt(1)]

# Summary statistics of the repeat counts
print(duplicate_tweet_frequencies.describe())

# Number of distinct texts repeated more than 5 times
highly_duplicated = duplicate_tweet_frequencies.gt(5).sum()
print(f"Number of tweets repeated more than 5 times: {highly_duplicated}")

Remove duplicate tweets, but first record how often each text occurs in a new `frequency` column

In [None]:
# Record how many posts share each (preprocessed) text before collapsing duplicates.
# The guard keeps the cell re-runnable: once duplicates are dropped every text is
# unique, so recomputing the counts would overwrite every frequency with 1.
if 'frequency' not in df_posts.columns:
    df_posts['frequency'] = df_posts['text'].map(df_posts['text'].value_counts())

# Keep the first occurrence of each text and renumber the rows
df_posts = df_posts.drop_duplicates(subset='text', keep='first').reset_index(drop=True)

In [None]:
# Show the first ten texts together with their occurrence counts
print(df_posts.loc[:, ['text', 'frequency']].head(10))

In [None]:
# Show the rows posted by the user 'reginabarnes'
df_posts.loc[df_posts['user'].eq('reginabarnes')]

In [None]:
# Select the rows posted by 'reginabarnes' and save them to CSV (row index omitted)
rb_twets = df_posts.loc[df_posts['user'].eq('reginabarnes')]
rb_twets.to_csv('../output/rb_tweets.csv', index=False)

Identify language

In [None]:
# Load the pre-trained language identification model.
# 'lid.176.bin' is a relative path, so the file must be in the current working directory.
model = fasttext.load_model('lid.176.bin')

# Function to detect language using FastText
def detect_language_fasttext(text, threshold=0.9):
    """Return the fastText language code for ``text``, or ``"unknown"``.

    Args:
        text: Text to classify. Missing or whitespace-only values are not
            sent to the model.
        threshold: Minimum probability the top prediction must reach. A less
            confident prediction is reported as ``"unknown"``.

    Returns:
        str: The predicted label with the ``"__label__"`` prefix removed, or
        ``"unknown"`` when the text is empty/missing, the prediction is below
        ``threshold``, or the model call fails.
    """
    import warnings

    if pd.isna(text) or not text.strip():
        return "unknown"  # Handle empty or missing text

    # fastText's predict() handles one line at a time and raises on embedded
    # newlines, so collapse all whitespace to single spaces first.
    single_line = " ".join(text.split())

    try:
        labels, probabilities = model.predict(single_line)
    except Exception as exc:
        # Stay best-effort, but surface the failure instead of hiding it: a
        # systematic problem would otherwise label every row "unknown" silently.
        warnings.warn(f"Language detection failed: {exc!r}", RuntimeWarning)
        return "unknown"

    if len(labels) == 0 or probabilities[0] < threshold:
        return "unknown"
    return labels[0].replace("__label__", "")

# Tag every post with its detected language code
df_posts['language'] = df_posts['text'].apply(detect_language_fasttext)

# Keep everything not labelled 'en' (this includes posts labelled "unknown")
non_english_tweets = df_posts.loc[df_posts['language'].ne('en')]

# Report how many such posts there are and print their text and language
print(f"Number of non-English tweets: {len(non_english_tweets)}")
print(non_english_tweets[['text', 'language']])

# Save the non-English posts to CSV (row index omitted)
non_english_tweets.to_csv('../output/non_english_tweets.csv', index=False)

## Use spacy

Load en_core_web_sm for spacy

In [None]:
# Load the spaCy pipeline "en_core_web_sm"; it is used below for lemmatization and stop-word removal
nlp = spacy.load("en_core_web_sm")

### Lemmatization

Use lemmatization since stemming can lead to less accurate results (even non-words)

In [30]:
# Info: the original row-by-row version of this cell took about 2.5 minutes.
# Replace every token by its lemma and re-join the lemmas with single spaces.
# nlp.pipe processes the texts in batches instead of calling nlp() once per row.
df_posts['text'] = [
    ' '.join(token.lemma_ for token in doc)
    for doc in nlp.pipe(df_posts['text'])
]

### Create output for sentiment analysis

Keep stopwords

In [31]:
# Save the lemmatized text with stop words still present, for sentiment analysis (row index omitted)
df_posts.to_csv('../output/preprocessed_for_SA.csv', index=False)

### Remove stopwords

In [32]:
# Drop stop words: tokenize each text with nlp.make_doc (rather than the full nlp(...) call),
# keep the tokens not flagged as stop words, and re-join them with single spaces.
# Missing texts are treated as empty strings.
df_posts['text'] = df_posts['text'].fillna("").apply(
    lambda sentence: ' '.join(
        tok.text for tok in nlp.make_doc(sentence) if not tok.is_stop
    )
)

### Remove Apostrophes

In [33]:
# Strip straight and curly apostrophes. They were kept through the earlier cleaning
# step for better lemmatization and are no longer needed at this point.
df_posts['text'] = df_posts['text'].str.replace(r"[’']", "", regex=True)

### Check again for empty rows

In [None]:
# Display rows whose 'text' is missing (NaN) or empty / whitespace-only.
# Missing values were replaced by "" in the stop-word step (fillna("")), so a null
# check alone can no longer find rows that ended up without any text.
missing_text_rows = df_posts[df_posts['text'].fillna('').str.strip().eq('')]
missing_text_rows

### Write the output to csv file

In [35]:
# Write the fully preprocessed frame to CSV (row index omitted)
df_posts.to_csv('../output/preprocessed.csv', index=False)

In [10]:
# Show the rows whose 'frequency' value is greater than 7
df_posts.loc[df_posts['frequency'].gt(7)]

Unnamed: 0,timestamp,text,text_id,user,user_id,frequency
261,2024-10-31 00:03:26,Check out this article for a rapid technique t...,2042924876,wadekaren,1059374550,8
4715,2024-10-31 01:20:49,Regain your health and well-being by shedding ...,2086330600,annflores,1031522730,8
4825,2024-10-31 01:23:13,Unlock the secrets to shedding fat successfull...,2097208346,annflores,1031522730,8
4961,2024-10-31 01:25:26,Discover the secrets of successful home busine...,2063586212,annflores,1031522730,8
5021,2024-10-31 01:26:32,Ready to take your fitness to the next level? ...,2082803004,annflores,1031522730,8
5083,2024-10-31 01:27:40,Struggling to get rid of cellulite and get bac...,2028891694,annflores,1031522730,8
8029,2024-10-31 02:22:05,Don't put your health at risk by ignoring unwa...,2073970213,annflores,1031522730,8
8030,2024-10-31 02:22:05,Revitalize your work-at-home business with thi...,2065031382,annflores,1031522730,8
8407,2024-10-31 02:28:59,Looking to shed some extra pounds and improve ...,2083022452,annflores,1031522730,8
9802,2024-10-31 02:55:20,Don't put your health at risk by storing exces...,2084074601,annflores,1031522730,8


In [12]:
# Print the 'text' values of all rows posted by 'reginabarnes'
print(df_posts.loc[df_posts['user'].eq('reginabarnes'), 'text'])

47       WWF No Mercy USA-1 VARIANT Nintendo 64 N64 Car...
653      WWF No Mercy USA-1 VARIANT Nintendo 64 N64 Car...
1277     WWF No Mercy USA-1 VARIANT Nintendo 64 N64 Car...
1855     WWF No Mercy USA-1 VARIANT Nintendo 64 N64 Car...
6266     WWF No Mercy USA-1 VARIANT Nintendo 64 N64 Car...
                               ...                        
46532    WWF No Mercy USA-1 VARIANT Nintendo 64 N64 Car...
46729    WWF No Mercy USA-1 VARIANT Nintendo 64 N64 Car...
46949    WWF No Mercy USA-1 VARIANT Nintendo 64 N64 Car...
47172    WWF No Mercy USA-1 VARIANT Nintendo 64 N64 Car...
47375    WWF No Mercy USA-1 VARIANT Nintendo 64 N64 Car...
Name: text, Length: 86, dtype: object
