# Text Preprocessing: Custom Stopwords

In [1]:
# Import packages
import nltk
from nltk.corpus import stopwords
from pprint import pprint
import yaml
import sys
import os
import re
import requests
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import stanza
from nltk.probability import FreqDist

In [3]:
# Show full cell contents when displaying frames (no column-width truncation)
pd.set_option('display.max_colwidth', None)

# Load the email dataset and print a summary of its columns, dtypes and null counts
main_data_path = '../main_data.csv'
enron_emails_df = pd.read_csv(main_data_path)
enron_emails_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 480062 entries, 0 to 480061
Data columns (total 11 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   Message-ID                 480062 non-null  object
 1   Date                       480062 non-null  object
 2   Time                       480062 non-null  int64 
 3   From                       480062 non-null  object
 4   To                         480062 non-null  object
 5   Subject                    480062 non-null  object
 6   X-cc                       480062 non-null  object
 7   X-bcc                      480062 non-null  object
 8   Content                    480062 non-null  object
 9   Job_Title                  480062 non-null  object
 10  Total_Sentence_Word_Count  480062 non-null  int64 
dtypes: int64(2), object(9)
memory usage: 40.3+ MB


In [4]:
# Make sure the NLTK stopwords corpus is available.
# It is downloaded only when missing, so the cell also runs on a fresh environment
# instead of raising LookupError, and does nothing extra on later runs.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# English stopword list (a plain Python list of lowercase strings)
stop_list = stopwords.words('english')
print(stop_list)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [None]:
# Tokenize each lowercased email body into word tokens
emails2 = [nltk.word_tokenize(email.lower()) for email in enron_emails_df['Content']]

# Keep only purely alphabetic (a-z) tokens; the pattern is compiled once and reused
alpha_only = re.compile(r'^[a-z]+$')
emails3 = [[w for w in email if alpha_only.search(w)] for email in emails2]

# Remove stopwords. Membership is tested against a set (constant-time lookup)
# rather than scanning the stop_list list once per token.
stop_set = set(stop_list)
emails4 = [[w for w in doc if w not in stop_set] for doc in emails3]

# Flatten the per-email token lists into one token stream
all_words = [word for email in emails4 for word in email]

# Count word frequencies over the whole corpus
fdist = FreqDist(all_words)

# Show the 50 most common words
print(fdist.most_common(50))

## Add Custom Stopwords

In [6]:
# Check the frequency of short words (length <= max_word_len)
# to spot candidates for the custom stopword list.
max_word_len = 4

short_words = [w for w in all_words if len(w) <= max_word_len]
fdist2 = FreqDist(short_words)
print(fdist2.most_common(30))

[('ect', 6045), ('pm', 3949), ('mark', 3276), ('new', 2993), ('cc', 2936), ('may', 2814), ('iso', 2759), ('ferc', 2751), ('said', 2533), ('time', 2340), ('e', 2285), ('mary', 2176), ('also', 2119), ('call', 1941), ('list', 1601), ('need', 1550), ('one', 1509), ('know', 1504), ('ees', 1504), ('get', 1446), ('data', 1366), ('rate', 1349), ('like', 1303), ('us', 1299), ('make', 1212), ('see', 1204), ('sent', 1198), ('http', 1192), ('send', 1186), ('last', 1149)]


In [7]:
# Check the frequency of a specific word.
# fdist already holds the count of every token in all_words, so a lookup
# avoids rescanning the whole token list with list.count().
# Counts recorded from earlier checks: mail 779, forwarded 1577, thank 666, http 1192.
word_to_check = 'http'  # edit to check a different word
print(fdist[word_to_check])

1192


In [None]:
# Assess whether a specific word is useful by inspecting its most frequent bigram partners.
# NOTE: all_words is the flattened token stream after the alphabetic and stopword filters,
# so a pair may span removed tokens or an email boundary.

from nltk.collocations import BigramCollocationFinder

# Word to inspect; edit this to check different collocation pairs
target_word = 'time'

# Step 1: Create a BigramCollocationFinder from the tokenized word list
bigram_finder = BigramCollocationFinder.from_words(all_words)
bigram_counts = bigram_finder.ngram_fd  # frequency of each (word1, word2) pair

# Step 2: Get all bigrams containing the target word (in either position)
bigrams_with_target = [bigram for bigram in bigram_counts if target_word in bigram]

# Step 3: Sort bigrams by frequency (most common first)
sorted_bigrams = sorted(bigrams_with_target, key=bigram_counts.get, reverse=True)

# Step 4: Print the top 10 most common bigrams containing the target word
print(sorted_bigrams[:10])

[('corp', 'time'), ('time', 'fri'), ('first', 'time'), ('time', 'sat'), ('real', 'time'), ('time', 'date'), ('describe', 'time'), ('next', 'time'), ('houston', 'time'), ('people', 'time')]


In [9]:
# Start from NLTK's English stopwords (as a set for fast membership tests)
stop_words = set(stopwords.words("english"))

# Additional custom stopwords for this corpus.
# Each word is listed once; the groupings below are for readability only.
custom_stopwords = {
    # Corpus-specific tokens (company terms, names, address fragments)
    "enron", "company", "corporate", "corp", "hou", "ect",
    "hainhouect", "haedickehouectect", "mary", "hain", "mark", "susan",
    # Email header / forwarding / attachment vocabulary
    "email", "mail", "mailing", "subject", "cc", "bcc", "fw", "fwd",
    "forwarded", "sent", "date", "attached", "attach", "list", "http",
    # Greetings, courtesies and sign-offs
    "dear", "hi", "hello", "please", "pls", "thanks", "thank",
    "sincerely", "sorry", "ps", "asap",
    # Generic filler words and modal verbs
    "would", "could", "might", "must", "may", "also", "much", "really",
    "thru", "call", "pm",
    # Single-letter tokens
    "e", "j", "q", "p",
}

# Merge the custom words into the NLTK stopword set
stop_words.update(custom_stopwords)