In [21]:
import json
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download NLTK resources (only needed once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/abigail/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abigail/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/abigail/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [22]:
# Data Extraction - Train and Test data

with open('emaildata.json', 'r') as file:
    emaildata_json = json.load(file)

# contains the contents of the JSON file as a Python dictionary
print(emaildata_json)

[{'Sender Address': 'team@today.getpocket.com', 'Sender Name': 'Pocket', 'Subject': 'How to Decompress Your Spine', 'Content': 'Gaslighting, AI Spokesmodels, and Jolene  Ten picks for today, curated by our editors. Plus, an eclipse reading list.     \n 1      Must Read  How Anti-Vaccine Activists and the Far Right Are Trying to Build a Parallel Economy NPR \nLisa Hagen  Save \n \n \n \n 2      Good Question  Why Is There so Much Lead in American Food? Vox \nDylan Scott  Save \n \n \n \n 3      Did a Cat Write This?  Cats Aren’t Jerks. They’re Just Misunderstood. The Washington Post \nColleen Grablick  Save \n \n \n \n 4      Weird History  He Used Plastic Surgery to Raise Rock Stars From the Dead Rolling Stone \nDavid Browne  Save \n \n \n \n 5      Pop Psychology  So You Think You’ve Been Gaslit The New Yorker \nLeslie Jamison  Save \n \n \n \n 6      Get Up to Speed  How an Automated Spokesmodel Drove the Internet Insane New York Magazine \nJohn Herrman  Save \n \n \n \n 7      Whole

In [23]:
# Data Pre-Processing - Part 1 (define function)

def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove email-specific patterns (email addresses, URLs, HTML tags, etc.)
    text = re.sub(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b', '', text)
    text = re.sub(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', '', text)
    text = re.sub(r'<[^>]*>', '', text)
    text = ' '.join([''.join([c for i, c in enumerate(word) if i < 2 or not (c == word[i - 1] == word[i - 2])]) for word in text.split()])

    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    
    # # Lemmatization
    # lemmatizer = WordNetLemmatizer()
    # tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Join tokens back into a single string
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

# Example usage
text = "Hello! This is a sample email with some punctuation, stopwords, and HTML <tags>. You can contact me at user@example.com"
preprocessed_text = preprocess_text(text)
print("Preprocessed Text:", preprocessed_text)


Preprocessed Text: hello sampl email punctuat stopword html contact


In [24]:
# Data Pre-Processing - Part 2 (apply to email data)

# apply to emaildata_json
preprocessed_emails = []
for email in emaildata_json:
    preprocessed_content = preprocess_text(email['Content'])  # Assuming the email content is stored in the 'Content' field
    
    # Create a new dictionary with preprocessed content and other fields
    preprocessed_email = {
        "Sender Address": email["Sender Address"],
        "Sender name": email["Sender Name"],
        "Subject": email["Subject"],
        "Content": preprocessed_content,
        "Category": email["Category"]
    }
    
    preprocessed_emails.append(preprocessed_email)


In [25]:
preprocessed_emails

[{'Sender Address': 'team@today.getpocket.com',
  'Sender name': 'Pocket',
  'Subject': 'How to Decompress Your Spine',
  'Content': 'gaslight ai spokesmodel jolen ten pick today curat editor plu eclips read list 1 must read antivaccin activist far right tri build parallel economi npr lisa hagen save 2 good question much lead american food vox dylan scott save 3 cat write cat ’ jerk ’ misunderstood washington post colleen grablick save 4 weird histori use plastic surgeri rais rock star dead roll stone david brown save 5 pop psycholog think ’ gaslit new yorker lesli jamison save 6 get speed autom spokesmodel drove internet insan new york magazin john herrman save 7 wholebodi health ’ dental health consid primari medic care knowabl magazin lola butcher save 8 beyond junk five legal altern use storag unit lifehack jeff somer save 9 zeitgeist romant rivalri behind dolli parton ’ ‘ jolen ’ mental floss kenneth partridg save 10 sweet relief decompress spine reduc back ach pain stylist chloe 

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


# Extract features (e.g., Subject and Content) and labels (Category)
X = [email['Subject'] + ' ' + email['Content'] for email in preprocessed_emails]
y = [email['Category'] for email in preprocessed_emails]

# Split the data into training and testing sets
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)


# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(train_X)
X_test_tfidf = vectorizer.transform(test_X)

# Train a Multinomial Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train_tfidf, train_y)

# Evaluate the model
y_pred = clf.predict(X_test_tfidf)
print(classification_report(test_y, y_pred))


              precision    recall  f1-score   support

   Marketing       1.00      0.17      0.29        12
    Personal       0.00      0.00      0.00         3
     Updates       0.62      1.00      0.76        21

    accuracy                           0.64        36
   macro avg       0.54      0.39      0.35        36
weighted avg       0.69      0.64      0.54        36



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [27]:
# tuning

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Extract features (e.g., Subject and Content) and labels (Category)
X = [email['Subject'] + ' ' + email['Content'] for email in preprocessed_emails]
y = [email['Category'] for email in preprocessed_emails]

# Split the data into training and testing sets
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(train_X)
X_test_tfidf = vectorizer.transform(test_X)

# Define the parameter grid to search
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 2.0],  # Values to try for the alpha parameter
    # Add more parameters to tune if needed
}

# Create the grid search
grid_search = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=param_grid,
    scoring='accuracy',  # You can use other scoring metrics if needed
    cv=5,  # 5-fold cross-validation
    n_jobs=-1  # Use all available CPU cores
)

# Perform the grid search
grid_search.fit(X_train_tfidf, train_y)

# Print the best parameters found
print("Best Parameters:", grid_search.best_params_)

# Evaluate the model on the test set using the best parameters
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_tfidf)
print(classification_report(test_y, y_pred))


Best Parameters: {'alpha': 0.1}
              precision    recall  f1-score   support

   Marketing       0.73      0.67      0.70        12
    Personal       0.00      0.00      0.00         3
     Updates       0.72      0.86      0.78        21

    accuracy                           0.72        36
   macro avg       0.48      0.51      0.49        36
weighted avg       0.66      0.72      0.69        36



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [28]:
# check for overfitting

from sklearn.metrics import accuracy_score

# Evaluate the best model on the training set
train_y_pred = best_model.predict(X_train_tfidf)
train_accuracy = accuracy_score(train_y, train_y_pred)

# Print the training accuracy
print("Training Accuracy:", train_accuracy)

# Evaluate the model on the test set using the best parameters
test_y_pred = best_model.predict(X_test_tfidf)
test_accuracy = accuracy_score(test_y, test_y_pred)

# Print the test accuracy
print("Test Accuracy:", test_accuracy)

# Print the classification report for the test set
print(classification_report(test_y, test_y_pred))


Training Accuracy: 0.9861111111111112
Test Accuracy: 0.7222222222222222
              precision    recall  f1-score   support

   Marketing       0.73      0.67      0.70        12
    Personal       0.00      0.00      0.00         3
     Updates       0.72      0.86      0.78        21

    accuracy                           0.72        36
   macro avg       0.48      0.51      0.49        36
weighted avg       0.66      0.72      0.69        36



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [29]:
# do it again with validation set

from sklearn.model_selection import train_test_split

# Split the data into training, validation, and test sets
train_X, temp_X, train_y, temp_y = train_test_split(X, y, test_size=0.2, random_state=42)
val_X, test_X, val_y, test_y = train_test_split(temp_X, temp_y, test_size=0.5, random_state=42)

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(train_X)
X_val_tfidf = vectorizer.transform(val_X)
X_test_tfidf = vectorizer.transform(test_X)

# Define the parameter grid to search
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 2.0],  # Values to try for the alpha parameter
    # Add more parameters to tune if needed
}

# Create the grid search
grid_search = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=param_grid,
    scoring='accuracy',  # You can use other scoring metrics if needed
    cv=5,  # 5-fold cross-validation
    n_jobs=-1  # Use all available CPU cores
)

# Perform the grid search using the training and validation sets
grid_search.fit(X_train_tfidf, train_y)

# Print the best parameters found
print("Best Parameters:", grid_search.best_params_)

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
test_y_pred = best_model.predict(X_test_tfidf)

# Print the classification report for the test set
print(classification_report(test_y, test_y_pred))


Best Parameters: {'alpha': 0.1}
              precision    recall  f1-score   support

   Marketing       0.80      0.80      0.80         5
    Personal       0.00      0.00      0.00         2
     Updates       0.77      0.91      0.83        11

    accuracy                           0.78        18
   macro avg       0.52      0.57      0.54        18
weighted avg       0.69      0.78      0.73        18



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
