# NER

In [4]:
# Load Libraries
import pandas as pd
import nltk
import networkx as nx
import matplotlib.pyplot as plt
from collections import defaultdict
from nltk.tree import Tree
import spacy
import re

pd.set_option('display.max_rows', None)   # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)  # Adjust the width to display full table
pd.set_option('display.max_colwidth', None)  # Adjust column width to avoid truncation


In [7]:
df = pd.read_csv("filter_50krows.csv")


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   From                       50000 non-null  object
 1   To                         50000 non-null  object
 2   Subject                    50000 non-null  object
 3   X-cc                       50000 non-null  object
 4   X-bcc                      50000 non-null  object
 5   Job_Title                  50000 non-null  object
 6   Total_Sentence_Word_Count  50000 non-null  int64 
 7   From_Names                 49962 non-null  object
 8   To_Names                   49263 non-null  object
 9   Cleaned_Content            50000 non-null  object
 10  BoW                        50000 non-null  object
 11  DateTime                   50000 non-null  object
dtypes: int64(1), object(11)
memory usage: 4.6+ MB


### NER Function

In [12]:
import pandas as pd
from tqdm.notebook import tqdm
import spacy
import time

# Enable tqdm for pandas
tqdm.pandas()

# Time the loading of the spaCy model
print("Loading spaCy model...")
start_time = time.time()
nlp = spacy.load("en_core_web_lg", disable=["tagger", "parser", "lemmatizer"])
model_load_time = time.time() - start_time
print(f"Model loaded in {model_load_time:.2f} seconds")

# Function to extract named entities for a single text
def extract_named_entities_spacy(text):
    """
    Extracts named entities from a single text using spaCy's pre-trained model.
    
    Args:
        text (str): The text to process.
    
    Returns:
        list: A list of (entity, label) tuples for the text.
    """
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents if ent.text.strip()]
    return entities


# Apply NER extraction with timing and progress_apply
start_time = time.time()
print("Applying NER to DataFrame...")
df['NER_Entities'] = df['Cleaned_Content'].progress_apply(extract_named_entities_spacy)
total_time = time.time() - start_time
print(f"Total application time: {total_time:.2f} seconds")



Loading spaCy model...
Model loaded in 3.34 seconds
Applying NER to DataFrame...


  0%|          | 0/50000 [00:00<?, ?it/s]

Total application time: 1821.68 seconds


In [14]:
df.to_csv('filtered_50krows_NER_Entities.csv', index=False)

### Run NER on Content

#### From NER results on Content column, there are several errors that need to be rectified:

- Remove enronenron name, OR trim names that end with enronenron OR corp enronenron
- Remove ect name, or trim names that end with ect
- Trim names that end with "hou"
- There are multiple instances of the same name in different formats (e.g. phillip k allen = phillip allen = pallen, kristin walsh = kristin)