# Information Extraction

In [1]:
# Load Libraries
import pandas as pd

pd.set_option('display.max_rows', None)   # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)  # Adjust the width to display full table
pd.set_option('display.max_colwidth', None)  # Adjust column width to avoid truncation


In [2]:
df = pd.read_csv("finalv1_data.csv")


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384846 entries, 0 to 384845
Data columns (total 13 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   From                       384846 non-null  object
 1   To                         384846 non-null  object
 2   Subject                    384846 non-null  object
 3   X-cc                       384846 non-null  object
 4   X-bcc                      384846 non-null  object
 5   Content                    384846 non-null  object
 6   Job_Title                  384846 non-null  object
 7   Total_Sentence_Word_Count  384846 non-null  int64 
 8   From_Names                 384846 non-null  object
 9   To_Names                   380598 non-null  object
 10  Cleaned_Content            384846 non-null  object
 11  BoW                        384846 non-null  object
 12  DateTime                   384846 non-null  object
dtypes: int64(1), object(12)
memory usage: 38.2+ 

## NER

### NER Function

In [4]:
from tqdm.notebook import tqdm
import spacy
import time

# Enable tqdm for pandas
tqdm.pandas()

# Time the loading of the spaCy model
print("Loading spaCy model...")
start_time = time.time()
nlp = spacy.load("en_core_web_lg", disable=["tagger", "parser", "lemmatizer"])
model_load_time = time.time() - start_time
print(f"Model loaded in {model_load_time:.2f} seconds")

# Function to extract named entities for a single text
def extract_named_entities_spacy(text):
    """
    Extracts named entities from a single text using spaCy's pre-trained model.
    
    Args:
        text (str): The text to process.
    
    Returns:
        list: A list of (entity, label) tuples for the text.
    """
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents if ent.text.strip()]
    return entities


# Apply NER extraction with timing and progress_apply
start_time = time.time()
print("Applying NER to DataFrame...")
df['NER_Entities'] = df['Cleaned_Content'].progress_apply(extract_named_entities_spacy)
total_time = time.time() - start_time
print(f"Total application time: {total_time:.2f} seconds")



Loading spaCy model...
Model loaded in 2.94 seconds
Applying NER to DataFrame...


  0%|          | 0/384846 [00:00<?, ?it/s]

Total application time: 9747.19 seconds


In [26]:
# df.to_csv('FULL_NER.csv', index=False)

PERSON:      People, including fictional.
NORP:        Nationalities or religious or political groups.
FAC:         Buildings, airports, highways, bridges, etc.
ORG:         Companies, agencies, institutions, etc.
GPE:         Countries, cities, states.
LOC:         Non-GPE locations, mountain ranges, bodies of water.
PRODUCT:     Objects, vehicles, foods, etc. (Not services.)
EVENT:       Named hurricanes, battles, wars, sports events, etc.
WORK_OF_ART: Titles of books, songs, etc.
LAW:         Named documents made into laws.
LANGUAGE:    Any named language.
DATE:        Absolute or relative dates or periods.
TIME:        Times smaller than a day.
PERCENT:     Percentage, including ”%“.
MONEY:       Monetary values, including unit.
QUANTITY:    Measurements, as of weight or distance.
ORDINAL:     “first”, “second”, etc.
CARDINAL:    Numerals that do not fall under another type.

In [6]:
import pandas as pd

df = pd.read_csv('FULL_NER.csv')

In [8]:
df.NER_Entities.head()

0                               [('austin', 'PERSON')]
1      [('randy send', 'PERSON'), ('patti', 'PERSON')]
2    [('phillip allen pallenenroncom', 'PERSON'), (...
3    [('1', 'CARDINAL'), ('ke9davis', 'NORP'), ('is...
4    [('phillip k', 'PERSON'), ('10162000', 'CARDIN...
Name: NER_Entities, dtype: object

In [79]:
import pandas as pd
import ast

# Function to process NER_Entities into a dictionary of entity types and values
def process_ner_entities(row):
    # If row is a string, parse it into a Python object
    if isinstance(row, str):
        try:
            row = ast.literal_eval(row)
        except (ValueError, SyntaxError):
            return {}  # Return empty dict for unparsable rows
    
    # Ensure row is iterable
    if not isinstance(row, (list, tuple)):
        return {}  # Return empty dict for invalid rows
    
    # Process tuples into a dictionary
    result = {}
    for item in row:
        if isinstance(item, tuple) and len(item) == 2:  # Only process tuples with exactly 2 values
            entity, entity_type = item
            if entity_type not in result:
                result[entity_type] = []
            result[entity_type].append(entity)
        # Skip tuples with 1 value or other lengths
    
    # Remove duplicates within each type
    for entity_type in result:
        result[entity_type] = list(set(result[entity_type]))
    
    return result

# Add a new column with the processed entity dictionaries
df['Named_Entities'] = df['NER_Entities'].apply(process_ner_entities)

# Display the updated DataFrame
df[['Named_Entities']].head()

Unnamed: 0,Named_Entities
0,{'PERSON': ['austin']}
1,"{'PERSON': ['randy send', 'patti']}"
2,"{'PERSON': ['mike grigsby', 'phillip allen', 'phillip allen pallenenroncom', 'keith holst kholstenroncom', 'john lavorato', 'monique sanchez frank']}"
3,"{'CARDINAL': ['255255255248', '6421690110', '6421690105', '105891', '1'], 'NORP': ['ke9davis'], 'ORG': ['isp 2'], 'DATE': ['15116418 3 0413']}"
4,"{'PERSON': ['buckner buck', 'phillip k'], 'CARDINAL': ['75', '240', '10162000', '180', 'kwh', '60100', '180 240', '302'], 'MONEY': ['10122000 011221'], 'ORG': ['pan american frwy', 'pallenenroncom pallenenroncom', 'honeywell power systems inc 8725', 'buckner pe mba', 'honeywell', 'ldc san diego gas electric'], 'GPE': ['san diego'], 'TIME': ['68 hours'], 'DATE': ['87113 5057986424', '5052204129 8885013145', 'september']}"


In [66]:
# Count entity types across all rows
entity_type_counts = Counter()
for named_entities in test['Named_Entities']:
    for entity_type, entities in named_entities.items():
        entity_type_counts[entity_type] += len(entities)  # Count the number of entities for each type

# Get the top 5 most frequent entity types
top_5_entity_types = entity_type_counts.most_common(5)

# Display the results
print("Top 5 entity types overall:")
for entity_type, count in top_5_entity_types:
    print(f"{entity_type}: {count} occurrences")

Top 5 entity types overall:
PERSON: 2486963 occurrences
CARDINAL: 1189511 occurrences
DATE: 1146268 occurrences
ORG: 1012613 occurrences
GPE: 307905 occurrences


In [81]:
import pandas as pd
from collections import Counter

# Step 1: Count entity types across all rows (overall top 5 entity types)
entity_type_counts = Counter()
for named_entities in test['Named_Entities']:
    for entity_type, entities in named_entities.items():
        entity_type_counts[entity_type] += len(entities)  # Count total unique entities per type

# Get the top 5 most frequent entity types overall
top_5_entity_types = entity_type_counts.most_common(5)
print("Top 5 entity types overall:")
for entity_type, count in top_5_entity_types:
    print(f"{entity_type}: {count} occurrences")

# Step 2: Find top rows with the most entities for each of the top 5 entity types
# Add a column to store the count of each entity type per row for sorting
for entity_type, _ in top_5_entity_types:
    test[f'{entity_type}_count'] = test['Named_Entities'].apply(
        lambda x: len(x.get(entity_type, []))
    )
    
# Display top 5 most frequent values for each top entity type
for entity_type, _ in top_5_entity_types:
    print(f"\nTop 5 most frequent {entity_type} entity values:")
    top_values = entity_value_counts[entity_type].most_common(5)
    for value, count in top_values:
        print(f"{value}: {count} occurrences")

Top 5 entity types overall:
PERSON: 2486963 occurrences
CARDINAL: 1189511 occurrences
DATE: 1146268 occurrences
ORG: 1012613 occurrences
GPE: 307905 occurrences

Top 5 most frequent PERSON entity values:
jeff: 13623 occurrences
chris: 13320 occurrences
vince: 8044 occurrences
john: 7360 occurrences
mike: 7350 occurrences

Top 5 most frequent CARDINAL entity values:
one: 43648 occurrences
2: 38047 occurrences
1: 33034 occurrences
two: 30256 occurrences
3: 23295 occurrences

Top 5 most frequent DATE entity values:
today: 40303 occurrences
monday: 22361 occurrences
thursday: 21594 occurrences
tomorrow: 19428 occurrences
friday: 17904 occurrences

Top 5 most frequent ORG entity values:
ferc: 11773 occurrences
ena: 9775 occurrences
eol: 6780 occurrences
pge: 5518 occurrences
isda: 4477 occurrences

Top 5 most frequent GPE entity values:
houston: 32951 occurrences
california: 18752 occurrences
texas: 18535 occurrences
london: 8333 occurrences
us: 7570 occurrences


In [70]:
# # Step 4: Combine the top 5 DataFrames into one
# combined_df = pd.concat([top_dfs[entity_type] for entity_type in top_dfs], ignore_index=True)

# # Step 5: Add all count columns to the combined DataFrame
# for entity_type, _ in top_5_entity_types:
#     combined_df[f'{entity_type}_count'] = combined_df['index'].apply(
#         lambda idx: test.loc[idx, f'{entity_type}_count'] if idx in test.index else 0
#     )

In [5]:
# combined_df.head(1)

In [75]:
# combined_df.to_csv('TOP_NER.csv', index=False)

PERSON:      People, including fictional.
NORP:        Nationalities or religious or political groups.
FAC:         Buildings, airports, highways, bridges, etc.
ORG:         Companies, agencies, institutions, etc.
GPE:         Countries, cities, states.
LOC:         Non-GPE locations, mountain ranges, bodies of water.
PRODUCT:     Objects, vehicles, foods, etc. (Not services.)
EVENT:       Named hurricanes, battles, wars, sports events, etc.
WORK_OF_ART: Titles of books, songs, etc.
LAW:         Named documents made into laws.
LANGUAGE:    Any named language.
DATE:        Absolute or relative dates or periods.
TIME:        Times smaller than a day.
PERCENT:     Percentage, including ”%“.
MONEY:       Monetary values, including unit.
QUANTITY:    Measurements, as of weight or distance.
ORDINAL:     “first”, “second”, etc.
CARDINAL:    Numerals that do not fall under another type.