# Named Entity Recognition (NER) with spaCy

### Install and set up spaCy

In [1]:
!pip install tika spacy pandas numpy requests
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
import pandas as pd
import spacy
import requests
from tika import parser

In [3]:
# Read the merged, tab-separated dataset into a DataFrame
df = pd.read_csv(
    "../data/haunted_religious_historic_crime_merged.tsv",
    sep="\t",
)

# Preview the top rows to confirm the file was parsed
df.head()

Unnamed: 0.1,Unnamed: 0,city,country,description,location,state,state_abbrev,longitude,latitude,city_longitude,...,Adherents,Adherents as % of Population,Haunted Houses Count per County,crime_rate_per_100000,MURDER,ROBBERY,BURGLRY,nearest_historical_place,num_historical_places_5mi,year_of_nearest_historical_place
0,0,Ada,United States,Ada witch - Sometimes you can see a misty blue...,Ada Cemetery,Michigan,MI,-85.504893,42.962106,-85.49548,...,282420.0,0.429227,46.0,395.689239,22.0,639.0,3878.0,Ada Covered Bridge,2,1970.0
1,1,Addison,United States,A little girl was killed suddenly while waitin...,North Adams Rd.,Michigan,MI,-84.381843,41.971425,-84.347168,...,11128.0,0.243256,3.0,190.88523,0.0,2.0,180.0,,0,
2,2,Adrian,United States,If you take Gorman Rd. west towards Sand Creek...,Ghost Trestle,Michigan,MI,-84.035656,41.904538,-84.037166,...,33427.0,0.33621,3.0,205.670041,4.0,22.0,335.0,Lenawee County Courthouse,12,1991.0
3,3,Adrian,United States,"In the 1970's, one room, room 211, in the old ...",Siena Heights University,Michigan,MI,-84.017565,41.905712,-84.037166,...,33427.0,0.33621,3.0,205.670041,4.0,22.0,335.0,Dennis and State Streets Historic District (Bo...,12,1979.0
4,4,Albion,United States,Kappa Delta Sorority - The Kappa Delta Sororit...,Albion College,Michigan,MI,-84.745177,42.244006,-84.75303,...,43340.0,0.322686,19.0,602.168696,11.0,112.0,1405.0,Superior Street Commercial Historic District,2,1997.0


### Load the pre-trained spaCy model for English NLP tasks

In [8]:
# Pre-trained spaCy English pipeline; used below by extract_named_entities
nlp = spacy.load(
    "en_core_web_sm"
)

In [10]:
def extract_named_entities(text):
    """Return the named entities that the loaded spaCy pipeline finds in ``text``.

    Parameters
    ----------
    text : str or missing value
        Text to analyse. Values that ``pd.isna`` reports as missing
        (e.g. ``None`` or ``NaN``) are not passed to the pipeline.

    Returns
    -------
    list of tuple
        One ``(entity_text, entity_label)`` pair per entity in ``doc.ents``,
        or an empty list when ``text`` is missing.
    """
    entities = []
    if not pd.isna(text):
        # Only run the (module-level) ``nlp`` pipeline on real text
        for span in nlp(text).ents:
            entities.append((span.text, span.label_))
    return entities

# Run entity extraction over every description and store the results in a new column
df['Named_Entities'] = df['description'].apply(extract_named_entities)

In [12]:
# Inspect the column of extracted (text, label) entity pairs
df.loc[:, 'Named_Entities']

0        [(Ada witch -, PERSON), (3-mile, QUANTITY), (t...
1                  [(month later, DATE), (this day, DATE)]
2        [(Gorman Rd, PERSON), (Sand Creek, FAC), (A mi...
3        [(1970, DATE), (one, CARDINAL), (211, CARDINAL...
4        [(Kappa Delta Sorority - The Kappa Delta Soror...
                               ...                        
10969    [(12 midnight, TIME), (two, CARDINAL), (Sherid...
10970                                  [(years ago, DATE)]
10971    [(18 years old, DATE), (70, DATE), (one, CARDI...
10972                                                   []
10973     [(the Air Force Academy, ORG), (Survival, NORP)]
Name: Named_Entities, Length: 10974, dtype: object

In [16]:
# Show each description next to its extracted entities. Ending the cell with
# the expression (rather than print) lets Jupyter render the frame with its
# rich table display instead of wrapped plain text.
df[["description", "Named_Entities"]].head()

                                         description  \
0  Ada witch - Sometimes you can see a misty blue...   
1  A little girl was killed suddenly while waitin...   
2  If you take Gorman Rd. west towards Sand Creek...   
3  In the 1970's, one room, room 211, in the old ...   
4  Kappa Delta Sorority - The Kappa Delta Sororit...   

                                      Named_Entities  
0  [(Ada witch -, PERSON), (3-mile, QUANTITY), (t...  
1            [(month later, DATE), (this day, DATE)]  
2  [(Gorman Rd, PERSON), (Sand Creek, FAC), (A mi...  
3  [(1970, DATE), (one, CARDINAL), (211, CARDINAL...  
4  [(Kappa Delta Sorority - The Kappa Delta Soror...  


### Save the results to a new TSV file

In [20]:
# Write the augmented frame as a tab-separated file without the index.
# The directory is spelled "../data/" to match the path the dataset was loaded
# from; "../Data/" is a different directory on case-sensitive filesystems.
# Note: the entity lists are written as their string representation, so they
# need to be parsed (e.g. with ast.literal_eval) when the file is read back.
df.to_csv("../data/merged_data_v2_with_entities.tsv", sep="\t", index=False)