In [1]:
pip install spacy


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: C:\Users\HP\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
import spacy
import re

# Load the spaCy English language model
nlp = spacy.load("en_core_web_sm")

# Define regular expressions
url_pattern = re.compile(r'https?://\S+|www\.\S+')

# Valid IP address pattern (0-255 range for each segment)
ip_address_pattern = re.compile(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b')

# Date pattern in YYYY-MM-DD format
date_pattern = re.compile(r'\b\d{4}-\d{2}-\d{2}\b')

# PAN Number pattern (5 uppercase letters, 4 digits, 1 uppercase letter)
pan_number_pattern = re.compile(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b')

def validate_ip(ip):
    """Validate if an IP address is within the 0-255 range for each segment."""
    segments = ip.split('.')
    return all(0 <= int(segment) <= 255 for segment in segments) and len(segments) == 4

def extract_entities(text):
    # Tokenize the text using spaCy
    doc = nlp(text)

    # Find entities using regular expressions
    urls = re.findall(url_pattern, text)
    ip_addresses = [ip for ip in re.findall(ip_address_pattern, text) if validate_ip(ip)]
    dates = re.findall(date_pattern, text)
    pan_numbers = re.findall(pan_number_pattern, text)

    # Extract spaCy entities
    entities = [(ent.text, ent.label_) for ent in doc.ents]

    return {
        'urls': urls,
        'ip_addresses': ip_addresses,
        'dates': dates,
        'pan_numbers': pan_numbers,
        'spaCy_entities': entities
    }


# Example usage
text_data = """
Here is a sample text with a URL: https://www.Sample.com. 
Also, an IP address: 192.168.100.102. 
The date is 2023-01-01.
A PAN number is BBRPL4574H.
"""

results = extract_entities(text_data)

print("URLs:", results['urls'])
print("IP Addresses:", results['ip_addresses'])
print("Dates:", results['dates'])
print("PAN Numbers:", results['pan_numbers'])
print("Entities:", results['spaCy_entities'])


URLs: ['https://www.Sample.com.']
IP Addresses: ['192.168.100.102']
Dates: ['2023-01-01']
PAN Numbers: ['BBRPL4574H']
Entities: [('IP', 'ORG'), ('2023-01-01', 'DATE'), ('PAN', 'ORG')]
