# In [None]:
import re
import pandas as pd
import spacy

# Extract named entities, e-mail addresses and phone numbers from 'input.txt'
# and write them side by side into 'structured_output.csv'.


def _pad_column(values, length):
    """Return *values* as a list extended with '' until it has *length* items."""
    return list(values) + [''] * (length - len(values))


# Load spaCy's small English pipeline; only its named-entity output is used below.
nlp = spacy.load("en_core_web_sm")

# Read unstructured text from a file
with open('input.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Process the text with spaCy
doc = nlp(text)

# Extract named entities as (text, label) pairs
entities = [(ent.text, ent.label_) for ent in doc.ents]

# Extract specific patterns using regular expressions.
# The domain part accepts hyphens and any number of dot-separated labels, so
# addresses such as "a@mail-host.example.co.uk" are captured in full instead
# of being cut after the first label or skipped.
emails = re.findall(r'\b[\w.-]+@[\w-]+(?:\.[\w-]+)+\b', text)
# Matches only bare 10-digit runs (no separators, no country code).
phone_numbers = re.findall(r'\b\d{10}\b', text)  # Adjust pattern as needed

# Every column must have the same length for pd.DataFrame, so pad all of them
# to the longest list. Padding only up to len(entities) fails with a
# ValueError as soon as the text holds more e-mails or phone numbers than
# entities.
row_count = max(len(entities), len(emails), len(phone_numbers))

# Create a DataFrame to store the structured data.
# NOTE: this is a simplistic layout; values in the same row are NOT related to
# each other -- each column is just an independent list aligned by position.
data = {
    'Entity': _pad_column([ent_text for ent_text, _ in entities], row_count),
    'Label': _pad_column([ent_label for _, ent_label in entities], row_count),
    'Email': _pad_column(emails, row_count),
    'Phone': _pad_column(phone_numbers, row_count),
}

df = pd.DataFrame(data)

# Save the structured data to a CSV file
df.to_csv('structured_output.csv', index=False)

print("Structured data has been extracted and saved to 'structured_output.csv'.")