### Handling Unstructured Data with Python
**Description**: Extract structured data from unstructured text using Python.

**Steps**:
1. Load and analyze an unstructured text document.
2. Extract each field with regular expressions, validate it, and collect the records in a pandas DataFrame.

In [7]:
import re
import pandas as pd

# Sample unstructured text: two order records separated by a blank line,
# each holding one "Label: value" field per line.
unstructured_text = """
Order ID: 23984
Customer: John Doe
Email: john.doe@example.com
Date: April 4, 2024
Product: Wireless Mouse - Model WXM123
Feedback: The mouse works great, but sometimes the scroll wheel gets stuck.
Contact: +1-234-567-8900

Order ID: 23985
Customer: Alice Johnson
Email: alice.j@example.com
Date: April 5, 2024
Product: Mechanical Keyboard - Model KBD456
Feedback: Keys are very responsive. Love the RGB lights!
Contact: +1-678-910-1122
"""

# Function to validate extracted fields
def validate_field(field_name, value):
    """Return True when *value* is an acceptable value for *field_name*.

    "Email" must look like ``local@domain.tld`` with no whitespace and a
    single ``@``. "Contact" must be exactly ``+1-XXX-XXX-XXXX``. Every
    other field only has to be a non-blank string.

    Args:
        field_name: Label of the field being checked, e.g. "Email".
        value: Extracted text, or None when the field was not found.

    Returns:
        bool: True if the value passes the check for its field.
    """
    # Missing or non-text values are never valid. Checking this first keeps
    # the regex calls below from raising TypeError on None.
    if not isinstance(value, str):
        return False
    # fullmatch (rather than match) rejects values that merely *start* with
    # something valid, such as a phone number followed by extra digits.
    if field_name == "Email":
        return re.fullmatch(r"[^@\s]+@[^@\s]+\.[^@\s]+", value) is not None
    if field_name == "Contact":
        return re.fullmatch(r"\+1-\d{3}-\d{3}-\d{4}", value) is not None
    return value.strip() != ""

# Main function to extract structured data
def extract_structured_data(text_block):
    """Parse blank-line-separated order records into a DataFrame.

    Each record is expected to hold one ``Label: value`` field per line.
    A field that is absent, empty, or rejected by ``validate_field`` is
    stored as ``None``.

    Args:
        text_block: Text containing zero or more order records separated
            by blank lines.

    Returns:
        A pandas DataFrame with one row per record and one column per
        field, in the order the fields are declared below. Text without
        any record yields an empty DataFrame that still has every column.
    """
    # Labels are anchored to the start of a line (re.MULTILINE below) so
    # label-like text inside another field, e.g. "Contact:" within the
    # feedback sentence, is not mistaken for the field itself. Only spaces
    # and tabs are skipped after the colon: "\s*" would cross the newline of
    # an empty field and capture the following line as its value.
    pattern_dict = {
        "Order ID": r"^[ \t]*Order ID:[ \t]*(\d+)",
        "Customer": r"^[ \t]*Customer:[ \t]*(.+)",
        "Email": r"^[ \t]*Email:[ \t]*(.+)",
        "Date": r"^[ \t]*Date:[ \t]*(.+)",
        "Product": r"^[ \t]*Product:[ \t]*(.+)",
        "Feedback": r"^[ \t]*Feedback:[ \t]*(.+)",
        "Contact": r"^[ \t]*Contact:[ \t]*(.+)",
    }

    # Split on any run of blank lines (whitespace-only lines and CRLF line
    # endings included) and drop empty chunks so empty input gives no rows.
    orders = [
        chunk
        for chunk in re.split(r"\n\s*\n", text_block.strip())
        if chunk.strip()
    ]
    extracted_data = []

    for order in orders:
        order_data = {}
        for field, pattern in pattern_dict.items():
            match = re.search(pattern, order, flags=re.MULTILINE)
            value = match.group(1).strip() if match else None
            # Validate only real matches; a missing field simply stays None,
            # so no exception handling is needed around the validator.
            if value is not None and not validate_field(field, value):
                value = None
            order_data[field] = value
        extracted_data.append(order_data)

    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(extracted_data, columns=list(pattern_dict))

# Run the function and display the result
# The parsed sample is kept in the module-level `df`, which run_tests() reads.
df = extract_structured_data(unstructured_text)
# NOTE(review): a bare expression is only rendered by a notebook when it is
# the last statement of its cell — confirm this line ends its cell, otherwise
# nothing is displayed here.
df



# Mini test suite to validate behavior
def run_tests(frame=None):
    """Print a Pass/Fail line for four sanity checks on the extracted orders.

    Args:
        frame: DataFrame with "Order ID", "Email" and "Contact" columns.
            Defaults to the module-level ``df`` when omitted.

    Returns:
        None. The results are only printed.
    """
    if frame is None:
        frame = df

    test1 = frame.shape[0] == 2
    test2 = frame["Email"].notnull().all()
    # na=False makes a missing contact count as a failure. Without it,
    # str.match yields NaN for missing values and .all() skips NaN, so the
    # check would pass even when contacts are absent.
    test3 = frame["Contact"].str.match(r"\+1-\d{3}-\d{3}-\d{4}", na=False).all()
    test4 = frame["Order ID"].notnull().all()

    print("Test 1 - Correct number of records:", "Pass" if test1 else "Fail")
    print("Test 2 - All emails valid:", "Pass" if test2 else "Fail")
    print("Test 3 - All contacts valid:", "Pass" if test3 else "Fail")
    print("Test 4 - All orders have IDs:", "Pass" if test4 else "Fail")

# Run the checks; called without arguments they use the module-level `df`.
run_tests()


Test 1 - Correct number of records: Pass
Test 2 - All emails valid: Pass
Test 3 - All contacts valid: Pass
Test 4 - All orders have IDs: Pass
