Profiling using regex: collect personal information (name, age, status)

In [1]:
# Import regex library
import re

In [2]:
# Sample profile text searched by the regex cells below. Label rows such as
# "Born" and "Education" put a tab between the label and its value, and the
# literal starts and ends with a newline.
text = '''
Born	Elon Reeve Musk
June 28, 1971 (age 52)
Pretoria, Transvaal, South Africa
Citizenship	
South Africa
Canada
United States
Education	University of Pennsylvania (BA, BS)
Title	
Founder, CEO, and chief engineer of SpaceX
CEO and product architect of Tesla, Inc.
Owner and CTO of X, formerly Twitter
President of the Musk Foundation
Founder of The Boring Company, X Corp., and xAI
Co-founder of Neuralink, OpenAI, Zip2, and X.com (part of PayPal)
'''

In [7]:
# Extract the age: the digits that follow "age " (as in "(age 52)").
# Raw string so that \d reaches the regex engine untouched; in a plain string
# '\d' is an invalid escape sequence and recent Python versions warn about it.
# \d+ (not \d*) requires at least one digit, so "age " followed by a non-digit
# can no longer produce an empty-string match.
pattern = r'age (\d+)'

matches = re.findall(pattern, text)
matches


['52']

In [6]:
# Extract the name: everything that follows the "Born" label on its line.
pattern = 'Born(.*)'

matches = re.compile(pattern).findall(text)
# The capture starts right after "Born", so it still carries the tab that
# separates label and value; trim the surrounding whitespace for display.
matches[0].strip()


'Elon Reeve Musk'

In [8]:
# Extract the birth date: the text on the line after "Born ...", up to "(age".
# Raw string: in a plain string '\(' is an invalid escape sequence (recent
# Python versions warn about it); the regex engine itself understands both
# \n (newline) and \( (literal parenthesis).
# The capture keeps the space that precedes "(age"; call .strip() on the
# result if that trailing space is not wanted.
pattern = r'Born.*\n(.*)\(age'

matches = re.findall(pattern, text)
matches

['June 28, 1971 ']

In [None]:
import pandas as pd
import wikipediaapi
from datetime import datetime

# Function to fetch information from Wikipedia
def get_wikipedia_info(name):
    """Look up ``name`` on English Wikipedia and return basic birth details.

    Args:
        name: Title of the Wikipedia page to look up.

    Returns:
        ``None`` when the page does not exist. Otherwise a dict with:

        * ``'name'`` - the ``name`` argument, unchanged.
        * ``'date_of_birth'`` - the text that follows the first occurrence of
          ``'birth_date'`` in the page text (at most 50 characters, cut at the
          first newline and stripped), or ``None`` if the marker is absent.
        * ``'age'`` - the result of ``calculate_age`` for that text, or
          ``None`` when the text is missing or cannot be parsed as a date.
    """
    # NOTE(review): recent wikipedia-api releases appear to expect a
    # user_agent argument in this constructor - confirm against the installed
    # version before relying on this call.
    wiki_wiki = wikipediaapi.Wikipedia('en')
    page_py = wiki_wiki.page(name)

    if not page_py.exists():
        return None

    # Read the page text once and reuse it for both the search and the slice.
    page_text = page_py.text

    # Extracting date of birth
    dob = None
    marker = 'birth_date'
    marker_pos = page_text.find(marker)
    if marker_pos != -1:
        dob_index = marker_pos + len(marker)
        dob = page_text[dob_index:dob_index + 50].split('\n')[0].strip()

    # The snippet is free text taken from the page, while calculate_age parses
    # a strict "%Y-%m-%d" date and raises ValueError on anything else. Treat an
    # unparseable snippet as "age unknown" instead of aborting the lookup.
    try:
        age = calculate_age(dob)
    except ValueError:
        age = None

    return {
        'name': name,
        'date_of_birth': dob,
        'age': age
    }

# Function to calculate age from date of birth
def calculate_age(dob, today=None):
    """Return the age in whole years for a ``YYYY-MM-DD`` date of birth.

    Args:
        dob: Date of birth as a string in ``%Y-%m-%d`` format. ``None`` or an
            empty string means the date is unknown.
        today: Optional ``datetime`` to measure the age against. Defaults to
            ``datetime.today()``; pass a fixed value for reproducible results.

    Returns:
        The number of completed years between ``dob`` and ``today``, or
        ``None`` when ``dob`` is falsy.

    Raises:
        ValueError: If ``dob`` is not in ``%Y-%m-%d`` format.
    """
    if not dob:
        return None

    birth_date = datetime.strptime(dob, "%Y-%m-%d")
    if today is None:
        today = datetime.today()

    # Comparing (month, day) tuples gives True (counted as 1) when this year's
    # birthday has not been reached yet, which takes one year off the difference.
    birthday_pending = (today.month, today.day) < (birth_date.month, birth_date.day)
    return today.year - birth_date.year - birthday_pending

# Sample DataFrame
data = {'Name': ['Barack Obama', 'Albert Einstein', 'Ada Lovelace']}
df4 = pd.DataFrame(data)

# Start with empty result columns; a row keeps None when its lookup returns nothing
for column in ('Date_of_Birth', 'Age'):
    df4[column] = None

# Look up every name on Wikipedia and copy the returned fields into that row
for row_label, person in df4['Name'].items():
    details = get_wikipedia_info(person)
    if not details:
        continue
    df4.at[row_label, 'Date_of_Birth'] = details['date_of_birth']
    df4.at[row_label, 'Age'] = details['age']

# Display the final DataFrame
print(df4)
