In [1]:
pip install pandas PyPDF2 python-docx pdfplumber


Defaulting to user installation because normal site-packages is not writeable
Collecting PyPDF2
  Using cached pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting python-docx
  Using cached python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting pdfplumber
  Using cached pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Using cached pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Using cached pypdfium2-4.30.0-py3-none-win_amd64.whl.metadata (48 kB)
Using cached pypdf2-3.0.1-py3-none-any.whl (232 kB)
Using cached python_docx-1.1.2-py3-none-any.whl (244 kB)
Using cached pdfplumber-0.11.4-py3-none-any.whl (59 kB)
Using cached pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
Using cached pypdfium2-4.30.0-py3-none-win_amd64.whl (2.9 MB)
Installing collected packages: python-docx, pypdfium2, PyPDF2, pdfminer.six, pdfplumber
Successfully installed PyPDF2-3.0.1 pdfmin



In [1]:
import os
import re
import pandas as pd
import PyPDF2
import docx
import pdfplumber

In [2]:
# Expected Matches:

# Indian Numbers:

# (+91) 98765-54322 → Matches
# +91-7838166910 → Matches
# +91 9876543210 → Matches
# +919876543210 → Matches
# 9570298107 → Matches
# 8851649905 → Matches
# 09876 543210 → Does NOT match the current phone_pattern (no alternative allows a space inside the 10 digits)

# US Numbers:

# +1 (555) 123-4567 → Matches
# +1 555-123-4567 → Matches
# +1 555 123 4567 → Does NOT match the current phone_pattern (no alternative accepts the space between 123 and 4567)
# +1 5551234567 → Matches
# +1(555)1234567 → Matches
# +1(555) 1234567 → Matches
# +1 555-1234567 → Matches
# +1-555-123-4567 → Matches
# 555-123-4567 → Matches
# 5551234567 → Matches

In [5]:
# Regex pattern for India and USA phone numbers.
# The pattern is one alternation of 18 capture groups; exactly one group is
# non-empty per match. Alternatives are tried left to right at each position.
phone_pattern = (
    # --- India (+91) ---
    r'(\(\+91\)\s?\d{5}-\d{5})'          # 1:  (+91) 98765-54322
    r'|(\+91-\d{10})'                    # 2:  +91-7838166910
    r'|(\+91\s?\d{10})'                  # 3:  +91 9876543210 / +919876543210
    r'|(\+91\d{10})'                     # 4:  +919876543210 (already covered by 3)
    r'|(\d{10})'                         # 5:  bare 10 digits; no boundary check, so it also
                                         #     matches the first 10 digits of a longer run
    # --- USA (+1) ---
    r'|(\+1\s\(\d{3}\)\s\d{3}-\d{4})'    # 6:  +1 (555) 123-4567
    r'|(\+1\s\d{3}-\d{3}-\d{4})'         # 7:  +1 555-123-4567
    r'|(\+1\s\d{3}\s\d{3}-\d{4})'        # 8:  +1 555 123-4567
    r'|(\+1\s\d{3}\s\d{4})'              # 9:  +1 555 1234
    r'|(\+1-\d{3}-\d{3}-\d{4})'          # 10: +1-555-123-4567
    r'|(\d{3}-\d{3}-\d{4})'              # 11: 555-123-4567
    r'|(\d{3}\s\d{3}-\d{4})'             # 12: 555 123-4567
    r'|(\+1\s\d{3}\d{7})'                # 13: +1 5551234567
    r'|(\+1\(\d{3}\)\d{7})'              # 14: +1(555)1234567
    r'|(\+1\(\d{3}\)\s\d{7})'            # 15: +1(555) 1234567
    r'|(\+1\(\d{3}\)\s\d{3}-\d{4})'      # 16: +1(555) 123-4567
    r'|(\+1\(\d{3}\)\s\d{3}\d{4})'       # 17: +1(555) 1234567 (same text as 15)
    r'|(\+1\s\d{3}-\d{3}\d{4})'          # 18: +1 555-1234567
)
# The TLD is "two or more letters": a 4-letter cap would cut addresses such as
# name@example.technology down to name@example.tech.
email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'

In [9]:

# Function to extract text from PDF files
def extract_text_from_pdf(file_path):
    """Return the text of all pages of a PDF, one page per newline-separated chunk.

    Args:
        file_path: Path of the PDF file to open with pdfplumber.

    Returns:
        The page texts joined with '\\n'. If opening or reading fails, the error
        is printed and the text collected before the failure is returned
        (an empty string when nothing was read).
    """
    page_texts = []
    try:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                # `or ''` guards against a None/empty result for a page without
                # extractable text, so one such page does not abort the loop.
                page_texts.append(page.extract_text() or '')
    except Exception as e:
        print(f"Error extracting text from {file_path}: {e}")
    # Join with a newline so the last token of one page cannot fuse with the
    # first token of the next (which corrupts email/phone matches).
    return '\n'.join(page_texts)

# Function to extract text from Word files
def extract_text_from_word(file_path):
    """Return the paragraph text of a .docx file, one paragraph per line.

    Mirrors the PDF extractor's error handling: a file that cannot be opened or
    read is reported with a printed message instead of aborting the caller's
    loop over the folder.

    Args:
        file_path: Path of the Word document to open with python-docx.

    Returns:
        The paragraph texts joined with '\\n'; the text collected before a
        failure (an empty string when nothing was read) if an error occurs.
    """
    paragraph_texts = []
    try:
        doc = docx.Document(file_path)
        for paragraph in doc.paragraphs:
            paragraph_texts.append(paragraph.text)
    except Exception as e:
        print(f"Error extracting text from {file_path}: {e}")
    return '\n'.join(paragraph_texts)

# Function to extract name from file content
def extract_name_from_text(text):
    """Guess a name as the first two whitespace-separated words of the text.

    This is a heuristic (may need tuning): it assumes the document starts with
    the person's name.

    Args:
        text: Extracted document text; None or empty yields ''.

    Returns:
        The first two words joined by a single space, fewer if the text has
        fewer than two words.
    """
    # str.split() with no argument treats any run of whitespace (including
    # consecutive newlines) as one separator, so no empty tokens are produced.
    words = (text or '').split()
    return ' '.join(words[:2])

def extract_name_from_filename(file_name):
    """Derive a candidate name from a file name.

    The extension is dropped, underscores/dashes and other non-word characters
    become spaces, the words "resume", "cv" and "curriculum vitae" are removed
    (case-insensitively), and whitespace is collapsed and trimmed.
    """
    stem, _extension = os.path.splitext(file_name)

    # (pattern, replacement, flags), applied in this exact order.
    cleanup_steps = (
        (r'[_-]', ' ', 0),                                           # underscores and dashes -> space
        (r'\W+', ' ', 0),                                            # other special characters -> space
        (r'\b(resume|cv|curriculum vitae)\b', '', re.IGNORECASE),    # drop keywords
        (r'\s+', ' ', 0),                                            # collapse extra spaces
    )

    candidate = stem
    for pattern, replacement, flags in cleanup_steps:
        candidate = re.sub(pattern, replacement, candidate, flags=flags)
    return candidate.strip()

# Function to extract emails and phone numbers from text
def extract_contact_info(text):
    """Find all email addresses and phone numbers in the text.

    Args:
        text: Document text to scan; None is treated as an empty string.

    Returns:
        A tuple (emails, phones) of lists of matched strings, in order of
        appearance. Uses the module-level email_pattern and phone_pattern.
    """
    text = text or ''
    emails = re.findall(email_pattern, text)

    # Take the whole match (group 0) rather than indexing individual capture
    # groups: phone_pattern is an alternation of many groups, and picking a
    # fixed subset of group indices returns '' for any alternative outside it.
    phones = [match.group(0) for match in re.finditer(phone_pattern, text)]

    return emails, phones

# Main function to extract data from files in a folder
def extract_info_from_folder(folder_path):
    """Extract names and contact details from every PDF/DOCX file in a folder.

    Files are processed in sorted name order so the resulting rows are
    reproducible. Word lock files (names starting with '~$') and files with any
    other extension are skipped. Extensions are matched case-insensitively, so
    'CV.PDF' is handled like 'cv.pdf'.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A pandas DataFrame with the columns 'Name from Text', 'Name from File',
        'Emails' and 'Phone Numbers' (the last two are comma-separated strings),
        one row per processed file. The columns are present even when no file
        was processed.
    """
    columns = ['Name from Text', 'Name from File', 'Emails', 'Phone Numbers']
    extracted_data = []

    # os.listdir returns entries in arbitrary order; sort for stable output.
    for file_name in sorted(os.listdir(folder_path)):
        # Skip temporary files that start with '~$'
        if file_name.startswith("~$"):
            continue

        file_path = os.path.join(folder_path, file_name)
        lowered_name = file_name.lower()

        # Read the document text
        if lowered_name.endswith(".pdf"):
            text = extract_text_from_pdf(file_path)
        elif lowered_name.endswith(".docx"):
            text = extract_text_from_word(file_path)
        else:
            continue

        emails, phones = extract_contact_info(text)
        name_from_text = extract_name_from_text(text)  # Extract name from text
        name_from_file = extract_name_from_filename(file_name)  # Extract name from filename

        extracted_data.append({
            'Name from Text': name_from_text,
            'Name from File': name_from_file,
            'Emails': ', '.join(emails),
            'Phone Numbers': ', '.join(phones)
        })

    # Passing columns keeps the schema stable for an empty result.
    return pd.DataFrame(extracted_data, columns=columns)


In [11]:
# Example usage
# The input folder can be overridden with the RESUME_FOLDER environment variable
# so the notebook runs on other machines without editing this cell; when the
# variable is not set, the original local path below is used.
folder_path = os.environ.get(
    'RESUME_FOLDER',
    'C:\\Users\\Reena Sharma\\llama_index\\data',  # Update with your folder path
)
df = extract_info_from_folder(folder_path)

In [13]:
# Write the extracted rows to a CSV file in the current working directory.
# index=False leaves the DataFrame's row index out of the file.
output_file = 'extracted_contact_info.csv'
df.to_csv(path_or_buf=output_file, index=False)
print(f"Extracted data saved to {output_file}")

Extracted data saved to extracted_contact_info.csv
