In [186]:
import pandas as pd
import re
from pathlib import Path
from docx import Document
import PyPDF2

# Module-level results table, created empty. extract_info declares it
# `global` and adds one row (Name, Number, Mail) per processed document.
_RESULT_COLUMNS = ["Name", "Number", "Mail"]
df = pd.DataFrame(columns=_RESULT_COLUMNS)

def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF file using PyPDF2.

    Args:
        pdf_file: Path of the PDF to read; passed straight to ``open``.

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages.
    """
    with open(pdf_file, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Pages are extracted while the file handle is still open; a single
        # join avoids rebuilding the accumulated string once per page.
        return "".join(page.extract_text() for page in reader.pages)

def _extract_text_from_docx(docx_file):
    """Return the text of a .docx file as one newline-joined string.

    Text is gathered in this order: table cells, section headers, section
    footers, inline shapes (only when they expose a text frame), and finally
    the body paragraphs.
    """
    doc = Document(docx_file)
    parts = []

    for table in doc.tables:
        for row in table.rows:
            parts.extend(cell.text for cell in row.cells)

    # All headers first, then all footers, so the line order (and therefore
    # the "first non-empty line" used as the name) stays predictable.
    for section in doc.sections:
        parts.extend(para.text for para in section.header.paragraphs)
    for section in doc.sections:
        parts.extend(para.text for para in section.footer.paragraphs)

    # NOTE(review): python-docx inline shapes may never expose `text_frame`;
    # the guard makes this loop a no-op in that case. Confirm it is needed.
    for shape in doc.inline_shapes:
        text_frame = getattr(shape, "text_frame", None)
        if text_frame:
            parts.extend(para.text for para in text_frame.paragraphs)

    parts.extend(para.text for para in doc.paragraphs)
    return "\n".join(parts)


def extract_info(folder_name):
    """Extract name, phone number and e-mail from each document in a folder.

    Every ``.docx`` and ``.pdf`` file directly inside ``folder_name`` is read
    (the suffix check is case-insensitive; files are visited in sorted path
    order). For each file one row is produced:

    * Name   - the first non-empty line of the extracted text, or
      ``"Name not found"`` when the text has no non-empty line.
    * Number - the first phone-number match with parentheses, whitespace,
      dots and hyphens removed, or ``"Number not found"``.
    * Mail   - the first e-mail match, or ``"Mail not found"``.

    Side effects:
        Appends the rows to the module-level ``df`` DataFrame, writes that
        DataFrame to ``output.csv`` in the current working directory, and
        prints it.

    Args:
        folder_name: Path of the folder to scan (not recursive).

    Returns:
        None.
    """
    global df

    # Compiled once: the patterns do not change between files.
    phone_re = re.compile(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}')
    email_re = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')
    # Every separator the phone pattern accepts, including '.', so that
    # "123.456.7890" is normalised the same way as "(123) 456-7890".
    separators_re = re.compile(r"[()\s.\-]")

    rows = []
    for file in sorted(Path(folder_name).iterdir()):
        suffix = file.suffix.lower()
        if not file.is_file() or suffix not in ('.docx', '.pdf'):
            continue

        if suffix == '.docx':
            text = _extract_text_from_docx(file)
        else:
            text = extract_text_from_pdf(file)

        # Assume the name is the first non-empty line of the document.
        stripped_lines = (line.strip() for line in text.splitlines())
        name = next((line for line in stripped_lines if line), "Name not found")

        # A document without a phone number or e-mail gets a placeholder
        # instead of aborting the whole run with an IndexError.
        phone_match = phone_re.search(text)
        if phone_match:
            number = separators_re.sub("", phone_match.group())
        else:
            number = "Number not found"

        email_match = email_re.search(text)
        mail = email_match.group() if email_match else "Mail not found"

        rows.append([name, number, mail])

    # Append all new rows with a single concat rather than one per file.
    if rows:
        new_rows = pd.DataFrame(rows, columns=["Name", "Number", "Mail"])
        df = pd.concat([df, new_rows], ignore_index=True)

    df.to_csv('output.csv')
    print(df)

# Run the extraction on the "cvs" folder; extract_info writes the collected
# rows to output.csv and prints the resulting DataFrame.
extract_info("cvs")

             Name      Number                  Mail
0  Rakshit Bhatia  9999999999     sample@sample.com
1       Your Name  1234567890  no_reply@example.com
2            Your  1234567890  no_reply@example.com
