In [2]:
import os
import PyPDF2
import pandas as pd

# Path to the directory containing PDFs
data_folder = r"D:\dataset\MSc DS"
data = []

def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
        return text

def get_document_type_from_filename(filename):
    # Extract the document type from the filename
    base_name = os.path.splitext(filename)[0]  # Remove the file extension
    parts = base_name.split('_')
    if len(parts) > 1:
        return parts[0]  # Return the document type
    return "Unknown"

for file_name in os.listdir(data_folder):
    if file_name.endswith('.pdf'):
        file_path = os.path.join(data_folder, file_name)
        extracted_text = extract_text_from_pdf(file_path)
        label = get_document_type_from_filename(file_name)
        data.append({"Document Filename": file_name, "Text": extracted_text, "Label": label})

df = pd.DataFrame(data)
df.to_csv("extracted_texts.csv", index=False)


In [68]:
import os
import pdfplumber
import pandas as pd

# Base folder path template with a placeholder for the last two digits
base_folder_template = 'D:/dataset/MSc DS/20320{}'

def extract_text_from_pdf(pdf_path):
    text = ''
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() or ''
    return text

def extract_label_from_filename(filename):
    base_name = os.path.splitext(filename)[0]
    parts = base_name.split('_')
    if len(parts) > 1:
        return parts[0]  # Return the document type
    return "Unknown"

def process_folder(folder_path):
    data = []
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.pdf'):
            file_path = os.path.join(folder_path, file_name)
            extracted_text = extract_text_from_pdf(file_path)
            label = extract_label_from_filename(file_name)
            data.append({"Document Filename": file_name, "Text": extracted_text, "Label": label})
    return data

all_data = []

# Iterate through folders from 02 to 51, excluding folder 29
for i in range(2, 52):
    if i != 29 & 39:
        # Format the folder path with leading zeros for two digits
        folder_path = base_folder_template.format(f'{i:02}')
        if os.path.exists(folder_path):
            print(f'Processing folder: {folder_path}')
            folder_data = process_folder(folder_path)
            all_data.extend(folder_data)

# Create DataFrame and save to CSV
df = pd.DataFrame(all_data)
df.to_csv("extracted_texts.csv", index=False)


Processing folder: D:/dataset/MSc DS/2032002
Processing folder: D:/dataset/MSc DS/2032003
Processing folder: D:/dataset/MSc DS/2032004
Processing folder: D:/dataset/MSc DS/2032006
Processing folder: D:/dataset/MSc DS/2032007
Processing folder: D:/dataset/MSc DS/2032008
Processing folder: D:/dataset/MSc DS/2032009
Processing folder: D:/dataset/MSc DS/2032010
Processing folder: D:/dataset/MSc DS/2032011
Processing folder: D:/dataset/MSc DS/2032012
Processing folder: D:/dataset/MSc DS/2032013
Processing folder: D:/dataset/MSc DS/2032014
Processing folder: D:/dataset/MSc DS/2032015
Processing folder: D:/dataset/MSc DS/2032016
Processing folder: D:/dataset/MSc DS/2032017
Processing folder: D:/dataset/MSc DS/2032018
Processing folder: D:/dataset/MSc DS/2032019
Processing folder: D:/dataset/MSc DS/2032020
Processing folder: D:/dataset/MSc DS/2032021
Processing folder: D:/dataset/MSc DS/2032022
Processing folder: D:/dataset/MSc DS/2032023
Processing folder: D:/dataset/MSc DS/2032024
Processing

In [72]:
import os
import PyPDF2
import pandas as pd
from pdf2image import convert_from_path
import pytesseract

# Path to Tesseract executable
pytesseract.pytesseract.tesseract_cmd = r"C:\Users\Gokul01\AppData\Local\Programs\Tesseract-OCR\tesseract.exe"  # Update this path if needed

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file, using OCR if necessary."""
    text = ""
    
    try:
        # Try extracting text directly
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                text += page.extract_text() or ''
        
        # If text extraction fails or is empty, use OCR
        if not text.strip():
            images = convert_from_path(pdf_path)
            for image in images:
                text += pytesseract.image_to_string(image)
    
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
    
    return text

def extract_label_from_filename(filename):
    """Extract the label from the filename based on predefined patterns."""
    base_name = os.path.splitext(filename)[0]
    parts = base_name.split('_')
    if len(parts) > 1:
        return parts[0]  # Return the document type based on the filename pattern
    return "Unknown"

def process_folder(folder_path):
    """Process all PDFs in a given folder."""
    data = []
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.pdf'):
            file_path = os.path.join(folder_path, file_name)
            extracted_text = extract_text_from_pdf(file_path)
            label = extract_label_from_filename(file_name)
            data.append({"Document Filename": file_name, "Text": extracted_text, "Label": label})
    return data

def process_folders(base_folder_path):
    """Process all PDFs in the subfolders of the base folder."""
    all_data = []
    
    # Iterate through folders
    for i in range(2, 52):
        if i not in (29, 39):  # Exclude folders 29 and 39
            folder_path = os.path.join(base_folder_path, f'20320{i:02}')
            if os.path.exists(folder_path):
                print(f'Processing folder: {folder_path}')
                folder_data = process_folder(folder_path)
                all_data.extend(folder_data)
    
    # Create DataFrame and save to CSV
    df = pd.DataFrame(all_data)
    csv_path = os.path.join(base_folder_path, "extracted_texts.csv")
    df.to_csv(csv_path, index=False)
    print(f"Data has been saved to {csv_path}")

if __name__ == "__main__":
    base_folder_path = r"D:\dataset\MSc DS"  # Replace with your directory path
    process_folders(base_folder_path)


Processing folder: D:\dataset\MSc DS\2032002
Processing folder: D:\dataset\MSc DS\2032003
Error processing D:\dataset\MSc DS\2032003\pan_card_2032003.pdf: Unable to get page count. Is poppler installed and in PATH?
Processing folder: D:\dataset\MSc DS\2032004
Error processing D:\dataset\MSc DS\2032004\aadhar_back_2032004.pdf: Unable to get page count. Is poppler installed and in PATH?


Multiple definitions in dictionary at byte 0x87ee9 for key /Info
Multiple definitions in dictionary at byte 0x87ef5 for key /Info
Multiple definitions in dictionary at byte 0x87f01 for key /Info
Multiple definitions in dictionary at byte 0x8aa6c for key /Info


Error processing D:\dataset\MSc DS\2032004\birth_certificate_2032004.pdf: Unable to get page count. Is poppler installed and in PATH?
Error processing D:\dataset\MSc DS\2032004\id_card_back_2032004.pdf: Unable to get page count. Is poppler installed and in PATH?
Error processing D:\dataset\MSc DS\2032004\id_card_front_2032004.pdf: Unable to get page count. Is poppler installed and in PATH?
Error processing D:\dataset\MSc DS\2032004\mark_sheet_1_2032004.pdf: Unable to get page count. Is poppler installed and in PATH?


Multiple definitions in dictionary at byte 0x8aa78 for key /Info
Multiple definitions in dictionary at byte 0x8aa84 for key /Info
Multiple definitions in dictionary at byte 0x8aa6c for key /Info
Multiple definitions in dictionary at byte 0x8aa78 for key /Info
Multiple definitions in dictionary at byte 0x8aa84 for key /Info


Error processing D:\dataset\MSc DS\2032004\mark_sheet_2_2032004.pdf: Unable to get page count. Is poppler installed and in PATH?
Error processing D:\dataset\MSc DS\2032004\mark_sheet_3_2032004.pdf: Unable to get page count. Is poppler installed and in PATH?
Error processing D:\dataset\MSc DS\2032004\mark_sheet_4_2032004.pdf: Unable to get page count. Is poppler installed and in PATH?
Error processing D:\dataset\MSc DS\2032004\mark_sheet_5_2032004.pdf: Unable to get page count. Is poppler installed and in PATH?
Error processing D:\dataset\MSc DS\2032004\mark_sheet_6_2032004.pdf: Unable to get page count. Is poppler installed and in PATH?
Error processing D:\dataset\MSc DS\2032004\mark_sheet_8_2032004.pdf: Unable to get page count. Is poppler installed and in PATH?
Error processing D:\dataset\MSc DS\2032004\mark_sheet_9_2032004.pdf: Unable to get page count. Is poppler installed and in PATH?
Processing folder: D:\dataset\MSc DS\2032005
Processing folder: D:\dataset\MSc DS\2032006
Error p

In [65]:
pip install pdfplumber


Collecting pdfplumber
  Downloading pdfplumber-0.11.3-py3-none-any.whl.metadata (41 kB)
     ---------------------------------------- 0.0/42.0 kB ? eta -:--:--
     -------------------------------------- 42.0/42.0 kB 992.3 kB/s eta 0:00:00
Collecting pdfminer.six==20231228 (from pdfplumber)
  Using cached pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Using cached pypdfium2-4.30.0-py3-none-win_amd64.whl.metadata (48 kB)
Downloading pdfplumber-0.11.3-py3-none-any.whl (59 kB)
   ---------------------------------------- 0.0/59.2 kB ? eta -:--:--
   ---------------------------------------- 59.2/59.2 kB 1.6 MB/s eta 0:00:00
Using cached pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
Using cached pypdfium2-4.30.0-py3-none-win_amd64.whl (2.9 MB)
Installing collected packages: pypdfium2, pdfminer.six, pdfplumber
Successfully installed pdfminer.six-20231228 pdfplumber-0.11.3 pypdfium2-4.30.0
Note: you may need to restart the kernel to 


[notice] A new release of pip is available: 24.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [63]:
import os

# Define the folder path
folder_path = 'D:/dataset/MSc DS/2032051'

# Extract the folder number from the folder name
folder_number = os.path.basename(folder_path)

# List all files in the folder
files = os.listdir(folder_path)

# Rename each file by appending the folder number to the filename
for file_name in files:
    # Construct the old file path
    old_file_path = os.path.join(folder_path, file_name)
    
    # Split the file name and extension
    name, ext = os.path.splitext(file_name)
    
    # Create the new file name with the folder number appended
    new_file_name = f"{name}_{folder_number}{ext}"
    
    # Construct the new file path
    new_file_path = os.path.join(folder_path, new_file_name)
    
    # Rename the file
    os.rename(old_file_path, new_file_path)

print("Files have been renamed successfully.")


Files have been renamed successfully.


In [69]:
import os
import pdfplumber
import pandas as pd

# Specify the path to the folder containing resumes
folder_path = r"D:\dataset\RESUMES  (File responses)"

def extract_text_from_pdf(pdf_path):
    text = ''
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() or ''
    return text

def extract_label_from_filename(filename):
    base_name = os.path.splitext(filename)[0]
    # Extract the label based on your specific filename format
    parts = base_name.split('_')
    if len(parts) > 1:
        return parts[0]  # Return the document type
    return "Unknown"

def process_folder(folder_path):
    data = []
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.pdf'):
            file_path = os.path.join(folder_path, file_name)
            extracted_text = extract_text_from_pdf(file_path)
            label = extract_label_from_filename(file_name)
            data.append({"Document Filename": file_name, "Text": extracted_text, "Label": label})
    return data

# Process the specified folder
all_data = process_folder(folder_path)

# Create DataFrame and save to CSV
df = pd.DataFrame(all_data)
df.to_csv("resume.csv", index=False)


In [71]:
import pandas as pd

# Paths to the CSV files
csv_file1 = r"D:\Mini Projects\AI_Legal_Document_Analyzer\data\extracted_texts.csv"
csv_file2 = r"D:\Mini Projects\AI_Legal_Document_Analyzer\data\resume.csv"

# Load the CSV files into DataFrames
df1 = pd.read_csv(csv_file1)
df2 = pd.read_csv(csv_file2)

# Concatenate the DataFrames
merged_df = pd.concat([df1, df2], ignore_index=True)

# Save the merged DataFrame to a new CSV file
merged_df.to_csv('merged_file.csv', index=False)
