# Document Classification - Data Extraction
## Extract text from Word and PDF documents

In [1]:
# !pip install docx

In [2]:
# import os
# import zipfile
# import warnings

# import pandas as pd
# import docx
# import PyPDF2

# warnings.filterwarnings('ignore')

# # =============================
# # 1. UNZIP THE DATASET
# # =============================

# # Path to your ZIP file
# # If you're running locally and the zip is in the same folder as your script/notebook:
# # zip_path = "P608-Dataset.zip"
# zip_path = "P608-Dataset.zip"   # <-- change this if needed

# # Where to extract
# extract_to = "."   # current directory, or give a folder path like "data"

# # Extract the zip
# with zipfile.ZipFile(zip_path, 'r') as zf:
#     zf.extractall(extract_to)

# print("Zip extracted successfully!")

# # After extraction, we expect something like: ./P608-Dataset/Resumes_Docx/...
# base_folder = os.path.join(extract_to, "P608-Dataset", "Resumes_Docx")

# print("Base folder for resumes:", base_folder)
# print("Subfolders:", os.listdir(base_folder))


# # =============================
# # 2. HELPER FUNCTIONS
# # =============================

# def extract_text_from_docx(file_path):
#     """Extract text from Word document (.docx)"""
#     try:
#         doc = docx.Document(file_path)
#         text = '\n'.join([para.text for para in doc.paragraphs])
#         return text
#     except Exception as e:
#         print(f"Error reading {file_path}: {e}")
#         return ""


# def extract_text_from_pdf(file_path):
#     """Extract text from PDF document"""
#     try:
#         with open(file_path, 'rb') as file:
#             pdf_reader = PyPDF2.PdfReader(file)
#             text = ''
#             for page in pdf_reader.pages:
#                 page_text = page.extract_text()
#                 if page_text:
#                     text += page_text
#         return text
#     except Exception as e:
#         print(f"Error reading {file_path}: {e}")
#         return ""


# # =============================
# # 3. MAIN LOADER FUNCTION
# # =============================

# def load_documents(base_path):
#     """
#     Load all documents from category folders and create a dataset.
#     base_path -> folder that contains 'Peoplesoft', 'React Developer', etc.
#     """
#     data = []
    
#     # Map folder names to category labels
#     categories = {
#         'Peoplesoft': 'Peoplesoft',
#         'Peoplesoft Resume': 'Peoplesoft',
#         'React Developer': 'React Developer',
#         'SQL Developer': 'SQL Developer',
#         'workday': 'Workday'
#     }
    
#     for folder_name, category in categories.items():
#         folder_path = os.path.join(base_path, folder_name)
        
#         if not os.path.exists(folder_path):
#             print(f"Folder not found, skipping: {folder_path}")
#             continue
            
#         print(f"\nProcessing folder: {folder_name} -> category: {category}")
        
#         for filename in os.listdir(folder_path):
#             file_path = os.path.join(folder_path, filename)
            
#             # Skip if it's a directory
#             if os.path.isdir(file_path):
#                 continue
            
#             # Read text based on file extension
#             if filename.lower().endswith(('.docx', '.doc')):
#                 text = extract_text_from_docx(file_path)
#             elif filename.lower().endswith('.pdf'):
#                 text = extract_text_from_pdf(file_path)
#             else:
#                 # Skip other file types
#                 print(f"Skipping unsupported file: {file_path}")
#                 continue
            
#             # Only add non-empty text
#             if text.strip():
#                 data.append({
#                     'filename': filename,
#                     'category': category,
#                     'text': text,
#                     'text_length': len(text)
#                 })
#             else:
#                 print(f"No text extracted from: {file_path}")
    
#     # Convert to DataFrame
#     df = pd.DataFrame(data)
#     return df


# # =============================
# # 4. RUN EVERYTHING
# # =============================

# df = load_documents(base_folder)

# print("\nSample rows:")
# print(df.head())

# print("\nTotal resumes loaded:", len(df))

# # (Optional) Save to CSV to reuse later
# df.to_csv("resumes_dataset.csv", index=False)
# print("\nSaved dataset to 'resumes_dataset.csv'")


In [3]:
# !pip install PyPDF2

In [4]:
import zipfile
import pandas as pd
from docx import Document      # from python-docx
import PyPDF2
import warnings

warnings.filterwarnings('ignore')

# ---------- Helpers to extract text ----------

def extract_text_from_docx_file(file_obj):
    """
    Extract text from a DOCX file stored inside a ZIP.
    file_obj is a file-like object (from z.open()).
    """
    try:
        doc = Document(file_obj)
        text = "\n".join([para.text for para in doc.paragraphs])
        return text
    except Exception as e:
        print(f"Error reading DOCX: {e}")
        return ""


def extract_text_from_pdf_file(file_obj):
    """
    Extract text from a PDF file stored inside a ZIP.
    file_obj is a file-like object (from z.open()).
    """
    try:
        reader = PyPDF2.PdfReader(file_obj)
        text = ""
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text
        return text
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return ""


# ---------- Main loader from ZIP ----------

def load_documents_from_zip(zip_path='P608-Dataset.zip'):
    """
    Load all DOCX/PDF resumes from inside the ZIP and
    return them as a pandas DataFrame.
    """
    # Map folder names to category labels
    categories = {
        'Peoplesoft': 'Peoplesoft',
        'Peoplesoft Resume': 'Peoplesoft',
        'React Developer': 'React Developer',
        'SQL Developer': 'SQL Developer',
        'workday': 'Workday'
    }

    data = []

    with zipfile.ZipFile(zip_path, 'r') as z:
        for file_name in z.namelist():
            # Skip folders
            if file_name.endswith('/'):
                continue

            # We only care about DOCX and PDF
            lower_name = file_name.lower()
            if not (lower_name.endswith('.docx') or lower_name.endswith('.pdf')):
                continue

            # Find which folder/category this file belongs to
            parts = file_name.split('/')
            folder_name = None
            for part in parts:
                if part in categories:
                    folder_name = part
                    break

            if folder_name is None:
                # File not in any of our category folders
                continue

            category = categories[folder_name]

            # Read file content from inside ZIP
            with z.open(file_name) as f:
                if lower_name.endswith('.docx'):
                    text = extract_text_from_docx_file(f)
                else:  # PDF
                    text = extract_text_from_pdf_file(f)

            # Skip completely empty text
            if text and text.strip():
                data.append({
                    'filename': file_name,
                    'category': category,
                    'text': text,
                    'text_length': len(text)
                })

    return pd.DataFrame(data)





  from pandas.core import (


In [5]:
# pip install python-docx


In [6]:
# ---------- Run the loader & show summary ----------

df = load_documents_from_zip('P608-Dataset.zip')

print(f"Total documents loaded: {len(df)}")
print("\nCategory distribution:")
if not df.empty:
    print(df['category'].value_counts())
else:
    print("No documents found. Check ZIP structure or category folder names.")



Error reading DOCX: "no relationship of type 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument' in collection"


PdfReader stream/file object is not in binary mode. It may not be read correctly.


Total documents loaded: 53

Category distribution:
category
React Developer    21
SQL Developer      11
Workday            11
Peoplesoft         10
Name: count, dtype: int64


In [7]:
df.head()

Unnamed: 0,filename,category,text,text_length
0,P608-Dataset/Resumes_Docx/Peoplesoft Resume/Pe...,Peoplesoft,Anubhav Kumar Singh\t\t\n\n To work in a gl...,7256
1,P608-Dataset/Resumes_Docx/Peoplesoft Resume/Pe...,Peoplesoft,Murali\n\nExperience Summary \n\nI have 6 year...,3981
2,P608-Dataset/Resumes_Docx/Peoplesoft Resume/Pe...,Peoplesoft,\n\n\n\n\n\n\n\n\n\n\n\n\n\nPROFILE SUMMARY\n\...,3646
3,P608-Dataset/Resumes_Docx/Peoplesoft Resume/Pe...,Peoplesoft,PeopleSoft Admin\nVARKALA VIKAS\n\nCareer Obj...,7265
4,P608-Dataset/Resumes_Docx/Peoplesoft Resume/Pe...,Peoplesoft,PeopleSoft Administration\n \nVivekanand Sayan...,15339


In [8]:
# Optional: save to CSV
df.to_csv('extracted_documents.csv', index=False)
print("\nData saved to extracted_documents.csv")


Data saved to extracted_documents.csv
