In [19]:
import os
import fitz
import pandas as pd
from PIL import Image
from docx import Document

import pytesseract 

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
import joblib

# ✅ Point pytesseract at the Windows Tesseract install when it is present.
# On machines without this file, keep pytesseract's default command so the
# `tesseract` binary is looked up on PATH instead of failing on a bad path.
_TESSERACT_EXE = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
if os.path.isfile(_TESSERACT_EXE):
    pytesseract.pytesseract.tesseract_cmd = _TESSERACT_EXE

# 📄 Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, joined by single spaces.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        One string containing each page's ``get_text()`` output separated
        by a space.
    """
    # Use the document as a context manager so the underlying file handle is
    # released even if text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        return " ".join(page.get_text() for page in doc)

# 🖼️ Extract text from image using pytesseract
def extract_text_from_image(image_path):
    """Run Tesseract OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image file to open with PIL.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # Image.open keeps the file open until the image is closed; the context
    # manager closes it as soon as OCR has finished (or failed).
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)

# 📃 Extract text from DOCX
def extract_text_from_docx(docx_path):
    """Return the paragraph text of a DOCX file, one paragraph per line.

    Args:
        docx_path: Path of the ``.docx`` file to load with python-docx.

    Returns:
        The ``text`` of every entry in the document's ``paragraphs``,
        joined with newlines.
    """
    paragraphs = Document(docx_path).paragraphs
    lines = [paragraph.text for paragraph in paragraphs]
    return "\n".join(lines)

# 🧠 Smart file handler
def extract_text_from_file(file_path):
    """Extract text from a file, choosing the extractor by file extension.

    The extension is compared case-insensitively. ``.pdf``, ``.jpg``,
    ``.jpeg``, ``.png`` and ``.docx`` are handled; anything else prints a
    warning and yields an empty string.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text, or ``""`` for an unsupported extension.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix == '.pdf':
        return extract_text_from_pdf(file_path)
    if suffix in ('.jpg', '.jpeg', '.png'):
        return extract_text_from_image(file_path)
    if suffix == '.docx':
        return extract_text_from_docx(file_path)

    # Nothing matched: report the extension and fall back to empty text.
    print(f"⚠️ Unsupported file type: {suffix}")
    return ""

# 📂 Load and label documents
def load_documents(folder_path):
    """Build a labelled text dataset from the files directly inside a folder.

    Each regular file is run through ``extract_text_from_file``. Files whose
    extracted text is empty or whitespace-only are dropped. The label is the
    file name without its final extension, with underscores replaced by
    spaces and lower-cased.

    Args:
        folder_path: Directory whose entries are read (not recursive).

    Returns:
        A ``pandas.DataFrame`` with columns ``"text"`` and ``"label"``, one
        row per file that produced non-empty text.
    """
    rows = []
    # os.listdir returns entries in arbitrary order; sort for a reproducible
    # row order across runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        # Sub-directories are not documents; skip them instead of handing
        # them to the extractor.
        if not os.path.isfile(file_path):
            continue
        text = extract_text_from_file(file_path)
        if not text.strip():  # Only keep non-empty docs
            continue
        # Take the stem via splitext so only the trailing extension is
        # removed, not every occurrence of that substring in the name.
        stem = os.path.splitext(filename)[0]
        rows.append((text, stem.replace("_", " ").lower()))
    return pd.DataFrame(rows, columns=["text", "label"])

# 🚀 Load data
df = load_documents("./my_docs/")
print(df.head())

# Fail early with a clear message instead of an opaque error from the split.
if df.empty:
    raise ValueError("No documents with extractable text found in ./my_docs/")

# 🔍 Train-test split (20% held out, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=42
)

# 🤖 Model pipeline: TF-IDF features fed into a multinomial naive Bayes classifier
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(X_train, y_train)

# 📊 Evaluate
y_pred = model.predict(X_test)
# zero_division=0 reports 0.0 for labels with no predicted or no true samples
# (common with a tiny test split) without emitting undefined-metric warnings.
print(classification_report(y_test, y_pred, zero_division=0))

# 💾 Save model
joblib.dump(model, "customs_doc_classifier.pkl")


⚠️ Unsupported file type: 
                                                text                     label
0  Dats BILL OF LADING Page 1 of\n\nName Bill of ...            bill of lading
1  6\nDate:\nBILL OF LADING\nPage 1 of ______\nSH...            bill of lading
2  Certificate of Origin Template\n\nCertificate ...  certificate-of-origin-eg
3   \n \nAttachment 3-3-1 \nOriginal /Duplicate \...     certificate of origin
4   \n \n \nIf there is a designated broker for t...        commercial invoice
                       precision    recall  f1-score   support

       bill of lading       0.00      0.00      0.00         1
certificate of origin       0.00      0.00      0.00         0
         packing list       1.00      1.00      1.00         1

             accuracy                           0.50         2
            macro avg       0.33      0.33      0.33         2
         weighted avg       0.50      0.50      0.50         2



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


['customs_doc_classifier.pkl']