<a href="https://colab.research.google.com/github/Richik06/Richik06/blob/main/INVOICE_CATEGORIZER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import joblib

# Sample training data, written as (line-item text, category) pairs so each
# description sits right next to its label.
labelled_examples = [
    ("Zomato order chicken biryani", "Food"),
    ("Swiggy delivery paneer tikka", "Food"),
    ("Uber ride to airport", "Travel"),
    ("Ola cab ride", "Travel"),
    ("Indigo flight to Mumbai", "Travel"),
    ("BSNL broadband bill", "Utilities"),
    ("Electricity bill", "Utilities"),
    ("Amazon purchase headphone", "Shopping"),
    ("Flipkart shopping mobile", "Shopping"),
    ("Recharge Airtel prepaid", "Utilities"),
    ("Dominos pizza order", "Food"),
    ("Train ticket IRCTC", "Travel"),
    ("Metro smartcard recharge", "Travel"),
    ("Gas cylinder BharatGas", "Utilities"),
]

# Column-oriented view of the same examples
data = {
    "text": [text for text, _ in labelled_examples],
    "category": [label for _, label in labelled_examples],
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Step 1: Text vectorization (TF-IDF features from the text column)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["text"])  # Features
y = df["category"]                        # Labels

# Step 2: Train classifier (fit() returns the fitted estimator itself)
model = MultinomialNB().fit(X, y)

# Step 3: Save model and vectorizer to the working directory
for artifact, filename in ((model, "model.pkl"), (vectorizer, "vectorizer.pkl")):
    joblib.dump(artifact, filename)

print("✅ Model and vectorizer saved as 'model.pkl' and 'vectorizer.pkl'")


✅ Model and vectorizer saved as 'model.pkl' and 'vectorizer.pkl'


In [5]:
import streamlit as st
import pandas as pd
import easyocr
import joblib
import tempfile
import os
from pdf2image import convert_from_path
from PIL import Image

# ---------- Load model and vectorizer ----------
@st.cache_resource
def load_model():
    """Load the trained classifier and its fitted vectorizer from disk.

    Reads "model.pkl" and then "vectorizer.pkl" (relative paths) with
    joblib. The function is wrapped in ``st.cache_resource``.

    Returns:
        tuple: ``(model, vectorizer)``, in that order.
    """
    # NOTE: joblib artifacts are pickle-based; only load files you trust.
    artifact_files = ("model.pkl", "vectorizer.pkl")
    return tuple(joblib.load(artifact_file) for artifact_file in artifact_files)


model, vectorizer = load_model()

# ---------- OCR Function ----------
@st.cache_resource
def _get_ocr_reader():
    """Create the English EasyOCR reader once and hand back the cached instance.

    Constructing ``easyocr.Reader`` is the expensive part of OCR, and the
    Streamlit script is re-executed on every widget interaction, so the reader
    is cached the same way ``load_model`` caches the classifier.
    NOTE(review): cached resources are shared between sessions — confirm the
    reader is safe to share if the app serves concurrent users.
    """
    return easyocr.Reader(['en'])


def run_ocr(file_path):
    """Run OCR on an image file and return the recognised text.

    Args:
        file_path: Path of the image to read.

    Returns:
        The result of ``Reader.readtext(file_path, detail=0)``; ``detail=0``
        requests the text only, without bounding boxes or confidences.
    """
    return _get_ocr_reader().readtext(file_path, detail=0)

# ---------- File Upload ----------
st.title("🧾 Invoice/Bill Digitizer & Categorizer")
st.write("Upload your invoice image or PDF to auto-categorize the line items into Food, Travel, Utilities, etc.")

uploaded_file = st.file_uploader("Upload Invoice", type=["jpg", "jpeg", "png", "pdf"])

if uploaded_file:
    # Every scratch file written for this upload; all are removed in `finally`
    # so repeated uploads/reruns do not pile up files in the temp directory.
    scratch_paths = []
    try:
        # Keep the upload's extension on the temp file so readers that look at
        # the file name still see ".pdf", ".png", ...
        upload_suffix = os.path.splitext(uploaded_file.name)[1].lower()
        with tempfile.NamedTemporaryFile(delete=False, suffix=upload_suffix) as tmp_file:
            temp_path = tmp_file.name
            scratch_paths.append(temp_path)
            # getvalue() returns the full upload regardless of stream position.
            tmp_file.write(uploaded_file.getvalue())

        # Convert PDF to images: one JPEG per page, so multi-page invoices are
        # read in full and a PDF that yields no pages simply produces no text.
        if uploaded_file.name.lower().endswith(".pdf"):
            image_paths = []
            for page_number, page in enumerate(convert_from_path(temp_path), start=1):
                page_path = f"{temp_path}.page{page_number}.jpg"
                scratch_paths.append(page_path)
                page.save(page_path, 'JPEG')
                image_paths.append(page_path)
        else:
            image_paths = [temp_path]

        # ---------- OCR + Prediction ----------
        st.info("🔍 Running OCR on your invoice...")
        ocr_lines = []
        for image_path in image_paths:
            ocr_lines.extend(run_ocr(image_path))
    finally:
        # Scratch files are only needed while OCR runs.
        for scratch_path in scratch_paths:
            if os.path.exists(scratch_path):
                os.remove(scratch_path)

    if len(ocr_lines) == 0:
        st.warning("No text found in invoice.")
    else:
        st.success("✅ Text successfully extracted from invoice.")

        # Predict categories: one prediction per OCR text snippet
        X_test = vectorizer.transform(ocr_lines)
        preds = model.predict(X_test)

        # Combine results
        df = pd.DataFrame({"Line Item": ocr_lines, "Predicted Category": preds})
        st.dataframe(df)

        # Downloadable CSV
        csv = df.to_csv(index=False).encode('utf-8')
        st.download_button(
            label="📥 Download Categorized Data as CSV",
            data=csv,
            file_name='categorized_invoice.csv',
            mime='text/csv',
        )




In [2]:
%pip install streamlit pandas easyocr joblib pdf2image Pillow

Collecting streamlit
  Downloading streamlit-1.47.0-py3-none-any.whl.metadata (9.0 kB)
Collecting easyocr
  Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m394.5 kB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.6.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.3.0.post6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.0 kB)
Collecting ninja (from easyocr)
  Downloading ninja-1.11.1.4-py3-none-manylinux_2_12_x86

In [6]:
import os

# Show the entries of the current working directory, where the training cell
# wrote model.pkl and vectorizer.pkl
os.listdir(".")


['.config', 'model.pkl', 'vectorizer.pkl', 'sample_data']

In [7]:
# Print a confirmation banner followed by the current directory listing
directory_entries = os.listdir()
print("✅ Model and vectorizer saved!")
print("📁 Files in directory:", directory_entries)


✅ Model and vectorizer saved!
📁 Files in directory: ['.config', 'model.pkl', 'vectorizer.pkl', 'sample_data']


In [8]:
from google.colab import files
# Trigger a browser download for each saved artifact (google.colab helper)
for artifact_name in ("model.pkl", "vectorizer.pkl"):
    files.download(artifact_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>