<a href="https://colab.research.google.com/github/Richik06/Richik06/blob/main/INVOICE_CATEGORIZER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import joblib

In [13]:
import pandas as pd

# Read the cleaned invoice dataset and relabel its columns in a single chain.
# The relabel is positional: the first column becomes "text", the second "category".
df = pd.read_csv("cleaned_final_dataset.csv").set_axis(["text", "category"], axis=1)

# Quick sanity check: first rows, then the resulting column index.
for preview in (df.head(), df.columns):
    print(preview)

                                                text       category
0  Zomato order #3452, Tandoori momos and cold dr...           Food
1                 Uber ride from work to home $12.50         Travel
2             BSNL Broadband monthly bill - Aug ₹750      Utilities
3  PVR Cinemas: ticket for Oppenheimer, 2 adults ...  Entertainment
4  Apollo Pharmacy: Paracetamol & Cough Syrup $15.75         Health
Index(['text', 'category'], dtype='object')


In [14]:
# ✅ Step 2: Drop rows that are missing either the text or its label
required_columns = ["text", "category"]
df = df.dropna(subset=required_columns)

# ✅ Step 3: Turn the text into TF-IDF features and fit Naive Bayes on them
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["text"])  # Features
y = df["category"]                        # Labels

# fit() returns the estimator itself, so construction and training can be chained.
model = MultinomialNB().fit(X, y)

# ✅ Step 4: Persist both artifacts (model first, then vectorizer)
for artifact, artifact_path in ((model, "model.pkl"), (vectorizer, "vectorizer.pkl")):
    joblib.dump(artifact, artifact_path)

print("✅ New model trained and saved with large dataset")

✅ New model trained and saved with large dataset


In [15]:
import streamlit as st
import pandas as pd
import easyocr
import joblib
import tempfile
import os
from pdf2image import convert_from_path
from PIL import Image

# ---------- Load model and vectorizer ----------
@st.cache_resource
def load_model():
    """Load the persisted classifier and vectorizer from the working directory.

    Returns:
        tuple: ``(model, vectorizer)`` as deserialized by ``joblib.load`` from
        ``model.pkl`` and ``vectorizer.pkl``, in that order.
    """
    artifact_paths = ("model.pkl", "vectorizer.pkl")
    # The generator is consumed left to right, so the model is read before the vectorizer.
    return tuple(joblib.load(artifact_path) for artifact_path in artifact_paths)

# Bind the artifacts at module level; load_model is wrapped in st.cache_resource,
# and the later prediction code reads these two names directly.
model, vectorizer = load_model()

# ---------- OCR Function ----------
@st.cache_resource
def _get_ocr_reader():
    """Build the English EasyOCR reader once and reuse it on later calls.

    Constructing ``easyocr.Reader`` is the expensive part of OCR, so it is
    cached with ``st.cache_resource`` (the same mechanism ``load_model`` uses)
    instead of being rebuilt for every invoice and every Streamlit rerun.
    NOTE(review): the cached reader is shared by all sessions of the app —
    confirm that concurrent ``readtext`` calls are acceptable for your deployment.
    """
    return easyocr.Reader(['en'])


def run_ocr(file_path):
    """Extract the text found in an image file.

    Args:
        file_path: Path of the image handed to ``Reader.readtext``.

    Returns:
        The value returned by ``readtext(file_path, detail=0)``; the caller
        treats it as a sequence of text strings, one per detected region.
    """
    return _get_ocr_reader().readtext(file_path, detail=0)

# ---------- File Upload ----------
st.title("🧾 Invoice/Bill Digitizer & Categorizer")
st.write("Upload your invoice image or PDF to auto-categorize the line items into Food, Travel, Utilities, etc.")

uploaded_file = st.file_uploader("Upload Invoice", type=["jpg", "jpeg", "png", "pdf"])

if uploaded_file:
    # Every file written for this upload is recorded here so the `finally`
    # clause can delete it: NamedTemporaryFile(delete=False) never cleans up
    # on its own, and the per-page JPEGs are created by hand.
    scratch_paths = []
    try:
        with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
            temp_path = tmp_file.name
            scratch_paths.append(temp_path)
            # getvalue() returns the whole buffer regardless of the current
            # read position, unlike read().
            tmp_file.write(uploaded_file.getvalue())

        # Convert every PDF page to its own JPEG so multi-page invoices are
        # fully processed; plain images are used as uploaded.
        if uploaded_file.name.lower().endswith(".pdf"):
            image_paths = []
            for page_number, page in enumerate(convert_from_path(temp_path), start=1):
                page_path = f"{temp_path}.page{page_number}.jpg"
                # Track the path before saving so a partially written file is
                # still cleaned up if save() fails.
                scratch_paths.append(page_path)
                page.save(page_path, 'JPEG')
                image_paths.append(page_path)
        else:
            image_paths = [temp_path]

        # ---------- OCR + Prediction ----------
        st.info("🔍 Running OCR on your invoice...")
        ocr_lines = []
        for image_path in image_paths:
            ocr_lines.extend(run_ocr(image_path))

        # Also covers a PDF that produced no pages at all.
        if not ocr_lines:
            st.warning("No text found in invoice.")
        else:
            st.success("✅ Text successfully extracted from invoice.")

            # Predict categories
            X_test = vectorizer.transform(ocr_lines)
            preds = model.predict(X_test)

            # Combine results (named distinctly so the training frame `df`
            # is not overwritten)
            results_df = pd.DataFrame({"Line Item": ocr_lines, "Predicted Category": preds})
            st.dataframe(results_df)

            # Downloadable CSV
            csv_bytes = results_df.to_csv(index=False).encode('utf-8')
            st.download_button(
                label="📥 Download Categorized Data as CSV",
                data=csv_bytes,
                file_name='categorized_invoice.csv',
                mime='text/csv',
            )
    finally:
        # Remove the uploaded copy and any page images, even when conversion,
        # OCR or prediction raised.
        for scratch_path in scratch_paths:
            if os.path.exists(scratch_path):
                os.remove(scratch_path)


2025-08-04 21:02:40.038 
  command:

    streamlit run c:\Users\hnegi\INVOICE_CATEGORIZER\venv\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


In [16]:
import os

# Show what is in the working directory (expected to include the saved .pkl artifacts).
os.listdir(".")


['.git',
 'cleaned_final_dataset.csv',
 'INVOICE_CATEGORIZER.ipynb',
 'INVOICE_CATEGORIZER.py',
 'model.pkl',
 'README.md',
 'requirements.txt',
 'vectorizer.pkl',
 'venv']

In [None]:
# Snapshot the working directory once, then report it after the confirmation line.
directory_entries = os.listdir()
print("✅ Model and vectorizer saved!")
print("📁 Files in directory:", directory_entries)


✅ Model and vectorizer saved!
📁 Files in directory: ['.git', 'cleaned_final_dataset.csv', 'INVOICE_CATEGORIZER.ipynb', 'INVOICE_CATEGORIZER.py', 'model.pkl', 'README.md', 'requirements.txt', 'vectorizer.pkl', 'venv']


: 