<a href="https://colab.research.google.com/github/MaCroDmT/SSL-Intern/blob/main/Image_to_Excel_Converter_(Colab).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install the pytesseract Python wrapper so that a later `import pytesseract` succeeds.
!pip install pytesseract

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [8]:
# ==============================================================================
# Colab Image to Excel Converter
# This script is designed to run in a Google Colab environment.
# It uses Tesseract OCR to extract text from an uploaded image (JPEG/PNG),
# attempts to structure the text, and saves the result to an Excel file (.xlsx).
#
# NOTE: This version uses advanced coordinate-based logic for table reconstruction.
# CRITICAL UPDATE: It now filters columns based on frequency (keeping only the most
# consistent columns) to aggressively eliminate scattered noise from headers/titles.
# ==============================================================================

import pandas as pd
import pytesseract
from PIL import Image
import io
import os
from google.colab import files

# --- 1. Installation and Setup ---
# Tesseract is an external program that needs to be installed on the Colab machine.
print("--- Installing Tesseract OCR Engine ---")
!sudo apt install tesseract-ocr
# Install the Python bindings for Tesseract and pandas for Excel output
!pip install pytesseract pandas

# --- 2. Core Functions ---

def _extract_confident_words(data, min_conf=60):
    """Return the OCR tokens that carry usable text.

    Args:
        data: DataFrame produced by ``pytesseract.image_to_data`` with
            ``output_type=Output.DATAFRAME``; the columns ``text``, ``conf``,
            ``top`` and ``left`` are read.
        min_conf: Tokens whose confidence is not strictly greater than this
            value are dropped (confidence <= 60 is often unreliable).

    Returns:
        A new DataFrame holding only the rows with non-blank text, sufficient
        confidence and numeric coordinates. ``text`` is converted to stripped
        strings so it can be joined safely later.
    """
    words = data[data['text'].notna()].copy()
    # pandas may have parsed an all-numeric text column as numbers; joining
    # cell contents needs real strings.
    words['text'] = words['text'].astype(str).str.strip()
    # Coerce defensively so a malformed field becomes NaN instead of breaking
    # the numeric comparisons below.
    for column in ('conf', 'top', 'left'):
        words[column] = pd.to_numeric(words[column], errors='coerce')

    usable = (
        (words['conf'] > min_conf)
        & (words['text'] != '')  # also rejects whitespace-only tokens
        & words['top'].notna()
        & words['left'].notna()
    )
    return words[usable]


def _assign_row_ids(words, top_tolerance=10):
    """Label each word with a row number based on vertical gaps.

    ``words`` must already be sorted by ``top``. A new row starts whenever a
    word's ``top`` exceeds the previous word's ``top`` by more than
    ``top_tolerance`` pixels.
    """
    # diff() is NaN for the first word, which compares False and so yields row 0.
    return (words['top'].diff() > top_tolerance).cumsum()


def _cluster_column_centers(lefts, column_tolerance=30):
    """Greedily cluster ``left`` coordinates into column anchor positions.

    Walking the distinct coordinates in ascending order, a new anchor is
    opened whenever a coordinate lies more than ``column_tolerance`` pixels to
    the right of the most recent anchor.
    """
    centers = []
    for left in sorted(set(lefts)):
        if not centers or left - centers[-1] > column_tolerance:
            centers.append(left)
    return centers


def _nearest_center_index(left, centers):
    """Return the index of the anchor closest to ``left`` (first one on ties)."""
    return min(range(len(centers)), key=lambda i: abs(left - centers[i]))


def _build_table(words, top_tolerance=10, column_tolerance=30, max_columns=25):
    """Rebuild a row/column grid from word bounding boxes.

    Args:
        words: DataFrame with string ``text`` and numeric ``top``/``left``
            columns (see ``_extract_confident_words``).
        top_tolerance: Vertical tolerance in pixels for grouping words into rows.
        column_tolerance: Horizontal tolerance in pixels for column clustering.
        max_columns: Only this many of the most frequently populated columns
            are kept; words in the remaining columns are treated as noise.

    Returns:
        A list of rows, each a list of cell strings with one entry per kept
        column. Rows without any content are omitted.
    """
    if words.empty:
        return []

    # Row grouping compares each word with its predecessor, so order matters.
    words = words.sort_values(by=['top', 'left']).reset_index(drop=True)
    words['row_id'] = _assign_row_ids(words, top_tolerance)

    centers = _cluster_column_centers(words['left'], column_tolerance)
    words['col_index'] = words['left'].apply(
        lambda left: _nearest_center_index(left, centers)
    )

    # Keep the most frequently used columns and renumber them 0..k-1 in
    # left-to-right order.
    kept_columns = sorted(
        words['col_index'].value_counts().head(max_columns).index
    )
    position = {original: new for new, original in enumerate(kept_columns)}
    words = words[words['col_index'].isin(kept_columns)]

    table = []
    for _, row_words in words.groupby('row_id'):
        row = [''] * len(kept_columns)
        for col_index, cell_words in row_words.groupby('col_index'):
            # Words sharing a cell are joined in reading order.
            row[position[col_index]] = ' '.join(
                cell_words.sort_values('left')['text']
            )
        table.append(row)

    return [row for row in table if any(cell.strip() for cell in row)]


def process_image_to_excel():
    """
    Handles the entire workflow: upload, OCR, data structuring, and download.

    Prompts for an upload through ``google.colab.files``, runs Tesseract on the
    first uploaded file, rebuilds a table from the word coordinates, writes it
    to ``<name>_converted_V4.xlsx`` and triggers a browser download. Only the
    first uploaded file is processed. Progress and errors are reported with
    ``print``; nothing is returned.
    """
    print("\n--- Starting Image to Excel Conversion ---")

    # 2.1 File Upload
    print("Please upload your JPEG or PNG file (e.g., a photo of a table or list).")
    uploaded = files.upload()

    if not uploaded:
        print("No file was uploaded. Exiting.")
        return

    # Get the first uploaded file name
    file_name = next(iter(uploaded))
    print(f"Successfully uploaded: {file_name}")

    try:
        # 2.2 Load Image Data
        # The upload arrives as raw bytes; the context manager releases the
        # image once OCR has finished.
        with Image.open(io.BytesIO(uploaded[file_name])) as img:
            # 2.3 Perform OCR (Text and Data Extraction)
            print("Performing Optical Character Recognition (OCR) and extracting coordinates...")
            data = pytesseract.image_to_data(
                img,
                # PSM 6: assume a single uniform block of text (suits tables).
                config=r'--psm 6',
                output_type=pytesseract.Output.DATAFRAME,
                # Read the text column as strings and disable NA parsing so
                # numeric cells stay text and cells such as "NA" or "N/A" are
                # not silently dropped.
                pandas_config={'dtype': {'text': str}, 'keep_default_na': False},
            )

        # 2.4 Structuring the Data - coordinate-based reconstruction
        df_words = _extract_confident_words(data)
        if df_words.empty:
            print("OCR failed to reliably find text with sufficient confidence.")
            return

        final_rows = _build_table(df_words)
        if not final_rows:
            print("After cleaning, no structured data could be extracted.")
            return

        print(f"\n--- Filtered & Coordinate-Based Extracted Table Preview ({len(final_rows)} rows) ---")
        # Show only the first few reconstructed rows.
        for row in final_rows[:5]:
            print(row)
        if len(final_rows) > 5:
            print("...")
        print("----------------------------------------------------------------------")

        # 2.5 Create Pandas DataFrame
        df = pd.DataFrame(final_rows)

        # 2.6 Save to Excel and Download
        # header=False keeps the first extracted row as data, not column titles.
        output_file_name = f"{os.path.splitext(file_name)[0]}_converted_V4.xlsx"
        df.to_excel(output_file_name, index=False, header=False)

        print(f"\nSuccessfully created Excel file: {output_file_name}")
        print("Starting download...")
        files.download(output_file_name)
        print("Download complete.")

    except Exception as e:
        # Top-level boundary of an interactive cell: report the problem to the
        # user instead of surfacing a traceback.
        print(f"An error occurred during processing: {e}")
        print("Troubleshooting: Please ensure your uploaded file is a valid image (JPEG/PNG) and that it contains clear, readable text.")

# --- 3. Execution ---
# Start the workflow only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    process_image_to_excel()


--- Installing Tesseract OCR Engine ---
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.

--- Starting Image to Excel Conversion ---
Please upload your JPEG or PNG file (e.g., a photo of a table or list).


Saving Media.jpg to Media (4).jpg
Successfully uploaded: Media (4).jpg
Performing advanced Table Layout Analysis (PSM 3) and TSV extraction...
An error occurred during processing: type object 'Output' has no attribute 'TSV'
Troubleshooting: Ensure your file is a valid image (JPEG/PNG) and the text is clear.
