![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/open-source-nlp/26.0.Benchmark_Unstructured_and_SparkNLP_Files_Ingestion.ipynb)

## Load Mixed Files & Extract Text with Unstructured.io

In [None]:
import os
import requests
import subprocess

# Directory to store the files (relative to the current working directory)
output_dir = "all_files"
os.makedirs(output_dir, exist_ok=True)

# GitHub repo info
owner = "JohnSnowLabs"
repo = "spark-nlp"
path = "src/test/resources/reader"

# Fetch the top-level directory listing only; subdirectories are listed
# one request at a time by download_files() below.
url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref=master"
headers = {"Accept": "application/vnd.github.v3+json"}
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (e.g. rate limiting) right here
# instead of later as a confusing failure while iterating the payload.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
files = response.json()

# Helper function to recursively fetch files
def download_files(file_list, base_path):
    """
    Download every file described by a GitHub "contents" listing into ``output_dir``.

    Directories are followed recursively; all files are written flat into
    ``output_dir`` (sub-directory structure is not reproduced locally).

    Args:
        file_list: Parsed JSON from the GitHub contents API for a directory,
            i.e. a list of dicts with at least ``type`` plus ``name`` /
            ``download_url`` (files) or ``url`` / ``path`` (directories).
        base_path: Repository path that ``file_list`` describes. Used only to
            make error messages identifiable.

    Raises:
        RuntimeError: If ``file_list`` is not a list (the API returns a JSON
            object instead of a list when a request is rejected).
        requests.HTTPError: If listing a nested directory fails.
    """
    if not isinstance(file_list, list):
        detail = file_list.get("message", file_list) if isinstance(file_list, dict) else file_list
        raise RuntimeError(f"Unexpected GitHub API response for '{base_path}': {detail}")

    for item in file_list:
        if item["type"] == "file":
            raw_url = item["download_url"]
            # Use the API-provided name rather than the URL tail: the URL is
            # percent-encoded, so names with spaces etc. would be saved mangled.
            file_name = os.path.join(output_dir, item["name"])
            print(f"Downloading {file_name} ...")
            result = subprocess.run(["wget", "-q", raw_url, "-O", file_name])
            if result.returncode != 0:
                # "wget -O" creates the target even when the transfer fails;
                # drop the partial file so it is not benchmarked as a real input.
                print(f"WARNING: wget exited with code {result.returncode} for {raw_url}")
                if os.path.exists(file_name):
                    os.remove(file_name)
        elif item["type"] == "dir":
            nested_response = requests.get(item["url"], headers=headers, timeout=30)
            nested_response.raise_for_status()
            download_files(nested_response.json(), item["path"])

# Start recursive download: walk the top-level listing fetched above;
# nested directories are listed and followed inside download_files().
download_files(files, path)

# NOTE(review): this message is printed unconditionally — nothing here
# verifies that every individual download actually succeeded.
print("\n‚úÖ All files downloaded successfully into:", output_dir)

Downloading all_files/semicolon-delimited.csv ...
Downloading all_files/stanley-cups-utf-16.csv ...
Downloading all_files/stanley-cups.csv ...
Downloading all_files/contains-pictures.docx ...
Downloading all_files/doc-img-table.docx ...
Downloading all_files/doc-with-2images.docx ...
Downloading all_files/fake_table.docx ...
Downloading all_files/hierarchy_test.docx ...
Downloading all_files/page-breaks.docx ...
Downloading all_files/email-test-image.eml ...
Downloading all_files/email-test-image.msg ...
Downloading all_files/email-text-attachments.eml ...
Downloading all_files/test-several-attachments.eml ...
Downloading all_files/example-10k.html ...
Downloading all_files/example-bold-strong.html ...
Downloading all_files/example-caption-th.html ...
Downloading all_files/example-div.html ...
Downloading all_files/example-image-paragraph.html ...
Downloading all_files/example-images.html ...
Downloading all_files/example-mix-tags.html ...
Downloading all_files/fake-html.html ...
Downl

🧩 1️⃣ Install dependencies

In [None]:
# Install Unstructured and supporting dependencies
!pip install "unstructured[all-docs]" pillow python-magic

Collecting python-magic
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting unstructured[all-docs]
  Downloading unstructured-0.18.26-py3-none-any.whl.metadata (25 kB)
Collecting filetype (from unstructured[all-docs])
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting emoji (from unstructured[all-docs])
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Collecting dataclasses-json (from unstructured[all-docs])
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting python-iso639 (from unstructured[all-docs])
  Downloading python_iso639-2025.11.16-py3-none-any.whl.metadata (15 kB)
Collecting langdetect (from unstructured[all-docs])
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m981.5/981.5 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing me

📁 2️⃣ Set up configuration

In [None]:
from pathlib import Path

# Directory containing your documents.
# NOTE(review): presumably the same folder as the relative "all_files" download
# directory, which only holds when the notebook runs from /content (e.g. Colab) — confirm.
INPUT_DIR = Path("/content/all_files")   # Change this to your own folder path
# Output file where all extracted text will be saved
OUTPUT_FILE = Path("/content/output_all_text.txt")

⚙️ 3️⃣ Import modules and helper functions

In [None]:
import time
from unstructured.partition.auto import partition

def extract_text_from_file(filepath: "str | Path") -> str:
    """
    Partition one file with Unstructured.io and return its extracted text.

    Args:
        filepath: Location of the document, as a ``Path`` or a plain string.

    Returns:
        The non-empty ``text`` values of the partitioned elements, joined with
        newlines. An empty string is returned when partitioning raises (a
        warning is printed instead of propagating the error) or when no
        element carries any text.
    """
    # Normalise first so the warning below can use ``.name`` for str input too;
    # previously a string path crashed inside the except handler.
    filepath = Path(filepath)

    try:
        elements = partition(filename=str(filepath))
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not abort the
        # whole batch, so report it and contribute no text.
        print(f"‚ö†Ô∏è Failed to read {filepath.name}: {e}")
        return ""

    # getattr with a default already covers elements that have no ``text``
    # attribute, so no per-element exception handling is needed.
    texts = (getattr(element, "text", None) for element in elements)
    return "\n".join(txt for txt in texts if txt)

🧾 Stage 1: Load file list

In [None]:
t0 = time.perf_counter()

# Collect every regular file below INPUT_DIR (recursively). No filtering by
# file type happens here, despite the "supported" wording in the message below.
# rglob() yields paths in no particular order, so sort them to keep the
# processing order — and therefore the per-file log — reproducible across runs.
# Note: this rebinds `files`, which held the GitHub listing in the download cell.
files = sorted(f for f in INPUT_DIR.rglob("*") if f.is_file())

t1 = time.perf_counter()
timings = {"list_files": t1 - t0}

print(f"üìÑ Found {len(files)} supported files in {INPUT_DIR}")


üìÑ Found 59 supported files in /content/all_files


🧠 Stage 2: Extract text

In [None]:
# Run the extractor over every listed file, timing each file individually
# and the stage as a whole.
extract_start = time.perf_counter()
all_text = []
total_files = len(files)

for position, doc_path in enumerate(files, 1):
    started = time.perf_counter()

    # Each entry is the file name as a banner followed by its extracted text.
    extracted = extract_text_from_file(doc_path)
    all_text.append(f"====== {doc_path.name} ======\n{extracted}\n")

    elapsed = time.perf_counter() - started
    print(f"‚úî [{position}/{total_files}] {doc_path.name} processed in {elapsed:.2f}s")

extract_end = time.perf_counter()
timings["extract_text"] = extract_end - extract_start

‚úî [1/59] excel-images.xlsx processed in 12.24s
‚úî [2/59] speaker-notes.pptx processed in 0.36s
‚úî [3/59] doc-with-2images.docx processed in 0.07s
‚úî [4/59] hierarchy_test.pdf processed in 9.02s
‚úî [5/59] simple.md processed in 0.74s
‚úî [6/59] short-line-test.txt processed in 0.01s
‚úî [7/59] title-length-test.txt processed in 0.01s
‚ö†Ô∏è Failed to read corrupted.pdf: Unable to get page count. Is poppler installed and in PATH?
‚úî [8/59] corrupted.pdf processed in 0.07s
‚úî [9/59] stanley-cups.csv processed in 0.01s
‚úî [10/59] fake-power-point.pptx processed in 0.02s
‚úî [11/59] 2023-half-year-analyses-by-segment.xlsx processed in 0.19s


yolox_l0.05.onnx:   0%|          | 0.00/217M [00:00<?, ?B/s]



preprocessor_config.json:   0%|          | 0.00/274 [00:00<?, ?B/s]

The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/115M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/46.8M [00:00<?, ?B/s]

‚úî [12/59] email-test-image.msg processed in 36.50s
‚úî [13/59] example-mix-tags.html processed in 0.02s
‚úî [14/59] README.md processed in 0.03s
‚ö†Ô∏è Failed to read stanley-cups-utf-16.csv: 'utf-8' codec can't decode byte 0xfe in position 0: invalid start byte
‚úî [15/59] stanley-cups-utf-16.csv processed in 0.00s
‚úî [16/59] email-text-attachments.eml processed in 0.03s
‚úî [17/59] umlauts-non-utf8.md processed in 0.11s
‚úî [18/59] simple-book.md processed in 0.02s
‚úî [19/59] SwitzerlandAlps.jpg processed in 6.34s
‚úî [20/59] fake-power-point-table.pptx processed in 0.03s
‚úî [21/59] title-test.html processed in 0.01s
‚úî [22/59] example-bold-strong.html processed in 0.02s
‚úî [23/59] example-div.html processed in 0.02s
‚úî [24/59] xlsx-subtable-cases.xlsx processed in 0.07s
‚úî [25/59] pdf-title.pdf processed in 0.04s
‚ö†Ô∏è Failed to read test-several-attachments.eml: Unable to get page count. Is poppler installed and in PATH?
‚úî [26/59] test-several-attachments.eml processed 

💾 Stage 3: Save all extracted text

In [None]:
# Write all per-file sections into a single UTF-8 text file, separated by
# blank lines; both the join and the write are inside the timed region.
save_start = time.perf_counter()

combined_text = "\n\n".join(all_text)
OUTPUT_FILE.write_text(combined_text, encoding="utf-8")

save_end = time.perf_counter()
timings["save_output"] = save_end - save_start

print(f"‚úÖ All text saved to: {OUTPUT_FILE}")

‚úÖ All text saved to: /content/output_all_text.txt


📊 Stage 4: Timing Summary + Text Sample

In [None]:
# Per-stage timings collected by the previous cells.
print("\n‚ú® Extraction complete!")
print("üïí Time summary:")
for stage in timings:
    print(f" - {stage}: {timings[stage]:.3f} seconds")

# Preview the beginning of the combined text; an ellipsis marks truncation.
sample_len = 500
joined = "\n\n".join(all_text)
preview = joined[:sample_len]
is_truncated = len(joined) > sample_len

print("\nüìò Text Sample (first 500 chars):")
print(preview)
if is_truncated:
    print("...")



‚ú® Extraction complete!
üïí Time summary:
 - list_files: 0.003 seconds
 - extract_text: 73.732 seconds
 - save_output: 0.001 seconds

üìò Text Sample (first 500 chars):
Country City Switzerland Neuchatel Ecuador Quito Spain Madrid USA Miami


Adding a Bullet Slide
Find the bullet slide layout
Use _TextFrame.text for first bullet
Use _TextFrame.add_paragraph() for subsequent bullets
Here is a lot of text!
Here is some text in a text box!


This is a document with images
First page with an image
Second page with another image


Cha
...


## Load Mixed Files & Extract Text with SparkNLP

📦 1️⃣ Install Spark NLP and Start Spark

In [None]:
# Install PySpark only, pinned to 3.5.7 (Spark NLP itself is installed in the next cell).
# -q keeps pip quiet; --force-reinstall reinstalls the package even if a
# version is already present in the runtime.
!pip install -q --force-reinstall pyspark==3.5.7

[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m317.4/317.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m200.5/200.5 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dataproc-spark-connect 1.0.1 requires pyspark[connect]~=4.0.0, but you have pyspark 3.5.7 which is incompatible.[0m[31m
[0m

In [None]:
# Install the Spark NLP Python package at a pinned version.
!pip install spark-nlp==6.3.0

Collecting spark-nlp==6.3.0
  Downloading spark_nlp-6.3.0-py2.py3-none-any.whl.metadata (19 kB)
Downloading spark_nlp-6.3.0-py2.py3-none-any.whl (744 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m745.0/745.0 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: spark-nlp
Successfully installed spark-nlp-6.3.0


In [None]:
import sparknlp
from pyspark.sql import SparkSession

# Start Spark Session via Spark NLP's helper; the returned session is bound
# to `spark` and used by all following cells.
spark = sparknlp.start()

# Print the installed Spark NLP library version.
print("üöÄ Spark NLP version:", sparknlp.version())

üöÄ Spark NLP version: 6.3.0


In [None]:
# Report the runtime stack: Spark from the session, Scala and Java by
# querying the driver JVM through the (private) py4j gateway.
print("Spark version:", spark.version)
jvm = spark.sparkContext._jvm
print("Scala Version:", jvm.scala.util.Properties.versionString())
print("Java Version:", jvm.java.lang.System.getProperty("java.version"))

Spark version: 3.5.7
Scala Version: version 2.12.18
Java Version: 17.0.17


📁 2️⃣ Define Input/Output Paths

In [None]:
from pathlib import Path

# Directory with mixed docs.
# NOTE(review): this rebinds INPUT_DIR from the Path used in the Unstructured
# section to a plain string, so re-running the earlier cell that calls
# INPUT_DIR.rglob(...) after this one fails until its config cell is re-run.
INPUT_DIR = "/content/all_files"
# Parquet output (target location for the extracted text)
OUTPUT_PARQUET = "/content/sparknlp_output.parquet"

🧾 Stage 1: Load files

In [None]:
import time
from sparknlp.reader.reader_assembler import ReaderAssembler
from pyspark.ml import Pipeline

# t0..t1 brackets only the construction and configuration of the reader stage.
t0 = time.perf_counter()

# Read everything under INPUT_DIR; "document" is the configured output column name.
reader_assembler = (
    ReaderAssembler()
    .setContentPath(INPUT_DIR)
    .setOutputCol("document")
)

t1 = time.perf_counter()

🧠 Stage 2: Extract text

In [None]:
# Wrap the single reader stage in a Spark ML Pipeline.
pipeline = Pipeline(stages=[reader_assembler])

# t1..t2 therefore measures pipeline object creation only.
t2 = time.perf_counter()

In [None]:
# The pipeline is fitted and run on an empty one-column ("text") DataFrame;
# presumably the reader stage takes its input from the content path configured
# on it rather than from this DataFrame — confirm against the ReaderAssembler docs.
empty_df = spark.createDataFrame([], "string").toDF("text")
model = pipeline.fit(empty_df)
df_spark_nlp = model.transform(empty_df)

# NOTE(review): Spark transformations are normally lazy, so t3 may be taken
# before the files are actually read, with that work landing in the later
# Parquet write timing instead — confirm (e.g. by forcing an action here)
# before comparing the per-stage numbers.
t3 = time.perf_counter()

💾 Stage 3: Save all extracted text

In [None]:
# Keep only the `result` field of the `document_text` column and write it to
# Parquet, replacing any previous output at that path.
# NOTE(review): `document_text` is presumably derived from the "document"
# output column name configured on the reader — confirm.
df_spark_nlp.select("document_text.result").write.mode("overwrite").parquet(OUTPUT_PARQUET)
# The write is an action, so t3..t4 includes whatever Spark work it triggers.
t4 = time.perf_counter()

📊 Stage 4: Timing Summary + Text Sample

In [None]:
# --------------- Timing Report --------------- #
# Each line reports the gap between two consecutive perf_counter checkpoints
# (t0..t4) taken in the cells above; the padding inside the literals keeps the
# numbers column-aligned in the printed report.
print("\nüïí Spark NLP Processing Timings")
print(f" - setContentPath:        {t1 - t0:.3f} sec")
print(f" - Pipeline creation:     {t2 - t1:.3f} sec")
print(f" - Fit + Transform:       {t3 - t2:.3f} sec")
print(f" - Save to Parquet:       {t4 - t3:.3f} sec")
# NOTE: t0..t4 are wall-clock checkpoints spread across several cells, so the
# total also includes any time spent between running those cells.
print(f" - Total:                 {t4 - t0:.3f} sec")


üïí Spark NLP Processing Timings
 - setContentPath:        3.135 sec
 - Pipeline creation:     0.008 sec
 - Fit + Transform:       13.836 sec
 - Save to Parquet:       17.387 sec
 - Total:                 34.366 sec


In [None]:
# Read the Parquet output back as a sanity check and report how many rows it holds.
df_loaded = spark.read.parquet(OUTPUT_PARQUET)
record_count = df_loaded.count()

print(f"‚úÖ Loaded {record_count} records from Parquet.")

‚úÖ Loaded 59 records from Parquet.


In [None]:
# Show the first 10 rows; each cell value is cut off after 200 characters
# to keep the printed table readable.
df_loaded.show(10, truncate=200)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|                                                                                                                                                                                                  result|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[Sample Title Text\nThis is an example paragraph. Below you'll see the first table containing sample data.\nHere is some additional text between the tables. You can add any information or context y...|
|                                                                                                [This SHOULD be a title\nThis is a normal paragraph.\nThis MIGHT be a title\nAnother regula