![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/open-source-nlp/26.0.Benchmark_Unstructured_and_SparkNLP_Files_Ingestion.ipynb)

## Load Mixed Files & Extract Text with Unstructured

In [None]:
import os
import requests
import subprocess

# Local directory that receives every downloaded sample file (flat layout).
output_dir = "all_files"
os.makedirs(output_dir, exist_ok=True)

# GitHub location of the sample documents.
owner = "JohnSnowLabs"
repo = "spark-nlp"
path = "src/test/resources/reader"

# List the entries directly under `path` via the GitHub contents API.
# This single call is not recursive; sub-directories are followed by
# download_files() through each entry's own API URL.
url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref=master"
headers = {"Accept": "application/vnd.github.v3+json"}
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=30)
# Fail fast on HTTP errors (rate limit, 404, ...) instead of passing an
# error payload on to the download step.
response.raise_for_status()
files = response.json()

# Helper function to recursively fetch files
def download_files(file_list, base_path):
    """Download every file of a GitHub contents listing into ``output_dir``.

    Directory entries are followed recursively through their API ``url``.
    All files land directly in ``output_dir`` under their base name.

    Args:
        file_list: Parsed JSON of a GitHub contents listing; must be a list
            of entry dicts carrying ``type`` plus ``download_url`` (files)
            or ``url`` (directories).
        base_path: Repository path the listing belongs to. Only used in
            error messages; passed unchanged to nested calls.

    Raises:
        RuntimeError: If ``file_list`` is not a list, which is what the API
            returns on errors such as rate limiting.
    """
    if not isinstance(file_list, list):
        # Error responses are JSON objects with a "message" field.
        if isinstance(file_list, dict):
            detail = file_list.get("message", file_list)
        else:
            detail = file_list
        raise RuntimeError(
            f"Unexpected GitHub API response for {base_path}: {detail}"
        )

    for item in file_list:
        if item["type"] == "file":
            raw_url = item["download_url"]
            file_name = os.path.join(output_dir, os.path.basename(raw_url))
            print(f"Downloading {file_name} ...")
            result = subprocess.run(["wget", "-q", raw_url, "-O", file_name])
            if result.returncode != 0:
                # Keep going (best effort) but do not hide the failure.
                print(
                    f"WARNING: wget exited with code {result.returncode} "
                    f"for {raw_url}"
                )
        elif item["type"] == "dir":
            nested_url = item["url"]
            nested_files = requests.get(nested_url, headers=headers).json()
            download_files(nested_files, base_path)

# Start the recursive download from the top-level listing.
download_files(files, path)

# The status glyph was mis-encoded; restored to the intended check mark.
print("\n✅ All files downloaded successfully into:", output_dir)

🧩 1️⃣ Install dependencies

In [None]:
# Install Unstructured and supporting dependencies
!pip install "unstructured[all-docs]" pillow python-magic

📁 2️⃣ Set up configuration

In [None]:
from pathlib import Path

# Directory containing your documents.
# NOTE(review): the download cell writes to the relative folder "all_files";
# this absolute path matches it only when the working directory is /content
# (presumably Colab) — confirm or adjust.
INPUT_DIR = Path("/content/all_files")   # 🔹 Change to your folder path
# Output file where all extracted text will be saved
OUTPUT_FILE = Path("/content/output_all_text.txt")

⚙️ 3️⃣ Import modules and helper functions

In [None]:
import time
from unstructured.partition.auto import partition

def extract_text_from_file(filepath: Path) -> str:
    """Partition one file with Unstructured and return its extracted text.

    Args:
        filepath: Path of the document to read.

    Returns:
        The ``text`` of every element that has non-empty text, joined with
        newlines. An empty string if partitioning the file fails; the
        failure is reported on stdout rather than raised.
    """
    try:
        elements = partition(filename=str(filepath))
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole run.
        print(f"⚠️ Failed to read {filepath.name}: {e}")
        return ""

    text_content = []
    for element in elements:
        # Only the attribute access is guarded; an element whose text
        # cannot be read is skipped.
        try:
            txt = getattr(element, "text", None)
        except Exception:
            continue
        if txt:
            text_content.append(txt)

    return "\n".join(text_content)

🧾 Stage 1: Load file list

In [None]:
t0 = time.perf_counter()

# Collect every regular file under INPUT_DIR, recursively. No extension
# filter is applied. Sorting makes the processing order, and so the
# combined output file, reproducible across runs and filesystems.
files = sorted(f for f in INPUT_DIR.rglob("*") if f.is_file())

t1 = time.perf_counter()
# Per-stage wall-clock durations in seconds, extended by the later stages.
timings = {"list_files": t1 - t0}

print(f"📄 Found {len(files)} files in {INPUT_DIR}")


🧠 Stage 2: Extract text

In [None]:
extract_start = time.perf_counter()
# One entry per input file: a header line with the file name, then its text.
all_text = []
total_files = len(files)  # invariant, so computed once outside the loop

for idx, file_path in enumerate(files, start=1):
    file_t0 = time.perf_counter()

    # Returns "" when the file cannot be read, so every file still gets
    # a header entry in the output.
    text = extract_text_from_file(file_path)
    all_text.append(f"====== {file_path.name} ======\n{text}\n")

    file_t1 = time.perf_counter()
    print(f"✔ [{idx}/{total_files}] {file_path.name} processed in {file_t1 - file_t0:.2f}s")

extract_end = time.perf_counter()
timings["extract_text"] = extract_end - extract_start

💾 Stage 3: Save all extracted text

In [None]:
# Make sure the destination folder exists so the write cannot fail on a
# missing directory; done before the timer so only the write is measured.
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

save_start = time.perf_counter()

# Entries are separated by a blank line in the combined file.
OUTPUT_FILE.write_text("\n\n".join(all_text), encoding="utf-8")

save_end = time.perf_counter()
timings["save_output"] = save_end - save_start

print(f"✅ All text saved to: {OUTPUT_FILE}")

📊 Stage 4: Timing Summary + Text Sample

In [None]:
print("\n✨ Extraction complete!")
print("🕒 Time summary:")
for stage, dt in timings.items():
    print(f" - {stage}: {dt:.3f} seconds")

# Preview the start of the combined text, joined as in the saved file.
sample_len = 500
joined = "\n\n".join(all_text)
# The header is derived from sample_len so it cannot drift from the slice.
print(f"\n📘 Text Sample (first {sample_len} chars):")
print(joined[:sample_len])
if len(joined) > sample_len:
    print("...")


## Load Mixed Files & Extract Text with SparkNLP

📦 1️⃣ Install Spark NLP and Start Spark

In [None]:
# Install Spark NLP and PySpark
!pip install -q --force-reinstall pyspark==3.5.7

In [None]:
!pip install spark-nlp==6.3.0

In [None]:
import sparknlp
from pyspark.sql import SparkSession

# Start a Spark session through Spark NLP's start() helper.
spark = sparknlp.start()

# The status glyph was mis-encoded; restored to the intended rocket.
print("🚀 Spark NLP version:", sparknlp.version())

In [None]:
# Report the runtime stack used for the benchmark.
print("Spark version:", spark.version)

# Scala and Java versions are read from the JVM handle of the Spark context.
jvm = spark.sparkContext._jvm
scala_version = jvm.scala.util.Properties.versionString()
print("Scala Version:", scala_version)
java_version = jvm.java.lang.System.getProperty("java.version")
print("Java Version:", java_version)

📁 2️⃣ Define Input/Output Paths

In [None]:
from pathlib import Path

# Directory with the mixed-format documents. This rebinds INPUT_DIR from the
# Path used in the Unstructured section to a plain string with the same
# location.
INPUT_DIR = "/content/all_files"
# Destination of the Parquet output written by the Spark NLP run.
OUTPUT_PARQUET = "/content/sparknlp_output.parquet"

🧾 Stage 1: Load files

In [None]:
import time
from sparknlp.reader.reader_assembler import ReaderAssembler
from pyspark.ml import Pipeline

t0 = time.perf_counter()

# Configure the reader stage: where to read from and how to name its output.
reader_assembler = (
    ReaderAssembler()
    .setContentPath(INPUT_DIR)
    .setOutputCol("document")
)

t1 = time.perf_counter()

🧠 Stage 2: Extract text

In [None]:
# Wrap the reader in a single-stage Spark ML pipeline.
pipeline = Pipeline(stages=[reader_assembler])

# End of pipeline construction; t2 - t1 is reported as "Pipeline creation".
t2 = time.perf_counter()

In [None]:
# The reader stage was given its input location via setContentPath, so an
# empty one-column DataFrame is used only to drive fit/transform.
empty_df = spark.createDataFrame([], "string").toDF("text")
model = pipeline.fit(empty_df)
df_spark_nlp = model.transform(empty_df)

# NOTE(review): Spark transformations are evaluated lazily, so t3 may be taken
# before any file is actually read; the extraction cost would then show up in
# the Parquet write timing instead — confirm before comparing stage timings.
t3 = time.perf_counter()

💾 Stage 3: Save all extracted text

In [None]:
# Persist only the `result` field of the `document_text` column, replacing
# any existing output at OUTPUT_PARQUET.
df_spark_nlp.select("document_text.result").write.mode("overwrite").parquet(OUTPUT_PARQUET)
# End of the write; t4 - t3 is reported as "Save to Parquet".
t4 = time.perf_counter()

📊 Stage 4: Timing Summary + Text Sample

In [None]:
# ─────────────── Timing Report ───────────────── #
# Each line is the difference between two consecutive perf_counter marks
# (t0..t4) taken in the cells above; the header glyph was mis-encoded and is
# restored here.
print("\n🕒 Spark NLP Processing Timings")
print(f" - setContentPath:        {t1 - t0:.3f} sec")
print(f" - Pipeline creation:     {t2 - t1:.3f} sec")
print(f" - Fit + Transform:       {t3 - t2:.3f} sec")
print(f" - Save to Parquet:       {t4 - t3:.3f} sec")
print(f" - Total:                 {t4 - t0:.3f} sec")

In [None]:
# Read the saved Parquet output back to verify what was written.
df_loaded = spark.read.parquet(OUTPUT_PARQUET)

# The status glyph was mis-encoded; restored to the intended check mark.
print(f"✅ Loaded {df_loaded.count()} records from Parquet.")

In [None]:
# Display up to 10 rows of the reloaded output; cell values longer than
# 200 characters are truncated for readability.
df_loaded.show(10, truncate=200)