In [4]:
# unstructured documentation: https://pypi.org/project/unstructured/
# unstructured Github repo: https://github.com/Unstructured-IO
# A library that contains pre-processing components for partitioning, chunking, cleaning and staging unstructured data (documents) for NLP tasks.

!pip install -U unstructured



In [7]:
# pandas documentation: https://pandas.pydata.org/
# A library to work with data sets.
!pip install pandas



In [8]:
import json
import os

import pandas as pd

# partition_html is a function to ingest texts from website and partition an HTML document into document Element objects.
from unstructured.partition.html import partition_html
# Title and NarrativeText are the element base classes used to classify partitioned elements when chunking.
from unstructured.documents.elements import NarrativeText, Title

In [9]:
# Web scraping:
# partition_html accepts a filename, a file-like object, a string or a URL that contains HTML content.
# The partition_html function does the following:
# 1) identifies HTML element types, classifying subheaders as HTMLTitle and regular narrative text as HTMLNarrativeText;
# 2) groups HTML elements iteratively and ensures titles are followed by their corresponding narrative texts.
document_elements = partition_html(
    url="https://en.wikipedia.org/wiki/Tijana_Bo%C5%A1kovi%C4%87",
)

In [10]:
# Print the text of every partitioned element, separated by a blank line.
print(*document_elements, sep="\n\n")

Toggle the table of contents

Tijana Bošković

28 languages

العربية

Беларуская

Български

Bosanski

Čeština

Deutsch

Español

Euskara

فارسی

Français

한국어

Հայերեն

Italiano

Kiswahili

Latviešu

Magyar

Македонски

日本語

Norsk bokmål

Polski

Português

Русский

Српски / srpski

Srpskohrvatski / српскохрватски

Svenska

Türkçe

Українська

中文

Edit links

Article

Talk

English

Read

Edit

View history

Tools

Tools

Actions

Read

Edit

View history

General

What links here

Related changes

Upload file

Special pages

Permanent link

Page information

Cite this page

Get shortened URL

Wikidata item

Print/export

Download as PDF

Printable version

In other projects

Wikimedia Commons

From Wikipedia, the free encyclopedia

Serbian volleyball player

Tijana Bošković Bošković with Eczacıbaşı VitrA in 2016 Personal information Nickname The Boss Nationality Serbian Born  ( 1997-03-08 )  8 March 1997  (age 26) Trebinje ,  Republika Srpska ,  Bosnia and Herzegovina Hometown Bileća

In [11]:
# Collect the HTML content as JSON-serializable records.
# Each record carries metadata (source and title) and the page content.
all_elements = []

# The page content joins the text of every element after the first one with single spaces.
data = {
    "metadata": {
        "source": "https://en.wikipedia.org/wiki/Tijana_Bo%C5%A1kovi%C4%87",
        "title": "Wikipedia: Tijana Boskovic",
    },
    "page_content": " ".join(str(element) for element in document_elements[1:]),
}

all_elements.append(data)

In [12]:
# Persist the records as pretty-printed JSON.
# The explicit encoding keeps the written bytes independent of the platform's locale default;
# json.dump streams straight to the file instead of building the whole string first.
with open("tijana_data.json", mode="w", encoding="utf-8") as f:
    json.dump(all_elements, f, indent=2)

In [13]:
# Display the first record; as the cell's bare last expression, the notebook renders its repr.
all_elements[0]

{'metadata': {'source': 'https://en.wikipedia.org/wiki/Tijana_Bo%C5%A1kovi%C4%87',
  'title': 'Wikipedia: Tijana Boskovic'},
 'page_content': 'Tijana Bošković 28 languages العربية Беларуская Български Bosanski Čeština Deutsch Español Euskara فارسی Français 한국어 Հայերեն Italiano Kiswahili Latviešu Magyar Македонски 日本語 Norsk bokmål Polski Português Русский Српски / srpski Srpskohrvatski / српскохрватски Svenska Türkçe Українська 中文 Edit links Article Talk English Read Edit View history Tools Tools Actions Read Edit View history General What links here Related changes Upload file Special pages Permanent link Page information Cite this page Get shortened URL Wikidata item Print/export Download as PDF Printable version In other projects Wikimedia Commons From Wikipedia, the free encyclopedia Serbian volleyball player Tijana Bošković Bošković with Eczacıbaşı VitrA in 2016 Personal information Nickname The Boss Nationality Serbian Born  ( 1997-03-08 )  8 March 1997  (age\xa026) Trebinje ,  Re

In [22]:
# Web scraping and chunking the HTML document:
# every title element opens a new chunk, and the narrative text that follows it is appended to that chunk.

source = "https://en.wikipedia.org/wiki/Tijana_Bo%C5%A1kovi%C4%87"
title = "Wikipedia: Tijana Boskovic 2"

# Initialize a list to store processed groups (chunks), in document order.
all_groups = []
# Initialize the chunk that is currently being filled.
group = {'metadata': {'source': source, 'title': title}, 'page_content': ''}

wiki_tianja_boskovic_page = partition_html(url=source)

for element in wiki_tianja_boskovic_page:
    # Classify with isinstance on the element base classes rather than searching
    # str(type(element)) for a hard-coded class path: the substring test silently
    # matches nothing if the partitioner returns classes under a different path.
    # NOTE(review): assumes the HTML title / narrative-text element classes subclass
    # Title / NarrativeText in the installed unstructured version; confirm.
    if isinstance(element, Title):
        # A title starts a new chunk; flush the current one first if it holds any text.
        if group['page_content']:
            all_groups.append(group)
            group = {'metadata': {'source': source, 'title': title}, 'page_content': ''}

        group['page_content'] += element.text
    elif isinstance(element, NarrativeText):
        # Insert the '. ' separator only after existing text, so a chunk whose first
        # element is narrative text does not start with a stray '. '.
        if group['page_content']:
            group['page_content'] += '. '
        group['page_content'] += element.text

# Add the last group if it exists.
if group['page_content']:
    all_groups.append(group)

# Display the first 30 chunks of data.
# A separate loop variable keeps `group` from being rebound by the display loop.
for chunk in all_groups[:30]:
    print(chunk)

{'metadata': {'source': 'https://en.wikipedia.org/wiki/Tijana_Bo%C5%A1kovi%C4%87', 'title': 'Wikipedia: Tijana Boskovic 2'}, 'page_content': 'Toggle the table of contents'}
{'metadata': {'source': 'https://en.wikipedia.org/wiki/Tijana_Bo%C5%A1kovi%C4%87', 'title': 'Wikipedia: Tijana Boskovic 2'}, 'page_content': 'Tijana Bošković'}
{'metadata': {'source': 'https://en.wikipedia.org/wiki/Tijana_Bo%C5%A1kovi%C4%87', 'title': 'Wikipedia: Tijana Boskovic 2'}, 'page_content': '28 languages'}
{'metadata': {'source': 'https://en.wikipedia.org/wiki/Tijana_Bo%C5%A1kovi%C4%87', 'title': 'Wikipedia: Tijana Boskovic 2'}, 'page_content': 'Edit links'}
{'metadata': {'source': 'https://en.wikipedia.org/wiki/Tijana_Bo%C5%A1kovi%C4%87', 'title': 'Wikipedia: Tijana Boskovic 2'}, 'page_content': 'English'}
{'metadata': {'source': 'https://en.wikipedia.org/wiki/Tijana_Bo%C5%A1kovi%C4%87', 'title': 'Wikipedia: Tijana Boskovic 2'}, 'page_content': 'Tools'}
{'metadata': {'source': 'https://en.wikipedia.org/wik

In [23]:
# Persist the chunked records as pretty-printed JSON.
# The explicit encoding keeps the written bytes independent of the platform's locale default;
# json.dump streams straight to the file instead of building the whole string first.
with open("tijana_data_chunked.json", mode="w", encoding="utf-8") as f:
    json.dump(all_groups, f, indent=2)