<a href="https://colab.research.google.com/github/CiaraAOC/frictionless/blob/main/Frictionless.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


## Setup

In [14]:
!pip install frictionless #install necessary modules and libraries
import frictionless
import pandas as pd
from pprint import pprint




In [15]:
import requests

# Probe the CSO endpoint before loading it, to confirm the API is reachable.
# The timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(
    'https://ws.cso.ie/public/api.restful/PxStat.Data.Cube_API.ReadDataset/TEA01/CSV/1.0/en',
    timeout=30,
)
# Print the status code of the API response:
#   200 means the request succeeded;
#   a code beginning with 4 (client error) or 5 (server error) means it failed.
print(response.status_code)

200


In [16]:
# Link to the desired table (TEA01), served by the API as CSV
TEA01_CSV_URL = "https://ws.cso.ie/public/api.restful/PxStat.Data.Cube_API.ReadDataset/TEA01/CSV/1.0/en"

# Load the table straight from the URL into a DataFrame and show it
df = pd.read_csv(TEA01_CSV_URL)
print(df)

    STATISTIC                             Statistic  ...    UNIT   VALUE
0       TEA01  Vehicles Licensed for the First Time  ...  Number  206856
1       TEA01  Vehicles Licensed for the First Time  ...  Number  153086
2       TEA01  Vehicles Licensed for the First Time  ...  Number  125818
3       TEA01  Vehicles Licensed for the First Time  ...  Number   18895
4       TEA01  Vehicles Licensed for the First Time  ...  Number    1848
..        ...                                   ...  ...     ...     ...
403     TEA01  Vehicles Licensed for the First Time  ...  Number  101472
404     TEA01  Vehicles Licensed for the First Time  ...  Number   78541
405     TEA01  Vehicles Licensed for the First Time  ...  Number   11753
406     TEA01  Vehicles Licensed for the First Time  ...  Number    2648
407     TEA01  Vehicles Licensed for the First Time  ...  Number    8530

[408 rows x 8 columns]


## Describing Data


In [17]:
from frictionless import describe # generates metadata describing the layout/contents of the data

# Describe the DataFrame loaded earlier and pretty-print the resulting metadata
# (the recorded output lists keys such as 'data', 'format', 'hashing', 'name', 'profile').
resource = describe(df)
pprint(resource)

{'data':     STATISTIC                             Statistic  ...    UNIT   VALUE
0       TEA01  Vehicles Licensed for the First Time  ...  Number  206856
1       TEA01  Vehicles Licensed for the First Time  ...  Number  153086
2       TEA01  Vehicles Licensed for the First Time  ...  Number  125818
3       TEA01  Vehicles Licensed for the First Time  ...  Number   18895
4       TEA01  Vehicles Licensed for the First Time  ...  Number    1848
..        ...                                   ...  ...     ...     ...
403     TEA01  Vehicles Licensed for the First Time  ...  Number  101472
404     TEA01  Vehicles Licensed for the First Time  ...  Number   78541
405     TEA01  Vehicles Licensed for the First Time  ...  Number   11753
406     TEA01  Vehicles Licensed for the First Time  ...  Number    2648
407     TEA01  Vehicles Licensed for the First Time  ...  Number    8530

[408 rows x 8 columns],
 'format': 'pandas',
 'hashing': 'md5',
 'name': 'memory',
 'profile': 'tabular-data-resou

## Extracting Data


In [18]:
from frictionless import extract # reads and normalises the data

# Extract the rows of the DataFrame loaded earlier; the recorded output is a
# list with one dict per row, keyed by column name.
rows = extract(df)
pprint(rows)

[{'C02172V02618': '-',
  'STATISTIC': 'TEA01',
  'Statistic': 'Vehicles Licensed for the First Time',
  'TLIST(A1)': 1997,
  'Taxation Class': 'All Vehicles',
  'UNIT': 'Number',
  'VALUE': 206856,
  'Year': 1997},
 {'C02172V02618': '01',
  'STATISTIC': 'TEA01',
  'Statistic': 'Vehicles Licensed for the First Time',
  'TLIST(A1)': 1997,
  'Taxation Class': 'New Vehicles',
  'UNIT': 'Number',
  'VALUE': 153086,
  'Year': 1997},
 {'C02172V02618': '011',
  'STATISTIC': 'TEA01',
  'Statistic': 'Vehicles Licensed for the First Time',
  'TLIST(A1)': 1997,
  'Taxation Class': 'New Private Cars',
  'UNIT': 'Number',
  'VALUE': 125818,
  'Year': 1997},
 {'C02172V02618': '012',
  'STATISTIC': 'TEA01',
  'Statistic': 'Vehicles Licensed for the First Time',
  'TLIST(A1)': 1997,
  'Taxation Class': 'New Goods Vehicles',
  'UNIT': 'Number',
  'VALUE': 18895,
  'Year': 1997},
 {'C02172V02618': '013',
  'STATISTIC': 'TEA01',
  'Statistic': 'Vehicles Licensed for the First Time',
  'TLIST(A1)': 1997,
 

## Validating Data


In [19]:
from frictionless import validate # detects errors in the data

# Validate the DataFrame, then list every reported error as
# [row position, field position, error code]; an empty list means no errors.
report = validate(df)
error_columns = ["rowPosition", "fieldPosition", "code"]
pprint(report.flatten(error_columns))

[]


In [20]:
from frictionless import Resource, FrictionlessException

# Wrap the DataFrame in a Resource. Should frictionless reject it, show the
# error object attached to the exception rather than a full traceback.
# (In the recorded run nothing was printed, i.e. no exception was raised.)
try:
    resource = Resource(df)
except FrictionlessException as exc:
    pprint(exc.error)

In [21]:
import hashlib

from frictionless import Check, errors


class duplicate_row(Check):
    """Row check that reports rows whose cell values repeat an earlier row.

    Every row is reduced to a SHA-256 digest of its comma-joined cell values.
    The digest is remembered together with the position of the row that
    produced it, so a later row with the same digest can be reported.
    """

    code = "duplicate-row"
    Errors = [errors.DuplicateRowError]

    def __init__(self, descriptor=None):
        """Create the check with an empty digest memory.

        Args:
            descriptor: passed through unchanged to the base Check initializer.
        """
        super().__init__(descriptor)
        # digest of a row's values -> position of the latest row with that digest
        self.__memory = {}

    def validate_row(self, row):
        """Yield a DuplicateRowError when an identical row was seen before.

        Args:
            row: the row under validation; its values() and row_position are read.

        Yields:
            errors.DuplicateRowError: carrying a note with the position of the
                previously seen identical row.
        """
        text = ",".join(map(str, row.values()))
        # `digest` rather than `hash`, so the builtin is not shadowed
        digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
        match = self.__memory.get(digest)
        # Compare against None explicitly so a falsy stored position still counts
        if match is not None:
            note = 'the same as row at position "%s"' % match
            yield errors.DuplicateRowError.from_row(row, note=note)
        self.__memory[digest] = row.row_position

    # Metadata

    metadata_profile = {  # type: ignore
        "type": "object",
        "properties": {},
    }

## Transforming Data



In [22]:
from frictionless import Resource
from frictionless.plugins.excel import ExcelDialect

# Wrap the DataFrame in a Resource and write it with an Excel dialect that
# names the sheet.
# NOTE(review): the write target is the DataFrame itself rather than a file
# path, and the recorded output is a resource whose data is that DataFrame —
# confirm whether an .xlsx path was intended here.
resource = Resource(df)
excel_dialect = ExcelDialect(sheet='My Table')
resource.write(df, dialect=excel_dialect)

{'data':     STATISTIC                             Statistic  ...    UNIT   VALUE
 0       TEA01  Vehicles Licensed for the First Time  ...  Number  206856
 1       TEA01  Vehicles Licensed for the First Time  ...  Number  153086
 2       TEA01  Vehicles Licensed for the First Time  ...  Number  125818
 3       TEA01  Vehicles Licensed for the First Time  ...  Number   18895
 4       TEA01  Vehicles Licensed for the First Time  ...  Number    1848
 ..        ...                                   ...  ...     ...     ...
 403     TEA01  Vehicles Licensed for the First Time  ...  Number  101472
 404     TEA01  Vehicles Licensed for the First Time  ...  Number   78541
 405     TEA01  Vehicles Licensed for the First Time  ...  Number   11753
 406     TEA01  Vehicles Licensed for the First Time  ...  Number    2648
 407     TEA01  Vehicles Licensed for the First Time  ...  Number    8530
 
 [408 rows x 8 columns], 'dialect': {'sheet': 'My Table'}}

In [23]:
from frictionless import Resource, transform, steps

# Source resource: the TEA01 table served by the API
source = Resource(path="https://ws.cso.ie/public/api.restful/PxStat.Data.Cube_API.ReadDataset/TEA01/CSV/1.0/en")

# Transform pipeline: a single normalisation step
pipeline = [steps.table_normalize()]
target = transform(source, steps=pipeline)

# Show the schema of the transformed resource (only the schema is printed).
# NOTE(review): the recorded output is an empty schema ({}) — confirm whether
# the resource has to be read or inferred before its schema is populated.
pprint(target.schema)


{}


In [26]:
import tempfile
from html import escape

from frictionless import Parser, helpers, system


class HtmlParser(Parser):
    """Parser that reads rows from an HTML table and writes rows out as one.

    Reading relies on pyquery (imported lazily through the frictionless
    helper); writing renders a minimal ``<table>`` document and hands it to
    the loader created for the target resource.
    """

    requires_loader = True
    supported_types = [
        "string",
    ]

    # Read

    def read_list_stream_create(self):
        """Yield the header row, then every data row, as lists of cell text.

        The table is located with the dialect's selector when one is set;
        otherwise the whole page is treated as the table.
        """
        pq = helpers.import_from_plugin("pyquery", plugin="html").PyQuery
        dialect = self.resource.dialect

        # Get Page content
        page = pq(self.loader.text_stream.read(), parser="html")

        # Find required table
        if dialect.selector:
            table = pq(page.find(dialect.selector)[0])
        else:
            table = page

        # Stream headers
        # Candidate rows are gathered from every place a row may live:
        # thead > tr, thead itself, direct tr children and tbody > tr.
        data = (
            table.children("thead").children("tr")
            + table.children("thead")
            + table.children("tr")
            + table.children("tbody").children("tr")
        )
        # Elements without children cannot hold cells, so they are dropped
        data = [pq(r) for r in data if len(r) > 0]
        first_row = data.pop(0)
        headers = [pq(th).text() for th in first_row.find("th,td")]
        yield headers

        # Stream data
        # Only <td> cells count as data; rows without any are skipped
        data = [pq(tr).find("td") for tr in data]
        data = [[pq(td).text() for td in tr] for tr in data if len(tr) > 0]
        yield from data

    # Write

    def write_row_stream(self, resource):
        """Render the rows of ``resource`` as an HTML table and write it out.

        Args:
            resource: source resource whose row_stream is read.

        Returns:
            whatever the target loader's write_byte_stream returns.
        """
        source = resource
        target = self.resource
        # Collect fragments and join once instead of growing a string in a loop
        parts = ["<html><body><table>\n"]
        with source:
            for row in source.row_stream:
                # Emit the header line once, just before the first data row
                if row.row_number == 1:
                    parts.append("<tr>")
                    parts.extend(
                        f"<td>{escape(f'{name}')}</td>" for name in row.field_names
                    )
                    parts.append("</tr>\n")
                cells = row.to_list(types=self.supported_types)
                parts.append("<tr>")
                # Escape cell text so characters such as < and & cannot break the markup
                parts.extend(f"<td>{escape(f'{cell}')}</td>" for cell in cells)
                parts.append("</tr>\n")
        parts.append("</table></body></html>")
        # Explicit UTF-8 so the output does not depend on the platform's default encoding.
        # delete=False: the file must outlive this block so the loader can pick it up by name.
        with tempfile.NamedTemporaryFile("wt", delete=False, encoding="utf-8") as file:
            file.write("".join(parts))
        loader = system.create_loader(target)
        result = loader.write_byte_stream(file.name)
        return result

In [27]:
from frictionless import Dialect, Metadata

class HtmlDialect(Dialect):
    """Dialect carrying the single `selector` option for HTML sources.

    In this notebook the selector is read by HtmlParser to locate the table
    inside the page.
    """

    def __init__(self, descriptor=None, *, selector=None):
        """Create the dialect.

        Args:
            descriptor: passed through unchanged to the base Dialect initializer.
            selector: optional selector value (keyword-only).
        """
        # Registered before the base initializer runs.
        # NOTE(review): presumably setinitial ignores a None value — confirm
        # against Metadata.setinitial.
        self.setinitial("selector", selector)
        super().__init__(descriptor)

    @Metadata.property
    def selector(self):
        """Selector stored in the metadata, falling back to "table".

        Returns:
            str: selector
        """
        return self.get("selector", "table")

    # Expand

    def expand(self):
        """Expand metadata by storing the effective selector if none is set"""
        self.setdefault("selector", self.selector)

    # Metadata

    # Profile for the descriptor: an object whose only allowed property is
    # a string-valued "selector".
    metadata_profile = {  # type: ignore
        "type": "object",
        "additionalProperties": False,
        "properties": {
            "selector": {"type": "string"},
        },
    }