<a href="https://colab.research.google.com/github/CiaraAOC/frictionless/blob/main/Frictionless.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


Setup:

Installing frictionless, importing necessary libraries, uploading files, reading in data

In [None]:
#!pip install frictionless
import frictionless
import pandas as pd
from pprint import pprint


In [None]:
import os

from google.colab import files

# Prompt for the data file only when it is not already in the working
# directory. Uploading a name that already exists makes Colab store the new
# copy under a different name (e.g. "TEA01 (2).csv"), while the cells below
# keep reading TEA01.csv - so a repeated upload would silently be ignored.
if os.path.exists('TEA01.csv'):
    uploaded = {}  # nothing new was uploaded on this run
else:
    uploaded = files.upload()


Saving TEA01.csv to TEA01 (2).csv


In [None]:
# Load the CSV into a pandas DataFrame and print its (truncated) preview.
df = pd.read_csv("TEA01.csv")
print(df)

                                Statistic  Year  ...    UNIT   VALUE
0    Vehicles Licensed for the First Time  1997  ...  Number  206856
1    Vehicles Licensed for the First Time  1997  ...  Number  153086
2    Vehicles Licensed for the First Time  1997  ...  Number  125818
3    Vehicles Licensed for the First Time  1997  ...  Number   18895
4    Vehicles Licensed for the First Time  1997  ...  Number    1848
..                                    ...   ...  ...     ...     ...
403  Vehicles Licensed for the First Time  2020  ...  Number  101472
404  Vehicles Licensed for the First Time  2020  ...  Number   78541
405  Vehicles Licensed for the First Time  2020  ...  Number   11753
406  Vehicles Licensed for the First Time  2020  ...  Number    2648
407  Vehicles Licensed for the First Time  2020  ...  Number    8530

[408 rows x 5 columns]


Describing data:

Using Frictionless to describe data and print information

In [None]:
from frictionless import describe

# Let Frictionless describe the CSV file, then pretty-print the resulting
# descriptor (file details plus the inferred schema).
resource = describe("TEA01.csv")
pprint(resource)

{'encoding': 'utf-8',
 'format': 'csv',
 'hashing': 'md5',
 'name': 'tea01',
 'path': 'TEA01.csv',
 'profile': 'tabular-data-resource',
 'schema': {'fields': [{'name': 'Statistic', 'type': 'string'},
                       {'name': 'Year', 'type': 'integer'},
                       {'name': 'Taxation Class', 'type': 'string'},
                       {'name': 'UNIT', 'type': 'string'},
                       {'name': 'VALUE', 'type': 'integer'}]},
 'scheme': 'file'}


Extracting Data:

Using Frictionless to extract data from file and print

In [None]:
from frictionless import extract

# Pull the rows out of the CSV file and pretty-print them.
rows = extract("TEA01.csv")
pprint(rows)

[{'Statistic': 'Vehicles Licensed for the First Time',
  'Taxation Class': 'All Vehicles',
  'UNIT': 'Number',
  'VALUE': 206856,
  'Year': 1997},
 {'Statistic': 'Vehicles Licensed for the First Time',
  'Taxation Class': 'New Vehicles',
  'UNIT': 'Number',
  'VALUE': 153086,
  'Year': 1997},
 {'Statistic': 'Vehicles Licensed for the First Time',
  'Taxation Class': 'New Private Cars',
  'UNIT': 'Number',
  'VALUE': 125818,
  'Year': 1997},
 {'Statistic': 'Vehicles Licensed for the First Time',
  'Taxation Class': 'New Goods Vehicles',
  'UNIT': 'Number',
  'VALUE': 18895,
  'Year': 1997},
 {'Statistic': 'Vehicles Licensed for the First Time',
  'Taxation Class': 'New Tractors',
  'UNIT': 'Number',
  'VALUE': 1848,
  'Year': 1997},
 {'Statistic': 'Vehicles Licensed for the First Time',
  'Taxation Class': 'New Motor Cycles',
  'UNIT': 'Number',
  'VALUE': 2717,
  'Year': 1997},
 {'Statistic': 'Vehicles Licensed for the First Time',
  'Taxation Class': 'New Exempt Vehicles',
  'UNIT': '

Validating Data:

Using Frictionless to validate the data and print any errors it finds. If no errors are present, the output is an empty list (`[]`).

In [None]:
from frictionless import validate

# Validate the CSV file, then flatten the report down to the row position,
# field position and error code of each entry before printing it.
report = validate("TEA01.csv")
pprint(
    report.flatten(
        ["rowPosition", "fieldPosition", "code"],
    )
)

[]


In [None]:
from frictionless import Resource, FrictionlessException

try:
    # Wrap the CSV file in a Frictionless Resource.
    resource = Resource("TEA01.csv")
except FrictionlessException as exc:
    # If Frictionless rejects the resource, show the error object that the
    # exception carries instead of letting the cell fail.
    pprint(exc.error)

In [None]:
from frictionless import Check, errors

class duplicate_row(Check):
    """Row check that reports rows whose cell values repeat an earlier row.

    Each row is reduced to the SHA-256 digest of its comma-joined,
    stringified cell values. The digest is remembered together with the row
    position, so a later row producing the same digest is reported along
    with the position of the row it duplicates.
    """

    code = "duplicate-row"
    Errors = [errors.DuplicateRowError]

    def __init__(self, descriptor=None):
        super().__init__(descriptor)
        # digest of the row contents -> position of the latest row having it
        self.__memory = {}

    def validate_row(self, row):
        """Yield a ``DuplicateRowError`` when ``row`` repeats an earlier row.

        Args:
            row: the row being validated; its ``values()`` are hashed and its
                ``row_position`` is recorded for later comparisons.

        Yields:
            errors.DuplicateRowError: built from ``row`` with a note naming
                the position of the matching earlier row.
        """
        # Imported here because the notebook never imports hashlib elsewhere;
        # without it this method fails with NameError on the first row.
        import hashlib

        text = ",".join(map(str, row.values()))
        # Named `digest` rather than `hash` to avoid shadowing the builtin.
        digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
        match = self.__memory.get(digest)
        if match:
            note = 'the same as row at position "%s"' % match
            yield errors.DuplicateRowError.from_row(row, note=note)
        # Always remember the current row, so the next duplicate points at
        # the most recent occurrence.
        self.__memory[digest] = row.row_position

    # Metadata

    metadata_profile = {  # type: ignore
        "type": "object",
        "properties": {},
    }

**Transforming Data:**



In [None]:
from frictionless.plugins.excel import ExcelDialect

# Export the resource to an Excel workbook, placing the table on a sheet named
# "My Table". The target carries an .xlsx extension so that the Excel dialect
# matches the output format, and so that TEA01.csv - the input that the other
# cells keep reading - is not used as the write target.
resource.write('TEA01.xlsx', dialect=ExcelDialect(sheet='My Table'))

{'dialect': {'sheet': 'My Table'}, 'path': 'TEA01.csv'}

In [None]:
from frictionless import Resource, transform, steps

# Source resource: the CSV file on disk
source = Resource(path="TEA01.csv")

# Run a one-step pipeline (table_normalize) over the source
target = transform(source, steps=[steps.table_normalize()])

# Show the schema of the transformed resource, followed by its rows
pprint(target.schema)
pprint(target.read_rows())

{'fields': [{'name': 'Statistic', 'type': 'string'},
            {'name': 'Year', 'type': 'integer'},
            {'name': 'Taxation Class', 'type': 'string'},
            {'name': 'UNIT', 'type': 'string'},
            {'name': 'VALUE', 'type': 'integer'}]}
[{'Statistic': 'Vehicles Licensed for the First Time',
  'Taxation Class': 'All Vehicles',
  'UNIT': 'Number',
  'VALUE': 206856,
  'Year': 1997},
 {'Statistic': 'Vehicles Licensed for the First Time',
  'Taxation Class': 'New Vehicles',
  'UNIT': 'Number',
  'VALUE': 153086,
  'Year': 1997},
 {'Statistic': 'Vehicles Licensed for the First Time',
  'Taxation Class': 'New Private Cars',
  'UNIT': 'Number',
  'VALUE': 125818,
  'Year': 1997},
 {'Statistic': 'Vehicles Licensed for the First Time',
  'Taxation Class': 'New Goods Vehicles',
  'UNIT': 'Number',
  'VALUE': 18895,
  'Year': 1997},
 {'Statistic': 'Vehicles Licensed for the First Time',
  'Taxation Class': 'New Tractors',
  'UNIT': 'Number',
  'VALUE': 1848,
  'Year': 1997},

In [None]:
from frictionless import Parser

class HtmlParser(Parser):
    """Parser that reads rows from, and writes rows to, an HTML table.

    Reading uses PyQuery to locate the table (via the dialect's ``selector``)
    and yields the first row as headers followed by the text of every
    remaining row's ``<td>`` cells. Writing renders the rows into a minimal
    ``<html><body><table>`` document and hands it to the resource's loader.
    """

    requires_loader = True
    supported_types = [
        "string",
    ]

    # Read

    def read_list_stream_create(self):
        """Yield the header row, then each data row, as lists of cell text.

        Yields:
            list[str]: first the header cells (``th`` or ``td`` of the first
                row), then one list per following row that has ``td`` cells.
        """
        # Imported locally: the notebook never brings `helpers` into scope,
        # so the original body failed with NameError.
        from frictionless import helpers

        pq = helpers.import_from_plugin("pyquery", plugin="html").PyQuery
        dialect = self.resource.dialect

        # Get Page content
        page = pq(self.loader.text_stream.read(), parser="html")

        # Find required table
        if dialect.selector:
            table = pq(page.find(dialect.selector)[0])
        else:
            table = page

        # Stream headers
        data = (
            table.children("thead").children("tr")
            + table.children("thead")
            + table.children("tr")
            + table.children("tbody").children("tr")
        )
        data = [pq(r) for r in data if len(r) > 0]
        first_row = data.pop(0)
        headers = [pq(th).text() for th in first_row.find("th,td")]
        yield headers

        # Stream data
        data = [pq(tr).find("td") for tr in data]
        data = [[pq(td).text() for td in tr] for tr in data if len(tr) > 0]
        yield from data

    # Write

    def write_row_stream(self, resource):
        """Render ``resource`` as an HTML table and write it to the target.

        Args:
            resource: the source resource whose ``row_stream`` is rendered.

        Returns:
            Whatever the target loader's ``write_byte_stream`` returns.
        """
        # Imported locally: none of these names are in scope elsewhere in the
        # notebook, so the original body failed with NameError.
        import tempfile
        from html import escape

        from frictionless import system

        source = resource
        target = self.resource
        # Collect fragments and join once instead of growing a string in a loop.
        parts = ["<html><body><table>\n"]
        with source:
            for row in source.row_stream:
                if row.row_number == 1:
                    # Emit the field names once, ahead of the first data row.
                    parts.append("<tr>")
                    for name in row.field_names:
                        parts.append(f"<td>{escape(str(name), quote=False)}</td>")
                    parts.append("</tr>\n")
                cells = row.to_list(types=self.supported_types)
                parts.append("<tr>")
                for cell in cells:
                    # Escape &, < and > so cell text cannot break the markup.
                    parts.append(f"<td>{escape(str(cell), quote=False)}</td>")
                parts.append("</tr>\n")
        parts.append("</table></body></html>")
        # delete=False keeps the file on disk after the `with` block so the
        # loader can still read it by name.
        with tempfile.NamedTemporaryFile("wt", delete=False) as file:
            file.write("".join(parts))
        loader = system.create_loader(target)
        result = loader.write_byte_stream(file.name)
        return result

In [None]:
from frictionless import Dialect, Metadata

class HtmlDialect(Dialect):
    """Dialect carrying the options for HTML tables.

    The only option is ``selector``, which HtmlParser passes to
    ``page.find(...)`` to locate the table. It falls back to ``"table"``
    when no value is stored.
    """

    def __init__(self, descriptor=None, *, selector=None):
        """Create the dialect.

        Args:
            descriptor: passed unchanged to the base-class constructor.
            selector: keyword-only value registered under "selector".
        """
        # The selector is registered before the base constructor runs.
        # NOTE(review): presumably so the base class merges it with the
        # descriptor - confirm against the Metadata implementation.
        self.setinitial("selector", selector)
        super().__init__(descriptor)

    @Metadata.property
    def selector(self):
        """
        Returns:
            str: the stored "selector" value, or "table" when none is stored
        """
        return self.get("selector", "table")

    # Expand

    def expand(self):
        """Store the effective selector under "selector" via ``setdefault``."""
        self.setdefault("selector", self.selector)

    # Metadata

    # Profile describing this dialect's descriptor: an object that allows no
    # properties other than a string "selector".
    metadata_profile = {  # type: ignore
        "type": "object",
        "additionalProperties": False,
        "properties": {
            "selector": {"type": "string"},
        },
    }