# Great Expectations Task

## 1. Install Great Expectations Library


In [None]:
!pip install great_expectations

##2. Import Necessary Libraries

In [None]:
import pandas as pd
import great_expectations as gx

##3. Load Labels.csv

Download and upload the [Labels.csv](https://github.com/zubxxr/SOFE3980U-Lab5/blob/main/Labels.csv) into this notebook, and then load the file.

In [None]:
df = pd.read_csv(r"C:\Users\12269\Downloads\Labels.csv")


##4. Preview the Dataset

In [None]:
df.head()

 Timestamp  Car1_Location_X  ...  pedestrianLocationX_BottomRight  pedestrianLocationY_BottomRight
0  1736796157       -51.402977  ...                              610                              410
1  1736796167       -53.819637  ...                              594                              415
2  1736796178       -50.239144  ...                              854                              720
3  1736796188       -53.707220  ...                              567                              425
4  1736796198       -52.053721  ...                              537                              413

[5 rows x 14 columns]
>>>

##5. Set Up Great Expectations Context and Data Source

In [None]:
# Write code here
context = gx.get_context()
data_source = context.data_sources.add_pandas("pandas")
data_asset = data_source.add_dataframe_asset(name="pd dataframe asset")
     

##6. Define and Create a Data Batch

In [None]:
# Write code here
batch_definition = data_asset.add_batch_definition_whole_dataframe("batch definition")
batch = batch_definition.get_batch(batch_parameters={"dataframe": df})


##7. Define Three Expectations for Column Values

Using this [link](https://greatexpectations.io/expectations/), choose three expectation functions and apply them to the labels dataset in a relevant manner.

You should replace the 'ExpectColumnValuesToBeBetween' function with other functions you select from the link.

You can also check the format/parameters required of each function when you click "See more" on the function.

In [None]:
## Original Function
expectation = gx.expectations.ExpectColumnValuesToBeBetween(
    column="column", min_value=0, max_value=20
)

## Example Function

## This function only requires a column parameter, and not a max or min value
expectation = gx.expectations.ExpectColumnValuesToBeUnique(
    column="column"
)

### Expectation 1

In [None]:
# Write code here
ExpectColumnMinToBeBetween(
    column="test",
    min_value=.5,
    max_value=1
)

### Validate Data Against Expectation 1

In [None]:
# Write code here
>>> validation_result = batch.validate(expectation)
Calculating Metrics:  75%|███████████████████████████████████████████████████                 | 3/4 [00:00<00:00, 187.35it/s]
>>> print(validation_result)
{
  "success": false,
  "expectation_config": {
    "type": "expect_column_min_to_be_between",
    "kwargs": {
      "column": "test",
      "min_value": 0.5,
      "max_value": 1.0,
      "batch_id": "pandas-pd dataframe asset"
    },
    "meta": {}
  },
  "result": {},
  "meta": {},
  "exception_info": {
    "('column.min', '62ee768614516cf86253241845707c28', ())": {
      "exception_traceback": "Traceback (most recent call last):\n  File \"C:\\Windows\\System32\\ge_env\\lib\\site-packages\\great_expectations\\execution_engine\\execution_engine.py\", line 533, in _process_direct_and_bundled_metric_computation_configurations\n    metric_computation_configuration.metric_fn(  # type: ignore[misc] # F not callable\n  File \"C:\\Windows\\System32\\ge_env\\lib\\site-packages\\great_expectations\\expectations\\metrics\\metric_provider.py\", line 60, in inner_func\n    return metric_fn(*args, **kwargs)\n  File \"C:\\Windows\\System32\\ge_env\\lib\\site-packages\\great_expectations\\expectations\\metrics\\column_aggregate_metric_provider.py\", line 77, in inner_func\n    metric_domain_kwargs = get_dbms_compatible_metric_domain_kwargs(\n  File \"C:\\Windows\\System32\\ge_env\\lib\\site-packages\\great_expectations\\expectations\\metrics\\util.py\", line 719, in get_dbms_compatible_metric_domain_kwargs\n    column_name: str | sqlalchemy.quoted_name = get_dbms_compatible_column_names(\n  File \"C:\\Windows\\System32\\ge_env\\lib\\site-packages\\great_expectations\\expectations\\metrics\\util.py\", line 789, in get_dbms_compatible_column_names\n    _verify_column_names_exist_and_get_normalized_typed_column_names_map(\n  File \"C:\\Windows\\System32\\ge_env\\lib\\site-packages\\great_expectations\\expectations\\metrics\\util.py\", line 874, in _verify_column_names_exist_and_get_normalized_typed_column_names_map\n    raise gx_exceptions.InvalidMetricAccessorDomainKwargsKeyError(\ngreat_expectations.exceptions.exceptions.InvalidMetricAccessorDomainKwargsKeyError: Error: The column \"test\" in BatchData does not exist.\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n  File \"C:\\Windows\\System32\\ge_env\\lib\\site-packages\\great_expectations\\validator\\validation_graph.py\", line 276, in _resolve\n    self._execution_engine.resolve_metrics(\n  File \"C:\\Windows\\System32\\ge_env\\lib\\site-packages\\great_expectations\\execution_engine\\execution_engine.py\", line 279, in resolve_metrics\n    return self._process_direct_and_bundled_metric_computation_configurations(\n  File \"C:\\Windows\\System32\\ge_env\\lib\\site-packages\\great_expectations\\execution_engine\\execution_engine.py\", line 538, in _process_direct_and_bundled_metric_computation_configurations\n    raise gx_exceptions.MetricResolutionError(\ngreat_expectations.exceptions.exceptions.MetricResolutionError: Error: The column \"test\" in BatchData does not exist.\n",
      "exception_message": "Error: The column \"test\" in BatchData does not exist.",
      "raised_exception": true
    }
  }
}
>>>

### Expectation 2

In [None]:
# Write code here
ExpectColumnDistinctValuesToBeInSet(
    column="test",
    value_set=[1, 2, 3, 4, 5]
)

### Validate Data Against Expectation 2

In [None]:
# Write code here
>>> validation_result = batch.validate(expectation)
Calculating Metrics:  75%|███████████████████████████████████████████████████                 | 3/4 [00:00<00:00, 107.12it/s]
>>> print(validation_result)
{
  "success": false,
  "expectation_config": {
    "type": "expect_column_distinct_values_to_be_in_set",
    "kwargs": {
      "column": "test",
      "value_set": [
        1,
        2,
        3,
        4,
        5
      ],
      "batch_id": "pandas-pd dataframe asset"
    },
    "meta": {}
  },
  "result": {},
  "meta": {},
  "exception_info": {
    "('column.value_counts', '62ee768614516cf86253241845707c28', '817a2a474179468a8636eb1eccaf4fdf')": {
      "exception_traceback": "Traceback (most recent call last):\n  File \"C:\\Windows\\System32\\ge_env\\lib\\site-packages\\pandas\\core\\indexes\\base.py\", line 3653, in get_loc\n    return self._engine.get_loc(casted_key)\n  File \"pandas\\_libs\\index.pyx\", line 147, in pandas._libs.index.IndexEngine.get_loc\n  File \"pandas\\_libs\\index.pyx\", line 176, in pandas._libs.index.IndexEngine.get_loc\n  File \"pandas\\_libs\\hashtable_class_helper.pxi\", line 7080, in pandas._libs.hashtable.PyObjectHashTable.get_item\n  File \"pandas\\_libs\\hashtable_class_helper.pxi\", line 7088, in pandas._libs.hashtable.PyObjectHashTable.get_item\nKeyError: 'test'\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n  File \"C:\\Windows\\System32\\ge_env\\lib\\site-packages\\great_expectations\\execution_engine\\execution_engine.py\", line 533, in _process_direct_and_bundled_metric_computation_configurations\n    metric_computation_configuration.metric_fn(  # type: ignore[misc] # F not callable\n  File \"C:\\Windows\\System32\\ge_env\\lib\\site-packages\\great_expectations\\expectations\\metrics\\metric_provider.py\", line 60, in inner_func\n    return metric_fn(*args, **kwargs)\n  File \"C:\\Windows\\System32\\ge_env\\lib\\site-packages\\great_expectations\\expectations\\metrics\\column_aggregate_metrics\\column_value_counts.py\", line 55, in _pandas\n    counts: pd.Series = df[column].value_counts()\n  File \"C:\\Windows\\System32\\ge_env\\lib\\site-packages\\pandas\\core\\frame.py\", line 3761, in __getitem__\n    indexer = self.columns.get_loc(key)\n  File \"C:\\Windows\\System32\\ge_env\\lib\\site-packages\\pandas\\core\\indexes\\base.py\", line 3655, in get_loc\n    raise KeyError(key) from err\nKeyError: 'test'\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n  File \"C:\\Windows\\System32\\ge_env\\lib\\site-packages\\great_expectations\\validator\\validation_graph.py\", line 276, in _resolve\n    self._execution_engine.resolve_metrics(\n  File \"C:\\Windows\\System32\\ge_env\\lib\\site-packages\\great_expectations\\execution_engine\\execution_engine.py\", line 279, in resolve_metrics\n    return self._process_direct_and_bundled_metric_computation_configurations(\n  File \"C:\\Windows\\System32\\ge_env\\lib\\site-packages\\great_expectations\\execution_engine\\execution_engine.py\", line 538, in _process_direct_and_bundled_metric_computation_configurations\n    raise gx_exceptions.MetricResolutionError(\ngreat_expectations.exceptions.exceptions.MetricResolutionError: 'test'\n",
      "exception_message": "'test'",
      "raised_exception": true
    }
  }
}
>>>

### Expectation 3

In [None]:
# Write code here
ExpectColumnDistinctValuesToContainSet(
    column="test",
    value_set=[1, 4]
)

### Validate Data Against Expectation 3

In [None]:
# Write code here
>>> validation_result = batch.validate(expectation)
Calculating Metrics:  75%|███████████████████████████████████████████████████                 | 3/4 [00:00<00:00, 231.01it/s]
>>> print(validation_result)
{
  "success": false,
  "expectation_config": {
    "type": "expect_column_distinct_values_to_contain_set",
    "kwargs": {
      "column": "test",
      "value_set": [
        1,
        4
      ],
      "batch_id": "pandas-pd dataframe asset"
    },
    "meta": {}
  },
  "result": {},
  "meta": {},
  "exception_info": {
    "('column.value_counts', '62ee768614516cf86253241845707c28', '817a2a474179468a8636eb1eccaf4fdf')": {
      "exception_traceback": "Traceback (most recent call last):\n  File \"C:\\Windows\\System32\\ge_env\\lib\\site-packages\\pandas\\core\\indexes\\base.py\", line 3653, in get_loc\n    return self._engine.get_loc(casted_key)\n  File \"pandas\\_libs\\index.pyx\", line 147, in pandas._libs.index.IndexEngine.get_loc\n  File \"pandas\\_libs\\index.pyx\", line 176, in pandas._libs.index.IndexEngine.get_loc\n  File \"pandas\\_libs\\hashtable_class_helper.pxi\", line 7080, in pandas._libs.hashtable.PyObjectHashTable.get_item\n  File \"pandas\\_libs\\hashtable_class_helper.pxi\", line 7088, in pandas._libs.hashtable.PyObjectHashTable.get_item\nKeyError: 'test'\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n  File \"C:\\Windows\\System32\\ge_env\\lib\\site-packages\\great_expectations\\execution_engine\\execution_engine.py\", line 533, in _process_direct_and_bundled_metric_computation_configurations\n    metric_computation_configuration.metric_fn(  # type: ignore[misc] # F not callable\n  File \"C:\\Windows\\System32\\ge_env\\lib\\site-packages\\great_expectations\\expectations\\metrics\\metric_provider.py\", line 60, in inner_func\n    return metric_fn(*args, **kwargs)\n  File \"C:\\Windows\\System32\\ge_env\\lib\\site-packages\\great_expectations\\expectations\\metrics\\column_aggregate_metrics\\column_value_counts.py\", line 55, in _pandas\n    counts: pd.Series = df[column].value_counts()\n  File \"C:\\Windows\\System32\\ge_env\\lib\\site-packages\\pandas\\core\\frame.py\", line 3761, in __getitem__\n    indexer = self.columns.get_loc(key)\n  File \"C:\\Windows\\System32\\ge_env\\lib\\site-packages\\pandas\\core\\indexes\\base.py\", line 3655, in get_loc\n    raise KeyError(key) from err\nKeyError: 'test'\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n  File \"C:\\Windows\\System32\\ge_env\\lib\\site-packages\\great_expectations\\validator\\validation_graph.py\", line 276, in _resolve\n    self._execution_engine.resolve_metrics(\n  File \"C:\\Windows\\System32\\ge_env\\lib\\site-packages\\great_expectations\\execution_engine\\execution_engine.py\", line 279, in resolve_metrics\n    return self._process_direct_and_bundled_metric_computation_configurations(\n  File \"C:\\Windows\\System32\\ge_env\\lib\\site-packages\\great_expectations\\execution_engine\\execution_engine.py\", line 538, in _process_direct_and_bundled_metric_computation_configurations\n    raise gx_exceptions.MetricResolutionError(\ngreat_expectations.exceptions.exceptions.MetricResolutionError: 'test'\n",
      "exception_message": "'test'",
      "raised_exception": true
    }
  }
}
>>>