# Great Expectations

In [1]:
# Create a file-backed Data Context, using the current working directory
# ('./') as the project root. Every later cell goes through `context`.

from great_expectations.data_context import FileDataContext

context = FileDataContext.create(project_root_dir='./')

In [2]:
# Register a pandas Datasource. Datasource names must be unique within a
# Data Context, so add_or_update_pandas is used instead of add_pandas:
# re-running this cell then replaces the stored Datasource rather than
# failing because the name is already taken (the same add-or-update approach
# this notebook uses for the expectation suite).
datasource_name = 'csv-data-cleaned'
datasource = context.sources.add_or_update_pandas(name=datasource_name)

# Attach the cleaned CSV file to the Datasource as a named data asset.
asset_name = 'car_crash'
path_to_data = 'P2M3_Allen_data_cleaned.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build the batch request that the validator will use to load the data.
batch_request = asset.build_batch_request()

In [3]:
# Create the expectation suite (or update it if it already exists).
expectation_suite_name = 'expectation-car-crash-dataset'
context.add_or_update_expectation_suite(
    expectation_suite_name=expectation_suite_name,
)

# Build a validator that ties the suite above to the batch request.
validator = context.get_validator(
    expectation_suite_name=expectation_suite_name,
    batch_request=batch_request,
)

# Preview the first rows to confirm the validator can read the data.
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,year,month,day,weekend,hour,collision_type,injury_type,primary_factor,reported_location,unique_id
0,2015,1,5,Weekday,0,2-Car,No injury/unknown,OTHER (DRIVER) - EXPLAIN IN NARRATIVE,1ST & FESS,0
1,2015,1,6,Weekday,1500,2-Car,No injury/unknown,FOLLOWING TOO CLOSELY,2ND & COLLEGE,1
2,2015,1,6,Weekend,2300,2-Car,Non-incapacitating,DISREGARD SIGNAL/REG SIGN,BASSWOOD & BLOOMFIELD,2
3,2015,1,7,Weekend,900,2-Car,Non-incapacitating,FAILURE TO YIELD RIGHT OF WAY,GATES & JACOBS,3
4,2015,1,7,Weekend,1100,2-Car,No injury/unknown,FAILURE TO YIELD RIGHT OF WAY,W 3RD,4


In [4]:
# Expectation 1: every value in column `unique_id` must be unique.

validator.expect_column_values_to_be_unique(column='unique_id')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 29381,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [5]:
# Expectation 2: every value in column `hour` must lie between 0 and 2300
# (bounds included).
# NOTE(review): the preview rows show values such as 900, 1500 and 2300, so
# the column looks HHMM-encoded -- confirm against the data dictionary.

validator.expect_column_values_to_be_between(column='hour', min_value=0, max_value=2300)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 29381,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [6]:
# Expectation 3: every value in column `month` must be one of the twelve
# month numbers 1..12.
validator.expect_column_values_to_be_in_set(
    column='month',
    value_set=list(range(1, 13)),
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 29381,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [7]:
# Expectation 4: column `year` must be stored with dtype int64.

validator.expect_column_values_to_be_in_type_list(
    column='year',
    type_list=['int64'],
)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "int64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [8]:
# Expectation 5: column `injury_type` must contain between 1 and 4 distinct
# values.

validator.expect_column_unique_value_count_to_be_between(
    column='injury_type',
    min_value=1,
    max_value=4,
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 4
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [9]:
# Expectation 6: the distinct values of column `collision_type` must all
# belong to the set of known collision categories listed below.

validator.expect_column_distinct_values_to_be_in_set(
    column='collision_type',
    value_set=[
        '2-Car',
        '1-Car',
        '3+ Cars',
        'Pedestrian',
        'Cyclist',
        'Bus',
        'Moped/Motorcycle',
    ],
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": [
      "1-Car",
      "2-Car",
      "3+ Cars",
      "Bus",
      "Cyclist",
      "Moped/Motorcycle",
      "Pedestrian"
    ],
    "details": {
      "value_counts": [
        {
          "value": "1-Car",
          "count": 6055
        },
        {
          "value": "2-Car",
          "count": 19746
        },
        {
          "value": "3+ Cars",
          "count": 1678
        },
        {
          "value": "Bus",
          "count": 510
        },
        {
          "value": "Cyclist",
          "count": 338
        },
        {
          "value": "Moped/Motorcycle",
          "count": 605
        },
        {
          "value": "Pedestrian",
          "count": 449
        }
      ]
    }
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [10]:
# Expectation 7: the median of column `year` must lie between 2012 and 2015.

validator.expect_column_median_to_be_between(
    column='year',
    min_value=2012,
    max_value=2015,
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 2012.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}