## The pandas Ecosystem

In [33]:
import pandas as pd
import numpy as np
import pyarrow as pa

### Foundational Libraries

#### NumPy

In [34]:
# Build a 3x3 grid holding the integers 1..9 and wrap it in a DataFrame.
# The -1 asks NumPy to infer the second dimension from the element count.
arr = np.reshape(np.arange(1, 10), (3, -1))
df = pd.DataFrame(data=arr)

df

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


In [35]:
# Export the frame's values as a plain NumPy array; the row/column labels
# are not part of the result.
df.to_numpy()

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [36]:
# A NumPy ufunc can be applied to the DataFrame directly; the result is
# displayed as a labelled DataFrame rather than a bare array.
np.log(df)

Unnamed: 0,0,1,2
0,0.0,0.693147,1.098612
1,1.386294,1.609438,1.791759
2,1.94591,2.079442,2.197225


### PyArrow

In [37]:
# Convert the pandas DataFrame into a pyarrow Table. Ending the cell with
# `tbl` displays the table's schema followed by its column data.
tbl = pa.Table.from_pandas(df)
tbl

pyarrow.Table
0: int64
1: int64
2: int64
----
0: [[1,4,7]]
1: [[2,5,8]]
2: [[3,6,9]]

In [38]:
# Round trip: turn the pyarrow Table back into a pandas DataFrame.
tbl.to_pandas()

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


### Exploratory Data Analysis

#### YData Profiling

In [39]:
# Read only the columns listed below from the vehicles CSV, using pandas'
# nullable extension dtypes, then preview the first rows.
selected_columns = [
    "id",
    "engId",
    "make",
    "model",
    "cylinders",
    "city08",
    "highway08",
    "year",
    "trany",
]
df = pd.read_csv(
    "../data/vehicles.csv",
    usecols=selected_columns,
    dtype_backend="numpy_nullable",
)
df.head()

Unnamed: 0,city08,cylinders,engId,highway08,id,make,model,trany,year
0,19,4,9011,25,1,Alfa Romeo,Spider Veloce 2000,Manual 5-spd,1985
1,9,12,22020,14,10,Ferrari,Testarossa,Manual 5-spd,1985
2,23,4,2100,33,100,Dodge,Charger,Manual 5-spd,1985
3,10,8,2850,12,1000,Dodge,B150/B250 Wagon 2WD,Automatic 3-spd,1985
4,17,4,66031,23,10000,Subaru,Legacy AWD Turbo,Manual 5-spd,1993


In [40]:
# NOTE(review): profiling is left disabled here, presumably because
# ydata_profiling is an optional install; confirm before re-enabling.
# Uncommented, these lines build a ProfileReport for `df` and write it to
# "vehicles_profile.html".

# from ydata_profiling import ProfileReport
# profile = ProfileReport(df, title="Vehicles Profile Report")
# profile.to_file("vehicles_profile.html")

### Data Validation

#### Great Expectations

In [41]:
# Read the full vehicles CSV with nullable extension dtypes. The columns in
# `string_columns` are pinned to the pandas string dtype explicitly instead
# of being left to type inference.
string_columns = ["rangeA", "mfrCode", "c240Dscr", "c240bDscr"]
df = pd.read_csv(
    "../data/vehicles.csv",
    dtype={name: pd.StringDtype() for name in string_columns},
    dtype_backend="numpy_nullable",
)
df.head()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,14.167143,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,27.046364,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,11.018889,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,27.046364,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,15.658421,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


**Note:** Run the following cells with Python 3.9 to 3.12.

In [42]:
# NOTE(review): the Great Expectations cells are left commented out,
# presumably because of the Python version requirement stated in the
# markdown above; confirm before re-enabling.
# Uncommented, this imports the library and obtains a data context.

# import great_expectations as gx
# context = gx.get_context()

In [43]:
# Disabled. Uncommented, this registers a pandas data source named
# "pandas_datasource" on `context` and adds a dataframe asset named
# "vehicles" to it.

# datasource = context.data_sources.add_pandas(name="pandas_datasource")
# data_asset = datasource.add_dataframe_asset(name="vehicles")

In [44]:
# Disabled. Uncommented, this adds a whole-dataframe batch definition to
# `data_asset` and requests a batch, passing `df` as the "dataframe"
# batch parameter.

# batch_definition_name = "dataframe_definition"
# batch_definition = data_asset.add_batch_definition_whole_dataframe(
#     batch_definition_name
# )
# batch = batch_definition.get_batch(batch_parameters={
#     "dataframe": df
# })

In [45]:
# Disabled. Uncommented, this validates the batch against an expectation
# that column "city08" contains no null values and prints the result.

# city_exp = gx.expectations.ExpectColumnValuesToNotBeNull(
#     column="city08"
# )
# result = batch.validate(city_exp)
# print(result)

In [46]:
# Disabled. Uncommented, this validates the batch against an expectation
# that column "cylinders" contains no null values and prints the result.

# cylinders_exp = gx.expectations.ExpectColumnValuesToNotBeNull(
#     column="cylinders"
# )
# result = batch.validate(cylinders_exp)
# print(result)

### Visualization

In [47]:
# Reload the full vehicles CSV so this section starts from a fresh frame.
# The columns in `string_columns` are pinned to the pandas string dtype
# explicitly instead of being left to type inference.
string_columns = ["rangeA", "mfrCode", "c240Dscr", "c240bDscr"]
df = pd.read_csv(
    "../data/vehicles.csv",
    dtype={name: pd.StringDtype() for name in string_columns},
    dtype_backend="numpy_nullable",
)

df.head()

Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,14.167143,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,27.046364,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,11.018889,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,27.046364,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,15.658421,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


#### Plotly

In [48]:
# Scatter "city08" against "highway08" through the plotly plotting backend.
# The hover_data mapping enables "make", "model" and "year" in the hover
# tooltip. The generic df.plot(kind=...) form is kept so only these keyword
# arguments are forwarded to the backend.
df.plot(
    x="city08",
    y="highway08",
    kind="scatter",
    backend="plotly",
    hover_data=dict.fromkeys(["make", "model", "year"], True),
)