[Feat]: Support Polars DataFrame and make Pandas and Polars optional
0warning0error committed Mar 2, 2023
1 parent 7c691f6 commit a09ae0b
Showing 13 changed files with 4,534 additions and 2,531 deletions.
8 changes: 7 additions & 1 deletion README.md
@@ -27,7 +27,7 @@
</a>
</p>

[**PyGWalker**](https://github.com/Kanaries/pygwalker) can simplify your Jupyter Notebook data analysis and data visualization workflow, by turning your pandas dataframe into a Tableau-style User Interface for visual exploration.
[**PyGWalker**](https://github.com/Kanaries/pygwalker) can simplify your Jupyter Notebook data analysis and data visualization workflow, by turning your pandas dataframe (and polars dataframe) into a Tableau-style User Interface for visual exploration.

**PyGWalker** (pronounced like "Pig Walker", just for fun) is named as an abbreviation of "**Py**thon binding of **G**raphic **Walker**". It integrates Jupyter Notebook (or other jupyter-based notebooks) with [Graphic Walker](https://github.com/Kanaries/graphic-walker), a different type of open-source alternative to Tableau. It allows data scientists to analyze data and visualize patterns with simple drag-and-drop operations.

@@ -116,6 +116,12 @@ df = pd.read_csv('./bike_sharing_dc.csv', parse_dates=['date'])
gwalker = pyg.walk(df)
```

You can also use pygwalker with polars:
```python
import polars as pl
df = pl.read_csv('./bike_sharing_dc.csv', try_parse_dates=True)
gwalker = pyg.walk(df)
```
You can even try it online by simply visiting [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Kanaries/pygwalker/main?labpath=tests%2Fmain.ipynb), [Google Colab](https://colab.research.google.com/drive/171QUQeq-uTLgSj1u-P9DQig7Md1kpXQ2?usp=sharing), or [Kaggle Code](https://www.kaggle.com/code/asmdef/notebook1cc9d36936).

<!-- ![](https://docs-us.oss-us-west-1.aliyuncs.com/img/pygwalker/screenshot-top-img.png) -->
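Because this commit turns pandas and polars into optional extras (see the pyproject.toml changes below), the chosen backend has to be installed alongside pygwalker. A hedged install sketch using standard pip extras syntax with the extra names introduced here:

```bash
pip install "pygwalker[pandas]"   # pandas backend only
pip install "pygwalker[polars]"   # polars backend only
pip install "pygwalker[all]"      # both backends
```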
1 change: 1 addition & 0 deletions environment.yml
@@ -8,5 +8,6 @@ dependencies:
- pandas
- python>=3.5
- pip
- polars
- pip:
- pygwalker>=0.1
1,153 changes: 376 additions & 777 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion pygwalker/base.py
@@ -1,5 +1,4 @@
import os, sys, json
import pandas as pd
import typing as tp
import IPython
from IPython.display import display, Javascript, HTML, IFrame
12 changes: 6 additions & 6 deletions pygwalker/gwalker.py
@@ -3,11 +3,11 @@
from .utils.gwalker_props import get_props
from .utils.render import render_gwalker_html

def to_html(df: pd.DataFrame, gid: tp.Union[int, str]=None, **kwargs):
def to_html(df: "pl.DataFrame | pd.DataFrame", gid: tp.Union[int, str]=None, **kwargs):
"""Generate embeddable HTML code of Graphic Walker with data of `df`.
Args:
df (pd.DataFrame, optional): dataframe.
df (pl.DataFrame | pd.DataFrame, optional): dataframe.
gid (tp.Union[int, str], optional): GraphicWalker container div's id ('gwalker-{gid}')
hideDataSourceConfig (bool, optional): Hide DataSource import and export button (True) or not (False). Default to True
"""
@@ -19,11 +19,11 @@ def to_html(df: pd.DataFrame, gid: tp.Union[int, str]=None, **kwargs):
html = render_gwalker_html(gid, props)
return html

def walk(df: pd.DataFrame, gid: tp.Union[int, str]=None, **kwargs):
def walk(df: "pl.DataFrame | pd.DataFrame", gid: tp.Union[int, str]=None, **kwargs):
"""walk through pandas.DataFrame df with Graphic Walker
Args:
df (pd.DataFrame, optional): dataframe.
df (pl.DataFrame | pd.DataFrame, optional): dataframe.
gid (tp.Union[int, str], optional): GraphicWalker container div's id ('gwalker-{gid}')
hideDataSourceConfig (bool, optional): Hide DataSource import and export button (True) or not (False). Default to True
return_html (bool, optional): Directly return a html string. Defaults to False.
@@ -40,7 +40,7 @@ def walk(df: pd.DataFrame, gid: tp.Union[int, str]=None, **kwargs):
display(HTML(html))

class GWalker:
def __init__(self, df: pd.DataFrame=None, **kwargs):
def __init__(self, df: "pl.DataFrame | pd.DataFrame"=None, **kwargs):
global global_gid
self.gid = global_gid
global_gid += 1
@@ -54,7 +54,7 @@ def walk(self, **kwargs):
html = self.to_html(**kwargs)
display(HTML(html))

def update(self, df: pd.DataFrame=None, **kwargs):
def update(self, df: "pl.DataFrame | pd.DataFrame"=None, **kwargs):
pass

@property
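With the annotations widened to accept either frame type, the same entry points serve both backends. A minimal usage sketch (the sample data is illustrative; `to_html` and `walk` are the functions defined in this module):

```python
import pandas as pd
import polars as pl
from pygwalker.gwalker import to_html, walk

data = {"x": [1, 2, 3], "label": ["a", "b", "c"]}

html_from_pandas = to_html(pd.DataFrame(data))  # embeddable HTML string
html_from_polars = to_html(pl.DataFrame(data))  # same call, polars backend

# walk() renders the same UI inline in a notebook instead of returning HTML.
walk(pl.DataFrame(data))
```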
1,276 changes: 0 additions & 1,276 deletions pygwalker/templates/graphic-walker.iife.js

This file was deleted.

245 changes: 191 additions & 54 deletions pygwalker/utils/gwalker_props.py
@@ -1,57 +1,194 @@
from ..base import *
from .fname_encodings import fname_decode, fname_encode

def infer_prop(s: pd.Series, i=None) -> tp.Dict:
"""get IMutField
Args:
s (pd.Series): the column
i (int, optional): column id. Defaults to None.
Returns:
tp.Dict: _description_
"""
kind = s.dtype.kind
# print(f'{s.name}: type={s.dtype}, kind={s.dtype.kind}')
v_cnt = len(s.value_counts())
semanticType = 'quantitative' if \
(kind in 'fcmiu' and v_cnt > 16) \
else 'temporal' if kind in 'M' \
else 'nominal' if kind in 'bOSUV' or v_cnt <= 2 \
else 'ordinal'
# 'quantitative' | 'nominal' | 'ordinal' | 'temporal';
analyticType = 'measure' if \
kind in 'fcm' or (kind in 'iu' and len(s.value_counts()) > 16) \
else 'dimension'
import json
fname = fname_decode(s.name)
fname = json.dumps(fname, ensure_ascii=False)[1:-1]
return {
'fid': s.name, # f'col-{i}-{s.name}' if i is not None else s.name,
'name': fname,
'semanticType': semanticType,
'analyticType': analyticType
}

def to_records(df: pd.DataFrame):
df = df.replace({float('nan'): None})
return df.to_dict(orient='records')

def raw_fields(df: pd.DataFrame):
return [
infer_prop(df[col], i)
for i, col in enumerate(df.columns)
]

def get_props(df: pd.DataFrame, **kwargs):
df = df.reset_index()
df = df.rename(fname_encode, axis='columns')
props = {
'dataSource': to_records(df),
'rawFields': raw_fields(df),
'hideDataSourceConfig': kwargs.get('hideDataSourceConfig', True),
'fieldkeyGuard': False,
'themeKey': 'g2',
**kwargs,
}
return props
class PandasDataFramePropGetter:
@classmethod
def infer_prop(cls, s: "pd.Series", i=None) -> tp.Dict:
"""get IMutField
Args:
s (pd.Series): the column
i (int, optional): column id. Defaults to None.
Returns:
tp.Dict: _description_
"""
kind = s.dtype.kind
# print(f'{s.name}: type={s.dtype}, kind={s.dtype.kind}')
v_cnt = len(s.value_counts())
semanticType = 'quantitative' if \
(kind in 'fcmiu' and v_cnt > 16) \
else 'temporal' if kind in 'M' \
else 'nominal' if kind in 'bOSUV' or v_cnt <= 2 \
else 'ordinal'
# 'quantitative' | 'nominal' | 'ordinal' | 'temporal';
analyticType = 'measure' if \
kind in 'fcm' or (kind in 'iu' and len(s.value_counts()) > 16) \
else 'dimension'
import json
fname = fname_decode(s.name)
fname = json.dumps(fname, ensure_ascii=False)[1:-1]
return {
'fid': s.name, # f'col-{i}-{s.name}' if i is not None else s.name,
'name': fname,
'semanticType': semanticType,
'analyticType': analyticType
}
@classmethod
def to_records(cls,df: "pd.DataFrame"):
df = df.replace({float('nan'): None})
return df.to_dict(orient='records')

@classmethod
def raw_fields(cls,df: "pd.DataFrame"):
return [
cls.infer_prop(df[col], i)
for i, col in enumerate(df.columns)
]

@classmethod
def get_props(cls,df: "pd.DataFrame", **kwargs):
df = df.reset_index()
df = df.rename(fname_encode, axis='columns')
props = {
'dataSource': cls.to_records(df),
'rawFields': cls.raw_fields(df),
'hideDataSourceConfig': kwargs.get('hideDataSourceConfig', True),
'fieldkeyGuard': False,
'themeKey': 'g2',
**kwargs,
}
return props
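For example, under these rules an integer column with many distinct values becomes a quantitative measure, while a low-cardinality object column becomes a nominal dimension. A quick illustrative check (the sample series are hypothetical):

```python
import pandas as pd

price = pd.Series(range(100), name="price")       # dtype kind 'i', 100 distinct values
city = pd.Series(["NY", "LA"] * 50, name="city")  # dtype kind 'O', 2 distinct values

print(price.dtype.kind, price.nunique())  # 'i' 100 -> 'quantitative' / 'measure'
print(city.dtype.kind, city.nunique())    # 'O' 2   -> 'nominal' / 'dimension'
```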

class PolarsDataFramePropGetter:
@classmethod
def infer_prop(cls,s: "pl.Series", i=None) -> tp.Dict:
"""get IMutField
Args:
s (pl.Series): the column
i (int, optional): column id. Defaults to None.
Returns:
tp.Dict: _description_
"""
kind = s.dtype
# print(f'{s.name}: type={s.dtype}, kind={s.dtype.kind}')
v_cnt = len(s.value_counts())
semanticType = 'quantitative' if \
(kind in [pl.Int16,pl.Int32,pl.Int64,
pl.UInt8,pl.UInt16,pl.UInt32,
pl.UInt64,pl.Duration,pl.Float32,pl.Float64] and v_cnt > 16) \
else 'temporal' if kind in [pl.Datetime,pl.Date,pl.Time] \
else 'nominal' if kind in [pl.Boolean,pl.Object,pl.Utf8,pl.Categorical,pl.Struct,pl.List] or v_cnt <= 2 \
else 'ordinal'
# 'quantitative' | 'nominal' | 'ordinal' | 'temporal';
analyticType = 'measure' if \
kind in [pl.Float32,pl.Float64,pl.Duration] or (kind in [pl.Int16,pl.Int32,pl.Int64,
pl.UInt8,pl.UInt16,pl.UInt32,
pl.UInt64] and v_cnt > 16) \
else 'dimension'
import json
fname = fname_decode(s.name)
fname = json.dumps(fname, ensure_ascii=False)[1:-1]
return {
'fid': s.name, # f'col-{i}-{s.name}' if i is not None else s.name,
'name': fname,
'semanticType': semanticType,
'analyticType': analyticType
}

@classmethod
def to_records(cls,df: "pl.DataFrame"):
df = df.fill_nan(None)
return df.to_dicts()
# old style (kept for reference; the parallel version below redefines raw_fields and takes precedence)
@classmethod
def raw_fields(cls,df: "pl.DataFrame"):
return [
cls.infer_prop(df[col], i)
for i, col in enumerate(df.columns)
]
# new style using parallel polars expressions (this definition overrides the one above)
@classmethod
def raw_fields(cls,df: "pl.DataFrame"):
def colname2fname(s):
import json
fname = fname_decode(s)
fname = json.dumps(fname, ensure_ascii=False)[1:-1]
return fname
def colname2semanticType(kind,v_cnt):
semanticType = 'quantitative' if \
(kind in ["Int16","Int32","Int64",
"UInt8","UInt16","UInt32",
"UInt64","Duration","Float32","Float64"] and v_cnt > 16) \
else 'temporal' if kind in ["Datetime","Date","Time"] \
else 'nominal' if kind in ["Boolean","Object","Utf8","Categorical","Struct","List"] or v_cnt <= 2 \
else 'ordinal'
return semanticType
def colname2analyticType(kind,v_cnt):
analyticType = 'measure' if \
kind in ["Float32","Float64","Duration"] or (kind in ["Int16","Int32","Int64",
"UInt8","UInt16","UInt32",
"UInt64"] and v_cnt > 16) \
else 'dimension'
return analyticType
def type2str(t):
for root_type in (pl.Int16,pl.Int32,pl.Int64,
pl.UInt8,pl.UInt16,pl.UInt32,
pl.UInt64,pl.Float32,pl.Float64,
pl.Datetime,pl.Boolean,pl.Object,pl.Utf8,pl.Date,
pl.Categorical,pl.Struct,pl.List,pl.Duration):
if t == root_type:
return str(root_type)
return str(t)
counts_data = df.select([pl.col("*").value_counts().count()]).row(0)

col_info = pl.DataFrame({"fid":df.schema.keys(),
# directly storing the type instance and applying a function leads to a segmentation fault
"kind":map(type2str,df.schema.values()),
"v_cnt":counts_data},)
result_data = col_info.select([
pl.col("fid"),
pl.col("fid").apply(colname2fname).alias("name"),
pl.struct(["kind","v_cnt"]).apply(lambda d:colname2semanticType(**d)).alias("semanticType"),
pl.struct(["kind","v_cnt"]).apply(lambda d:colname2analyticType(**d)).alias("analyticType")
])
return result_data.to_dicts()

@classmethod
def get_props(cls,df: "pl.DataFrame", **kwargs):
df = df.rename({i : fname_encode(i) for i in df.columns})
props = {
'dataSource': cls.to_records(df),
'rawFields': cls.raw_fields(df),
'hideDataSourceConfig': kwargs.get('hideDataSourceConfig', True),
'fieldkeyGuard': False,
'themeKey': 'g2',
**kwargs,
}
return props

class OtherPropGetter:
@classmethod
def get_props(cls,df: "Any", **kwargs):
return {}

__classname2method = {}
def get_props(df: "pl.DataFrame | pd.DataFrame" , **kwargs):
df_type = type(df)
props = __classname2method.get(df_type,OtherPropGetter).get_props(df,**kwargs)
return props
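Any object whose type is not registered falls through to `OtherPropGetter`, which returns an empty dict. A small hedged illustration (assuming both optional backends are installed):

```python
import pandas as pd
import polars as pl

# Unregistered types fall back to OtherPropGetter and yield {}.
assert get_props(object()) == {}

# Registered frame types route to their backend-specific getter.
pandas_props = get_props(pd.DataFrame({"x": [1, 2, 3]}))
polars_props = get_props(pl.DataFrame({"x": [1, 2, 3]}))
```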


for module_name in ("pandas","polars"):
try:
exec("{0} = __import__('{0}')".format(module_name))
if module_name == "polars":
import polars as pl
__classname2method[pl.DataFrame] = PolarsDataFramePropGetter
elif module_name == "pandas":
import pandas as pd
__classname2method[pd.DataFrame] = PandasDataFramePropGetter
except ImportError:
continue
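The registration loop above uses exec to create the module-level pandas/polars bindings and then imports them again explicitly. An equivalent, exec-free sketch using importlib (an alternative shown for illustration, not what this commit ships):

```python
import importlib

registry = {}  # DataFrame class -> prop getter, mirroring __classname2method above
for module_name in ("pandas", "polars"):
    try:
        module = importlib.import_module(module_name)
    except ImportError:
        continue  # backend not installed; skip registration
    getter = PandasDataFramePropGetter if module_name == "pandas" else PolarsDataFramePropGetter
    registry[module.DataFrame] = getter
```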
3 changes: 2 additions & 1 deletion pygwalker/utils/render.py
@@ -1,4 +1,5 @@
from ..base import *
import datetime

def gwalker_script():
global gwalker_js
@@ -15,7 +16,7 @@ def gwalker_script():

class DataFrameEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, pd.Timestamp):
if isinstance(obj, (datetime.datetime, datetime.date, datetime.time)):
return str(obj)
return json.JSONEncoder.default(self, obj)

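Switching the check from pd.Timestamp to the standard datetime types keeps pandas working (pd.Timestamp subclasses datetime.datetime) while also covering the date, time, and datetime values produced by polars rows. A hedged usage sketch with illustrative sample values:

```python
import datetime
import json

records = [{
    "date": datetime.date(2023, 3, 2),
    "ts": datetime.datetime(2023, 3, 2, 12, 0),
}]

# DataFrameEncoder stringifies any datetime/date/time value it encounters.
print(json.dumps(records, cls=DataFrameEncoder))
# [{"date": "2023-03-02", "ts": "2023-03-02 12:00:00"}]
```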
7 changes: 6 additions & 1 deletion pyproject.toml
@@ -14,9 +14,14 @@ classifiers = [
[tool.poetry.dependencies]
python = "^3.5"
jinja2 = "*"
pandas = "*"
ipython = "*"
pandas = { version = "*", optional = true }
polars = { version = "*", optional = true }

[tool.poetry.extras]
pandas = ["pandas"]
polars = ["polars"]
all = ["pandas", "polars"]

[tool.poetry.group.dev.dependencies]
build = "*"
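For contributors installing from source with Poetry, the new extras can be selected at install time. A hedged sketch assuming Poetry's standard --extras flag:

```bash
poetry install --extras "pandas"          # pandas backend only
poetry install --extras "polars"          # polars backend only
poetry install --extras "pandas polars"   # everything the "all" extra covers
```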
