Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feat]: Support Polars DataFrame and make Pandas and Polars optional #57

Merged
merged 1 commit into from Mar 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 7 additions & 1 deletion README.md
Expand Up @@ -27,7 +27,7 @@
</a>
</p>

[**PyGWalker**](https://github.com/Kanaries/pygwalker) can simplify your Jupyter Notebook data analysis and data visualization workflow, by turning your pandas dataframe into a Tableau-style User Interface for visual exploration.
[**PyGWalker**](https://github.com/Kanaries/pygwalker) can simplify your Jupyter Notebook data analysis and data visualization workflow, by turning your pandas dataframe (and polars dataframe) into a Tableau-style User Interface for visual exploration.

**PyGWalker** (pronounced like "Pig Walker", just for fun) is named as an abbreviation of "**Py**thon binding of **G**raphic **Walker**". It integrates Jupyter Notebook (or other jupyter-based notebooks) with [Graphic Walker](https://github.com/Kanaries/graphic-walker), a different type of open-source alternative to Tableau. It allows data scientists to analyze data and visualize patterns with simple drag-and-drop operations.

Expand Down Expand Up @@ -116,6 +116,12 @@ df = pd.read_csv('./bike_sharing_dc.csv', parse_dates=['date'])
gwalker = pyg.walk(df)
```

You can also use pygwalker with polars:
```python
import polars as pl
df = pl.read_csv('./bike_sharing_dc.csv',try_parse_dates = True)
gwalker = pyg.walk(df)
```
You can even try it online by visiting [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Kanaries/pygwalker/main?labpath=tests%2Fmain.ipynb), [Google Colab](https://colab.research.google.com/drive/171QUQeq-uTLgSj1u-P9DQig7Md1kpXQ2?usp=sharing) or [Kaggle Code](https://www.kaggle.com/code/asmdef/notebook1cc9d36936).

<!-- ![](https://docs-us.oss-us-west-1.aliyuncs.com/img/pygwalker/screenshot-top-img.png) -->
Expand Down
1 change: 1 addition & 0 deletions environment.yml
Expand Up @@ -8,5 +8,6 @@ dependencies:
- pandas
- python>=3.5
- pip
- polars
- pip:
- pygwalker>=0.1
1,153 changes: 376 additions & 777 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion pygwalker/base.py
@@ -1,5 +1,4 @@
import os, sys, json
import pandas as pd
import typing as tp
import IPython
from IPython.display import display, Javascript, HTML, IFrame
Expand Down
12 changes: 6 additions & 6 deletions pygwalker/gwalker.py
Expand Up @@ -3,11 +3,11 @@
from .utils.gwalker_props import get_props
from .utils.render import render_gwalker_html

def to_html(df: pd.DataFrame, gid: tp.Union[int, str]=None, **kwargs):
def to_html(df: "pl.DataFrame | pd.DataFrame", gid: tp.Union[int, str]=None, **kwargs):
"""Generate embeddable HTML code of Graphic Walker with data of `df`.

Args:
df (pd.DataFrame, optional): dataframe.
df (pl.DataFrame | pd.DataFrame , optional): dataframe.
gid (tp.Union[int, str], optional): GraphicWalker container div's id ('gwalker-{gid}')
hideDataSourceConfig (bool, optional): Hide DataSource import and export button (True) or not (False). Default to True
"""
Expand All @@ -19,11 +19,11 @@ def to_html(df: pd.DataFrame, gid: tp.Union[int, str]=None, **kwargs):
html = render_gwalker_html(gid, props)
return html

def walk(df: pd.DataFrame, gid: tp.Union[int, str]=None, **kwargs):
def walk(df: "pl.DataFrame | pd.DataFrame", gid: tp.Union[int, str]=None, **kwargs):
"""walk through pandas.DataFrame df with Graphic Walker

Args:
df (pd.DataFrame, optional): dataframe.
df (pl.DataFrame | pd.DataFrame, optional): dataframe.
gid (tp.Union[int, str], optional): GraphicWalker container div's id ('gwalker-{gid}')
hideDataSourceConfig (bool, optional): Hide DataSource import and export button (True) or not (False). Default to True
return_html (bool, optional): Directly return a html string. Defaults to False.
Expand All @@ -40,7 +40,7 @@ def walk(df: pd.DataFrame, gid: tp.Union[int, str]=None, **kwargs):
display(HTML(html))

class GWalker:
def __init__(self, df: pd.DataFrame=None, **kwargs):
def __init__(self, df: "pl.DataFrame | pd.DataFrame"=None, **kwargs):
global global_gid
self.gid = global_gid
global_gid += 1
Expand All @@ -54,7 +54,7 @@ def walk(self, **kwargs):
html = self.to_html(**kwargs)
display(HTML(html))

def update(self, df: pd.DataFrame=None, **kwargs):
def update(self, df: "pl.DataFrame | pd.DataFrame"=None, **kwargs):
pass

@property
Expand Down
245 changes: 191 additions & 54 deletions pygwalker/utils/gwalker_props.py
@@ -1,57 +1,194 @@
from ..base import *
from .fname_encodings import fname_decode, fname_encode

def infer_prop(s: pd.Series, i=None) -> tp.Dict:
    """Describe one column as a Graphic Walker field (IMutField).

    Args:
        s (pd.Series): the column to describe.
        i (int, optional): column position; accepted but not used. Defaults to None.

    Returns:
        tp.Dict: mapping with the keys 'fid', 'name', 'semanticType'
            ('quantitative' | 'nominal' | 'ordinal' | 'temporal') and
            'analyticType' ('measure' | 'dimension').
    """
    import json

    dtype_kind = s.dtype.kind
    distinct = len(s.value_counts())
    many_values = distinct > 16

    # First matching rule wins, mirroring a chained conditional expression.
    if dtype_kind in 'fcmiu' and many_values:
        semantic = 'quantitative'
    elif dtype_kind in 'M':
        semantic = 'temporal'
    elif dtype_kind in 'bOSUV' or distinct <= 2:
        semantic = 'nominal'
    else:
        semantic = 'ordinal'

    if dtype_kind in 'fcm' or (dtype_kind in 'iu' and many_values):
        analytic = 'measure'
    else:
        analytic = 'dimension'

    # json.dumps escapes the decoded name; the slice strips the surrounding quotes.
    display_name = json.dumps(fname_decode(s.name), ensure_ascii=False)[1:-1]

    return {
        'fid': s.name,
        'name': display_name,
        'semanticType': semantic,
        'analyticType': analytic
    }

def to_records(df: pd.DataFrame):
    """Return the rows of `df` as a list of dicts, with NaN replaced by None."""
    without_nan = df.replace({float('nan'): None})
    rows = without_nan.to_dict(orient='records')
    return rows

def raw_fields(df: pd.DataFrame):
    """Return the inferred field description of every column of `df`, in column order."""
    fields = []
    for position, column in enumerate(df.columns):
        fields.append(infer_prop(df[column], position))
    return fields

def get_props(df: pd.DataFrame, **kwargs):
    """Assemble the props dict handed to Graphic Walker for `df`.

    The index is turned into a regular column and every column name is passed
    through `fname_encode` before records and field descriptions are built.
    Entries in `kwargs` are applied last, so they override the defaults.
    """
    encoded = df.reset_index().rename(fname_encode, axis='columns')
    props = {
        'dataSource': to_records(encoded),
        'rawFields': raw_fields(encoded),
        'hideDataSourceConfig': kwargs.get('hideDataSourceConfig', True),
        'fieldkeyGuard': False,
        'themeKey': 'g2',
    }
    props.update(kwargs)
    return props
class PandasDataFramePropGetter:
    """Build Graphic Walker props from a pandas DataFrame."""

    @classmethod
    def infer_prop(cls, s: "pd.Series", i=None) -> tp.Dict:
        """Describe one column as a Graphic Walker field (IMutField).

        Args:
            s (pd.Series): the column to describe.
            i (int, optional): column position; accepted but not used. Defaults to None.

        Returns:
            tp.Dict: mapping with the keys 'fid', 'name', 'semanticType'
                ('quantitative' | 'nominal' | 'ordinal' | 'temporal') and
                'analyticType' ('measure' | 'dimension').
        """
        import json

        # One-character dtype kind code (see numpy's `dtype.kind`).
        kind = s.dtype.kind
        # Counted once and reused for both classifications below.
        v_cnt = len(s.value_counts())
        many_values = v_cnt > 16

        if kind in 'fcmiu' and many_values:
            semantic_type = 'quantitative'
        elif kind in 'M':
            semantic_type = 'temporal'
        elif kind in 'bOSUV' or v_cnt <= 2:
            semantic_type = 'nominal'
        else:
            semantic_type = 'ordinal'

        if kind in 'fcm' or (kind in 'iu' and many_values):
            analytic_type = 'measure'
        else:
            analytic_type = 'dimension'

        # json.dumps escapes the decoded name; the slice strips the surrounding quotes.
        fname = json.dumps(fname_decode(s.name), ensure_ascii=False)[1:-1]
        return {
            'fid': s.name,
            'name': fname,
            'semanticType': semantic_type,
            'analyticType': analytic_type
        }

    @classmethod
    def to_records(cls, df: "pd.DataFrame"):
        """Return the rows of `df` as a list of dicts, with NaN replaced by None."""
        # Presumably so that missing values serialise as JSON null -- confirm against the renderer.
        df = df.replace({float('nan'): None})
        return df.to_dict(orient='records')

    @classmethod
    def raw_fields(cls, df: "pd.DataFrame"):
        """Return the inferred field description of every column, in column order."""
        return [
            cls.infer_prop(df[col], i)
            for i, col in enumerate(df.columns)
        ]

    @classmethod
    def get_props(cls, df: "pd.DataFrame", **kwargs):
        """Assemble the props dict handed to Graphic Walker for `df`.

        The index is turned into a regular column and every column name is
        passed through `fname_encode` first. Entries in `kwargs` are spread
        last, so they override the defaults.
        """
        df = df.reset_index()
        df = df.rename(fname_encode, axis='columns')
        props = {
            'dataSource': cls.to_records(df),
            'rawFields': cls.raw_fields(df),
            'hideDataSourceConfig': kwargs.get('hideDataSourceConfig', True),
            'fieldkeyGuard': False,
            'themeKey': 'g2',
            **kwargs,
        }
        return props

class PolarsDataFramePropGetter:
    """Build Graphic Walker props from a polars DataFrame."""

    @classmethod
    def _field(cls, name, dtype, v_cnt) -> tp.Dict:
        """Classify one column from its name, polars dtype and distinct-value count.

        Single source of truth for the classification rules, shared by
        `infer_prop` and `raw_fields`.
        """
        import json

        # Built at call time: `pl` is only bound at module level once polars
        # has been imported successfully, which happens after this class body runs.
        integers = (pl.Int8, pl.Int16, pl.Int32, pl.Int64,
                    pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64)
        continuous = (pl.Float32, pl.Float64, pl.Duration)
        temporal = (pl.Datetime, pl.Date, pl.Time)
        nominal = (pl.Boolean, pl.Object, pl.Utf8,
                   pl.Categorical, pl.Struct, pl.List)

        is_integer = dtype in integers
        is_continuous = dtype in continuous
        many_values = v_cnt > 16

        # 'quantitative' | 'nominal' | 'ordinal' | 'temporal'; first matching rule wins.
        if (is_integer or is_continuous) and many_values:
            semantic_type = 'quantitative'
        elif dtype in temporal:
            semantic_type = 'temporal'
        elif dtype in nominal or v_cnt <= 2:
            semantic_type = 'nominal'
        else:
            semantic_type = 'ordinal'

        if is_continuous or (is_integer and many_values):
            analytic_type = 'measure'
        else:
            analytic_type = 'dimension'

        # json.dumps escapes the decoded name; the slice strips the surrounding quotes.
        fname = json.dumps(fname_decode(name), ensure_ascii=False)[1:-1]
        return {
            'fid': name,
            'name': fname,
            'semanticType': semantic_type,
            'analyticType': analytic_type
        }

    @classmethod
    def infer_prop(cls, s: "pl.Series", i=None) -> tp.Dict:
        """Describe one column as a Graphic Walker field (IMutField).

        Args:
            s (pl.Series): the column to describe.
            i (int, optional): column position; accepted but not used. Defaults to None.

        Returns:
            tp.Dict: mapping with the keys 'fid', 'name', 'semanticType' and 'analyticType'.
        """
        return cls._field(s.name, s.dtype, len(s.value_counts()))

    @classmethod
    def to_records(cls, df: "pl.DataFrame"):
        """Return the rows of `df` as a list of dicts, with NaN replaced by null."""
        df = df.fill_nan(None)
        return df.to_dicts()

    @classmethod
    def raw_fields(cls, df: "pl.DataFrame"):
        """Return the field description of every column, in schema order.

        The distinct-value counts of all columns are computed in a single
        `select`; the classification itself is plain Python.
        """
        if not df.columns:
            # Nothing to describe; also avoids asking for row 0 of an empty selection.
            return []
        counts = df.select([pl.col("*").value_counts().count()]).row(0)
        return [
            cls._field(name, dtype, v_cnt)
            for (name, dtype), v_cnt in zip(df.schema.items(), counts)
        ]

    @classmethod
    def get_props(cls, df: "pl.DataFrame", **kwargs):
        """Assemble the props dict handed to Graphic Walker for `df`.

        Every column name is passed through `fname_encode` first. Entries in
        `kwargs` are spread last, so they override the defaults.
        """
        df = df.rename({i: fname_encode(i) for i in df.columns})
        props = {
            'dataSource': cls.to_records(df),
            'rawFields': cls.raw_fields(df),
            'hideDataSourceConfig': kwargs.get('hideDataSourceConfig', True),
            'fieldkeyGuard': False,
            'themeKey': 'g2',
            **kwargs,
        }
        return props

class OtherPropGetter:
    """Fallback getter used when `df` is not a supported dataframe type."""

    @classmethod
    def get_props(cls, df: "Any", **kwargs):
        """Return empty props; `df` and `kwargs` are ignored."""
        return {}


# Maps a dataframe class to the getter class that knows how to describe it.
__classname2method = {}


def get_props(df: "pl.DataFrame | pd.DataFrame", **kwargs):
    """Build Graphic Walker props for `df` with the getter registered for its type.

    The classes in the MRO of `type(df)` are tried from most to least specific,
    so subclasses of a registered dataframe class are handled by their parent's
    getter instead of silently falling through to the empty fallback.

    Args:
        df: a pandas or polars DataFrame (or a subclass of one).
        **kwargs: forwarded unchanged to the selected getter.

    Returns:
        dict: the props produced by the getter, or `{}` when no registered
            class matches.
    """
    getter = OtherPropGetter
    for klass in type(df).__mro__:
        if klass in __classname2method:
            getter = __classname2method[klass]
            break
    return getter.get_props(df, **kwargs)


# pandas and polars are optional dependencies: a getter is registered only for
# the libraries that can be imported. `pd` and `pl` are bound at module level
# here because the getter classes resolve them at call time.
try:
    import pandas as pd
except ImportError:
    pass
else:
    __classname2method[pd.DataFrame] = PandasDataFramePropGetter

try:
    import polars as pl
except ImportError:
    pass
else:
    __classname2method[pl.DataFrame] = PolarsDataFramePropGetter
3 changes: 2 additions & 1 deletion pygwalker/utils/render.py
@@ -1,4 +1,5 @@
from ..base import *
import datetime

def gwalker_script():
global gwalker_js
Expand All @@ -15,7 +16,7 @@ def gwalker_script():

class DataFrameEncoder(json.JSONEncoder):
    """JSON encoder that serialises date and time cell values as strings."""

    def default(self, obj):
        """Return `str(obj)` for datetime/date/time values; defer everything else.

        Raises:
            TypeError: from the base encoder, for any other unsupported type.
        """
        # pandas.Timestamp subclasses datetime.datetime, so this one check also
        # covers pandas values without requiring pandas to be installed.
        if isinstance(obj, (datetime.datetime, datetime.date, datetime.time)):
            return str(obj)
        return super().default(obj)

Expand Down
7 changes: 6 additions & 1 deletion pyproject.toml
Expand Up @@ -14,9 +14,14 @@ classifiers = [
[tool.poetry.dependencies]
python = "^3.5"
jinja2 = "*"
pandas = "*"
ipython = "*"
pandas = { version = "*", optional = true }
polars = { version = "*", optional = true }

[tool.poetry.extras]
pandas = ["pandas"]
polars = ["polars"]
all = ["pandas", "polars"]

[tool.poetry.group.dev.dependencies]
build = "*"
Expand Down