[Feat]: Support Polars DataFrame and make Pandas and Polars optional
0warning0error committed Mar 2, 2023
1 parent 7c691f6 commit a09ae0b
Showing 13 changed files with 4,534 additions and 2,531 deletions.
8 changes: 7 additions & 1 deletion README.md
@@ -27,7 +27,7 @@
</a>
</p>

[**PyGWalker**](https://github.com/Kanaries/pygwalker) can simplify your Jupyter Notebook data analysis and data visualization workflow, by turning your pandas dataframe into a Tableau-style User Interface for visual exploration.
[**PyGWalker**](https://github.com/Kanaries/pygwalker) can simplify your Jupyter Notebook data analysis and data visualization workflow, by turning your pandas dataframe (and polars dataframe) into a Tableau-style User Interface for visual exploration.

**PyGWalker** (pronounced like "Pig Walker", just for fun) is named as an abbreviation of "**Py**thon binding of **G**raphic **Walker**". It integrates Jupyter Notebook (or other jupyter-based notebooks) with [Graphic Walker](https://github.com/Kanaries/graphic-walker), a different type of open-source alternative to Tableau. It allows data scientists to analyze data and visualize patterns with simple drag-and-drop operations.

@@ -116,6 +116,12 @@ df = pd.read_csv('./bike_sharing_dc.csv', parse_dates=['date'])
gwalker = pyg.walk(df)
```

You can also use pygwalker with polars:
```python
import polars as pl
df = pl.read_csv('./bike_sharing_dc.csv', try_parse_dates=True)
gwalker = pyg.walk(df)
```
You can even try it online by simply visiting [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Kanaries/pygwalker/main?labpath=tests%2Fmain.ipynb), [Google Colab](https://colab.research.google.com/drive/171QUQeq-uTLgSj1u-P9DQig7Md1kpXQ2?usp=sharing), or [Kaggle Code](https://www.kaggle.com/code/asmdef/notebook1cc9d36936).

<!-- ![](https://docs-us.oss-us-west-1.aliyuncs.com/img/pygwalker/screenshot-top-img.png) -->
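Because this commit turns pandas and polars into optional extras (see the pyproject.toml changes below), the chosen backend has to be installed alongside pygwalker. A hedged install sketch using standard pip extras syntax with the extra names introduced here:

```bash
pip install "pygwalker[pandas]"   # pandas backend only
pip install "pygwalker[polars]"   # polars backend only
pip install "pygwalker[all]"      # both backends
```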
1 change: 1 addition & 0 deletions environment.yml
@@ -8,5 +8,6 @@ dependencies:
- pandas
- python>=3.5
- pip
- polars
- pip:
- pygwalker>=0.1
1,153 changes: 376 additions & 777 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion pygwalker/base.py
@@ -1,5 +1,4 @@
import os, sys, json
import pandas as pd
import typing as tp
import IPython
from IPython.display import display, Javascript, HTML, IFrame
12 changes: 6 additions & 6 deletions pygwalker/gwalker.py
@@ -3,11 +3,11 @@
from .utils.gwalker_props import get_props
from .utils.render import render_gwalker_html

def to_html(df: pd.DataFrame, gid: tp.Union[int, str]=None, **kwargs):
def to_html(df: "pl.DataFrame | pd.DataFrame", gid: tp.Union[int, str]=None, **kwargs):
"""Generate embeddable HTML code of Graphic Walker with data of `df`.
Args:
df (pd.DataFrame, optional): dataframe.
df (pl.DataFrame | pd.DataFrame, optional): dataframe.
gid (tp.Union[int, str], optional): GraphicWalker container div's id ('gwalker-{gid}')
hideDataSourceConfig (bool, optional): Hide DataSource import and export button (True) or not (False). Default to True
"""
@@ -19,11 +19,11 @@ def to_html(df: pd.DataFrame, gid: tp.Union[int, str]=None, **kwargs):
html = render_gwalker_html(gid, props)
return html

def walk(df: pd.DataFrame, gid: tp.Union[int, str]=None, **kwargs):
def walk(df: "pl.DataFrame | pd.DataFrame", gid: tp.Union[int, str]=None, **kwargs):
"""walk through pandas.DataFrame df with Graphic Walker
Args:
df (pd.DataFrame, optional): dataframe.
df (pl.DataFrame | pd.DataFrame, optional): dataframe.
gid (tp.Union[int, str], optional): GraphicWalker container div's id ('gwalker-{gid}')
hideDataSourceConfig (bool, optional): Hide DataSource import and export button (True) or not (False). Default to True
return_html (bool, optional): Directly return a html string. Defaults to False.
@@ -40,7 +40,7 @@ def walk(df: pd.DataFrame, gid: tp.Union[int, str]=None, **kwargs):
display(HTML(html))

class GWalker:
def __init__(self, df: pd.DataFrame=None, **kwargs):
def __init__(self, df: "pl.DataFrame | pd.DataFrame"=None, **kwargs):
global global_gid
self.gid = global_gid
global_gid += 1
@@ -54,7 +54,7 @@ def walk(self, **kwargs):
html = self.to_html(**kwargs)
display(HTML(html))

def update(self, df: pd.DataFrame=None, **kwargs):
def update(self, df: "pl.DataFrame | pd.DataFrame"=None, **kwargs):
pass

@property
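With the annotations widened to accept either frame type, the same entry points serve both backends. A minimal usage sketch (the sample data is illustrative; `to_html` and `walk` are the functions defined in this module):

```python
import pandas as pd
import polars as pl
from pygwalker.gwalker import to_html, walk

data = {"x": [1, 2, 3], "label": ["a", "b", "c"]}

html_from_pandas = to_html(pd.DataFrame(data))  # embeddable HTML string
html_from_polars = to_html(pl.DataFrame(data))  # same call, polars backend

# walk() renders the same UI inline in a notebook instead of returning HTML.
walk(pl.DataFrame(data))
```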
1,276 changes: 0 additions & 1,276 deletions pygwalker/templates/graphic-walker.iife.js

This file was deleted.

245 changes: 191 additions & 54 deletions pygwalker/utils/gwalker_props.py
@@ -1,57 +1,194 @@
from ..base import *
from .fname_encodings import fname_decode, fname_encode

def infer_prop(s: pd.Series, i=None) -> tp.Dict:
"""get IMutField
Args:
s (pd.Series): the column
i (int, optional): column id. Defaults to None.
Returns:
tp.Dict: _description_
"""
kind = s.dtype.kind
# print(f'{s.name}: type={s.dtype}, kind={s.dtype.kind}')
v_cnt = len(s.value_counts())
semanticType = 'quantitative' if \
(kind in 'fcmiu' and v_cnt > 16) \
else 'temporal' if kind in 'M' \
else 'nominal' if kind in 'bOSUV' or v_cnt <= 2 \
else 'ordinal'
# 'quantitative' | 'nominal' | 'ordinal' | 'temporal';
analyticType = 'measure' if \
kind in 'fcm' or (kind in 'iu' and len(s.value_counts()) > 16) \
else 'dimension'
import json
fname = fname_decode(s.name)
fname = json.dumps(fname, ensure_ascii=False)[1:-1]
return {
'fid': s.name, # f'col-{i}-{s.name}' if i is not None else s.name,
'name': fname,
'semanticType': semanticType,
'analyticType': analyticType
}

def to_records(df: pd.DataFrame):
df = df.replace({float('nan'): None})
return df.to_dict(orient='records')

def raw_fields(df: pd.DataFrame):
return [
infer_prop(df[col], i)
for i, col in enumerate(df.columns)
]

def get_props(df: pd.DataFrame, **kwargs):
df = df.reset_index()
df = df.rename(fname_encode, axis='columns')
props = {
'dataSource': to_records(df),
'rawFields': raw_fields(df),
'hideDataSourceConfig': kwargs.get('hideDataSourceConfig', True),
'fieldkeyGuard': False,
'themeKey': 'g2',
**kwargs,
}
return props
class PandasDataFramePropGetter:
@classmethod
def infer_prop(cls, s: "pd.Series", i=None) -> tp.Dict:
"""get IMutField
Args:
s (pd.Series): the column
i (int, optional): column id. Defaults to None.
Returns:
tp.Dict: _description_
"""
kind = s.dtype.kind
# print(f'{s.name}: type={s.dtype}, kind={s.dtype.kind}')
v_cnt = len(s.value_counts())
semanticType = 'quantitative' if \
(kind in 'fcmiu' and v_cnt > 16) \
else 'temporal' if kind in 'M' \
else 'nominal' if kind in 'bOSUV' or v_cnt <= 2 \
else 'ordinal'
# 'quantitative' | 'nominal' | 'ordinal' | 'temporal';
analyticType = 'measure' if \
kind in 'fcm' or (kind in 'iu' and len(s.value_counts()) > 16) \
else 'dimension'
import json
fname = fname_decode(s.name)
fname = json.dumps(fname, ensure_ascii=False)[1:-1]
return {
'fid': s.name, # f'col-{i}-{s.name}' if i is not None else s.name,
'name': fname,
'semanticType': semanticType,
'analyticType': analyticType
}
@classmethod
def to_records(cls,df: "pd.DataFrame"):
df = df.replace({float('nan'): None})
return df.to_dict(orient='records')

@classmethod
def raw_fields(cls,df: "pd.DataFrame"):
return [
cls.infer_prop(df[col], i)
for i, col in enumerate(df.columns)
]

@classmethod
def get_props(cls,df: "pd.DataFrame", **kwargs):
df = df.reset_index()
df = df.rename(fname_encode, axis='columns')
props = {
'dataSource': cls.to_records(df),
'rawFields': cls.raw_fields(df),
'hideDataSourceConfig': kwargs.get('hideDataSourceConfig', True),
'fieldkeyGuard': False,
'themeKey': 'g2',
**kwargs,
}
return props
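For example, under these rules an integer column with many distinct values becomes a quantitative measure, while a low-cardinality object column becomes a nominal dimension. A quick illustrative check (the sample series are hypothetical):

```python
import pandas as pd

price = pd.Series(range(100), name="price")       # dtype kind 'i', 100 distinct values
city = pd.Series(["NY", "LA"] * 50, name="city")  # dtype kind 'O', 2 distinct values

print(price.dtype.kind, price.nunique())  # 'i' 100 -> 'quantitative' / 'measure'
print(city.dtype.kind, city.nunique())    # 'O' 2   -> 'nominal' / 'dimension'
```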

class PolarsDataFramePropGetter:
@classmethod
def infer_prop(cls,s: "pl.Series", i=None) -> tp.Dict:
"""get IMutField
Args:
s (pl.Series): the column
i (int, optional): column id. Defaults to None.
Returns:
tp.Dict: _description_
"""
kind = s.dtype
# print(f'{s.name}: type={s.dtype}, kind={s.dtype.kind}')
v_cnt = len(s.value_counts())
semanticType = 'quantitative' if \
(kind in [pl.Int16,pl.Int32,pl.Int64,
pl.UInt8,pl.UInt16,pl.UInt32,
pl.UInt64,pl.Duration,pl.Float32,pl.Float64] and v_cnt > 16) \
else 'temporal' if kind in [pl.Datetime,pl.Date,pl.Time] \
else 'nominal' if kind in [pl.Boolean,pl.Object,pl.Utf8,pl.Categorical,pl.Struct,pl.List] or v_cnt <= 2 \
else 'ordinal'
# 'quantitative' | 'nominal' | 'ordinal' | 'temporal';
analyticType = 'measure' if \
kind in [pl.Float32,pl.Float64,pl.Duration] or (kind in [pl.Int16,pl.Int32,pl.Int64,
pl.UInt8,pl.UInt16,pl.UInt32,
pl.UInt64] and v_cnt > 16) \
else 'dimension'
import json
fname = fname_decode(s.name)
fname = json.dumps(fname, ensure_ascii=False)[1:-1]
return {
'fid': s.name, # f'col-{i}-{s.name}' if i is not None else s.name,
'name': fname,
'semanticType': semanticType,
'analyticType': analyticType
}

@classmethod
def to_records(cls,df: "pl.DataFrame"):
df = df.fill_nan(None)
return df.to_dicts()
# old style (kept for reference; the parallel version below redefines raw_fields and takes precedence)
@classmethod
def raw_fields(cls,df: "pl.DataFrame"):
return [
cls.infer_prop(df[col], i)
for i, col in enumerate(df.columns)
]
# new style using parallel polars expressions (this definition overrides the one above)
@classmethod
def raw_fields(cls,df: "pl.DataFrame"):
def colname2fname(s):
import json
fname = fname_decode(s)
fname = json.dumps(fname, ensure_ascii=False)[1:-1]
return fname
def colname2semanticType(kind,v_cnt):
semanticType = 'quantitative' if \
(kind in ["Int16","Int32","Int64",
"UInt8","UInt16","UInt32",
"UInt64","Duration","Float32","Float64"] and v_cnt > 16) \
else 'temporal' if kind in ["Datetime","Date","Time"] \
else 'nominal' if kind in ["Boolean","Object","Utf8","Categorical","Struct","List"] or v_cnt <= 2 \
else 'ordinal'
return semanticType
def colname2analyticType(kind,v_cnt):
analyticType = 'measure' if \
kind in ["Float32","Float64","Duration"] or (kind in ["Int16","Int32","Int64",
"UInt8","UInt16","UInt32",
"UInt64"] and v_cnt > 16) \
else 'dimension'
return analyticType
def type2str(t):
for root_type in (pl.Int16,pl.Int32,pl.Int64,
pl.UInt8,pl.UInt16,pl.UInt32,
pl.UInt64,pl.Float32,pl.Float64,
pl.Datetime,pl.Boolean,pl.Object,pl.Utf8,pl.Date,
pl.Categorical,pl.Struct,pl.List,pl.Duration):
if t == root_type:
return str(root_type)
return str(t)
counts_data = df.select([pl.col("*").value_counts().count()]).row(0)

col_info = pl.DataFrame({"fid":df.schema.keys(),
# directly storing the type instance and applying a function leads to a segmentation fault
"kind":map(type2str,df.schema.values()),
"v_cnt":counts_data},)
result_data = col_info.select([
pl.col("fid"),
pl.col("fid").apply(colname2fname).alias("name"),
pl.struct(["kind","v_cnt"]).apply(lambda d:colname2semanticType(**d)).alias("semanticType"),
pl.struct(["kind","v_cnt"]).apply(lambda d:colname2analyticType(**d)).alias("analyticType")
])
return result_data.to_dicts()

@classmethod
def get_props(cls,df: "pl.DataFrame", **kwargs):
df = df.rename({i : fname_encode(i) for i in df.columns})
props = {
'dataSource': cls.to_records(df),
'rawFields': cls.raw_fields(df),
'hideDataSourceConfig': kwargs.get('hideDataSourceConfig', True),
'fieldkeyGuard': False,
'themeKey': 'g2',
**kwargs,
}
return props

class OtherPropGetter:
@classmethod
def get_props(cls,df: "Any", **kwargs):
return {}

__classname2method = {}
def get_props(df: "pl.DataFrame | pd.DataFrame" , **kwargs):
df_type = type(df)
props = __classname2method.get(df_type,OtherPropGetter).get_props(df,**kwargs)
return props
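Any object whose type is not registered falls through to `OtherPropGetter`, which returns an empty dict. A small hedged illustration (assuming both optional backends are installed):

```python
import pandas as pd
import polars as pl

# Unregistered types fall back to OtherPropGetter and yield {}.
assert get_props(object()) == {}

# Registered frame types route to their backend-specific getter.
pandas_props = get_props(pd.DataFrame({"x": [1, 2, 3]}))
polars_props = get_props(pl.DataFrame({"x": [1, 2, 3]}))
```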


for module_name in ("pandas","polars"):
try:
exec("{0} = __import__('{0}')".format(module_name))
if module_name == "polars":
import polars as pl
__classname2method[pl.DataFrame] = PolarsDataFramePropGetter
elif module_name == "pandas":
import pandas as pd
__classname2method[pd.DataFrame] = PandasDataFramePropGetter
except ImportError:
continue
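The registration loop above uses exec to create the module-level pandas/polars bindings and then imports them again explicitly. An equivalent, exec-free sketch using importlib (an alternative shown for illustration, not what this commit ships):

```python
import importlib

registry = {}  # DataFrame class -> prop getter, mirroring __classname2method above
for module_name in ("pandas", "polars"):
    try:
        module = importlib.import_module(module_name)
    except ImportError:
        continue  # backend not installed; skip registration
    getter = PandasDataFramePropGetter if module_name == "pandas" else PolarsDataFramePropGetter
    registry[module.DataFrame] = getter
```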
3 changes: 2 additions & 1 deletion pygwalker/utils/render.py
@@ -1,4 +1,5 @@
from ..base import *
import datetime

def gwalker_script():
global gwalker_js
@@ -15,7 +16,7 @@ def gwalker_script():

class DataFrameEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, pd.Timestamp):
if isinstance(obj, (datetime.datetime, datetime.date, datetime.time)):
return str(obj)
return json.JSONEncoder.default(self, obj)

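Switching the check from pd.Timestamp to the standard datetime types keeps pandas working (pd.Timestamp subclasses datetime.datetime) while also covering the date, time, and datetime values produced by polars rows. A hedged usage sketch with illustrative sample values:

```python
import datetime
import json

records = [{
    "date": datetime.date(2023, 3, 2),
    "ts": datetime.datetime(2023, 3, 2, 12, 0),
}]

# DataFrameEncoder stringifies any datetime/date/time value it encounters.
print(json.dumps(records, cls=DataFrameEncoder))
# [{"date": "2023-03-02", "ts": "2023-03-02 12:00:00"}]
```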
7 changes: 6 additions & 1 deletion pyproject.toml
@@ -14,9 +14,14 @@ classifiers = [
[tool.poetry.dependencies]
python = "^3.5"
jinja2 = "*"
pandas = "*"
ipython = "*"
pandas = { version = "*", optional = true }
polars = { version = "*", optional = true }

[tool.poetry.extras]
pandas = ["pandas"]
polars = ["polars"]
all = ["pandas", "polars"]

[tool.poetry.group.dev.dependencies]
build = "*"
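For contributors installing from source with Poetry, the new extras can be selected at install time. A hedged sketch assuming Poetry's standard --extras flag:

```bash
poetry install --extras "pandas"          # pandas backend only
poetry install --extras "polars"          # polars backend only
poetry install --extras "pandas polars"   # everything the "all" extra covers
```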
