In [1]:
%load_ext autoreload
%autoreload 2

import meerkat as mk
import numpy as np
import pandas as pd
import os
import pickle
from itertools import product

import numpy as np
import numpy.testing as np_test
import pytest
import torch

from meerkat.columns.pandas_column import PandasSeriesColumn

class MockStrColumn:
    def __init__(self, use_visible_rows: bool = False, col_type: type = None):
        self.array = [f"row_{idx}" for idx in range(16)]
        self.col = col_type(self.array)

        if use_visible_rows:
            self.visible_rows = [0, 4, 6, 11]
            self.col.visible_rows = self.visible_rows
        else:
            self.visible_rows = np.arange(16)

In [4]:
x = pd.Series([1,2,3])
x.to_pickle("test.pd")

In [6]:
class Base:

    @classmethod
    def test(cls):
        print(cls)

class SubClass(Base):
    pass

SubClass().test()

<class '__main__.SubClass'>


In [8]:

col = mk.NumpyArrayColumn(["123a", "b", "c", "ddasdasda"])
col.data

array(['123a', 'b', 'c', 'ddasdasda'], dtype='<U9')

In [23]:
from meerkat.columns.pandas_column import CachedAccessor, _MeerkatStringMethods, _MeerkatCombinedDatetimelikeProperties, _MeerkatCategoricalAccessor
import numbers
class PandasSeriesColumn(
    mk.AbstractColumn,
    np.lib.mixins.NDArrayOperatorsMixin,
):
    def __init__(
        self,
        data = None,
        dtype: str = None,
        *args,
        **kwargs,
    ):
        if isinstance(data, pd.Series):
            data = data if dtype is None else data.astype(dtype)
        elif data is not None:
            data = pd.Series(data, dtype=dtype)
        super(PandasSeriesColumn, self).__init__(data=data, *args, **kwargs)
    def _repr_pandas_(self) -> pd.Series:
        return self.data

    def to_tensor(self) -> torch.Tensor:
        """Use `column.to_tensor()` instead of `torch.tensor(column)`, which is
        very slow."""
        # TODO (Sabri): understand why `torch.tensor(column)` is so slow
        return torch.tensor(self.data)

    def to_pandas(self) -> pd.Series:
        return self.data
    _HANDLED_TYPES = (np.ndarray, numbers.Number)

    str = CachedAccessor("str", _MeerkatStringMethods)
    dt = CachedAccessor("dt", _MeerkatCombinedDatetimelikeProperties)
    cat = CachedAccessor("cat", _MeerkatCategoricalAccessor)
    # plot = CachedAccessor("plot", pandas.plotting.PlotAccessor)
    # sparse = CachedAccessor("sparse", SparseAccessor)

    def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
        out = kwargs.get("out", ())
        for x in inputs + out:
            # Only support operations with instances of _HANDLED_TYPES.
            # Use ArrayLike instead of type(self) for isinstance to
            # allow subclasses that don't override __array_ufunc__ to
            # handle ArrayLike objects.
            if not isinstance(x, self._HANDLED_TYPES + (PandasSeriesColumn,)):
                return NotImplemented

        # Defer to the implementation of the ufunc on unwrapped values.
        inputs = tuple(
            x.data if isinstance(x, PandasSeriesColumn) else x for x in inputs
        )
        if out:
            kwargs["out"] = tuple(
                x.data if isinstance(x, PandasSeriesColumn) else x for x in out
            )
        result = getattr(ufunc, method)(*inputs, **kwargs)

        if type(result) is tuple:
            # multiple return values
            return tuple(type(self)(x) for x in result)
        elif method == "at":
            # no return value
            return None
        else:
            # one return value
            return type(self)(result)

    def __getattr__(self, name):
        try:
            out = getattr(object.__getattribute__(self, "data"), name)
            if isinstance(out, Callable):
                return getattr_decorator(out)
            else:
                return out
        except AttributeError:
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '{name}'"
            )


col = PandasSeriesColumn(["a", "b", "c", "d"])
import pickle 
buf = pickle.dumps(col)
new_col = pickle.loads(buf)
new_col

AttributeError: 'NoneType' object has no attribute 'to_frame'

PandasSeriesColumn(None)

In [5]:
col = mk.PandasSeriesColumn([1,2,3,4])
def func(x):
    out = x + 1
    return out

result = col.map(
    func, batch_size=2, is_batched_fn=False, output_type=PandasSeriesColumn
)
result

Unnamed: 0,(PandasSeriesColumn)
0,2
1,3
0,4
1,5


In [3]:
testbed = MockStrColumn(
    col_type=PandasSeriesColumn
)
col = testbed.col

upper_col = col.str.upper()
assert isinstance(upper_col, PandasSeriesColumn)
assert col[0] == f"ROW_{testbed.visible_rows[0]}"


AssertionError: 

In [6]:
col.values

array(['row_0', 'row_1', 'row_2', 'row_3', 'row_4', 'row_5', 'row_6',
       'row_7', 'row_8', 'row_9', 'row_10', 'row_11', 'row_12', 'row_13',
       'row_14', 'row_15'], dtype=object)

In [4]:
col = PandasSeriesColumn(["05/30/96", "10/20/23"])
dt_col = pd.to_datetime(col)

In [7]:
dt_col.dt.day.values

array([30, 20])

In [9]:
mk.NumpyArrayColumn.from_data(col)

Unnamed: 0,(NumpyArrayColumn)
0,a
1,b
2,c


In [12]:
import meerkat as mk
mk.ListColumn.from_data(np.arange(10))

Unnamed: 0,(NumpyArrayColumn)
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


In [13]:
mk.ListColumn.from_data(range(10))

Unnamed: 0,(NumpyArrayColumn)
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9
