In [1]:
# Python (datar) walk-through of the examples from the dplyr `distinct` reference:
# https://dplyr.tidyverse.org/reference/distinct.html
from datar.datasets import starwars
# Star import: presumably the source of the names used unqualified in the
# cells below (tibble, sample, nrow, distinct, f, across, contains,
# group_by, display) -- no other import is visible in this notebook.
from datar.all import *

# Show the verb's own docstring as the reference for the examples that follow.
print(distinct.__doc__)

Select only unique/distinct rows from a data frame.

    The original API:
    https://dplyr.tidyverse.org/reference/distinct.html

    Args:
        _data: The dataframe
        *columns, **mutates: Optional variables to use when determining
            uniqueness.
        _keep_all: If TRUE, keep all variables in _data

    Returns:
        A dataframe without duplicated rows in _data
    


In [2]:
# Build the demo frame: two columns, each holding 100 values sampled from
# range(10) with replacement, so duplicate (x, y) rows are expected.
# NOTE(review): no seed is set in the visible cells, so the sampled rows --
# and the counts/tables printed by the cells that use `df` -- presumably
# differ from run to run. TODO: seed if reproducible output is wanted.
df = tibble(
  x=sample(range(10), 100, replace=True),
  y=sample(range(10), 100, replace=True)
)
# Row count of the raw frame, as the baseline for the de-duplicated counts.
nrow(df)

100

In [3]:
# distinct() with no columns given, called directly (not piped); count the
# rows left after duplicates are removed.
nrow(distinct(df))

58

In [4]:
# Same count via the piped form, naming both columns explicitly with `f.`.
df >> distinct(f.x, f.y) >> nrow()

58

In [5]:
# De-duplicate on x only; the displayed result contains just the x column.
df >> distinct(f.x)

Unnamed: 0,x
0,0
1,4
2,8
4,6
6,2
7,1
8,9
9,3
19,5
23,7


In [6]:
# De-duplicate on y only; the displayed result contains just the y column.
df >> distinct(f.y)

Unnamed: 0,y
0,0
1,8
2,4
3,9
4,1
5,5
7,7
8,6
9,3
19,2


In [7]:
# Uniqueness is still judged on x alone, but _keep_all=True retains the
# other column (y) in the result.
df >> distinct(f.x, _keep_all=True)

Unnamed: 0,x,y
0,0,0
1,4,8
2,8,4
4,6,1
6,2,4
7,1,7
8,9,6
9,3,3
19,5,2
23,7,1


In [8]:
# Uniqueness judged on y alone; _keep_all=True retains x alongside it.
df >> distinct(f.y, _keep_all=True)

Unnamed: 0,x,y
0,0,0
1,4,8
2,8,4
3,8,9
4,6,1
5,8,5
7,1,7
8,9,6
9,3,3
19,5,2


In [9]:
# Keyword form: compute a new column `diff` = |x - y| and de-duplicate on
# it; the displayed result contains only the `diff` column.
df >> distinct(diff=abs(f.x-f.y))

Unnamed: 0,diff
0,0
1,4
3,1
4,5
5,3
6,2
7,6
12,7
17,8
49,9


In [10]:
# Use across(contains("color")) to pick the de-duplication columns by name
# pattern; the result shows hair_color, skin_color and eye_color.
starwars >> distinct(across(contains("color")))

Unnamed: 0,hair_color,skin_color,eye_color
0,blond,fair,blue
1,,gold,yellow
2,,"white, blue",red
3,none,white,yellow
4,brown,light,brown
...,...,...,...
79,none,pale,white
81,black,dark,dark
82,brown,light,hazel
84,none,none,black


In [12]:
# Grouped example. Note: this rebinds `df`, replacing the random frame used
# by the earlier cells with a small 4-row frame grouped by g.
df = tibble(
  g=[1, 1, 2, 2],
  x=[1, 1, 2, 1]
) >> group_by(f.g)

# distinct(f.x) on the grouped frame: the output keeps the grouping column g
# next to x, and the repeated (g=1, x=1) row appears once. display() also
# logs the grouping information.
df >> distinct(f.x) >> display()

[2021-04-02 23:59:01][datar][   INFO] # [DataFrameGroupBy] Groups: ['g'] (2)


Unnamed: 0,g,x
0,1,1
1,2,2
2,2,1
