In [1]:
# https://dplyr.tidyverse.org/reference/distinct.html
# Example dataset; used by the across()/contains() cell further down.
from datar.datasets import starwars
# The star import is what brings tibble, distinct, f and the other
# unqualified names used in the cells below into scope.
from datar.all import *

# nb_helpers is not shown here; judging by the rendered output that follows,
# nb_header prints the documentation of the function it is given.
from nb_helpers import nb_header
nb_header(distinct)

### # distinct  

##### Select only unique/distinct rows from a data frame.

The original API:  
https://dplyr.tidyverse.org/reference/distinct.html  

##### Args:
&emsp;&emsp;`_data`: The dataframe  
&emsp;&emsp;`*columns`, `**mutates`: Optional variables to use when determining  
&emsp;&emsp;&emsp;&emsp;uniqueness.  

&emsp;&emsp;`_keep_all`: If `True`, keep all variables in `_data`  

##### Returns:
&emsp;&emsp;A dataframe without duplicated rows in _data  


In [2]:
# Build a 100-row frame with two columns, each drawn from range(10) with
# replacement, then confirm the row count.
# NOTE(review): no seed is set in the cells shown here, so the counts and
# tables recorded below presumably differ from run to run -- confirm.
df = tibble(
  x=sample(range(10), 100, replace=True),
  y=sample(range(10), 100, replace=True)
)
nrow(df)

100

In [3]:
# Direct-call form with no columns named: counts the unique rows of df.
nrow(distinct(df))

65

In [4]:
# Pipe form naming both columns explicitly; the recorded count matches the
# one from distinct(df) with no columns given.
df >> distinct(f.x, f.y) >> nrow()

65

In [5]:
# Unique values of x only; y does not appear in the result.
# The index labels in the output appear to be original row positions.
df >> distinct(f.x)

Unnamed: 0,x
0,4
1,3
4,5
5,1
8,9
9,2
10,7
11,0
12,8
43,6


In [6]:
# Unique values of y only; x does not appear in the result.
df >> distinct(f.y)

Unnamed: 0,y
0,4
1,3
2,1
3,5
4,2
8,0
12,7
14,8
18,9
30,6


In [7]:
# _keep_all=True: uniqueness is still decided on x alone, but the remaining
# column (y) is kept in the output.
df >> distinct(f.x, _keep_all=True)

Unnamed: 0,x,y
0,4,4
1,3,3
4,5,2
5,1,3
8,9,0
9,2,1
10,7,1
11,0,4
12,8,7
43,6,8


In [8]:
# _keep_all=True: uniqueness is still decided on y alone, but the remaining
# column (x) is kept in the output.
df >> distinct(f.y, _keep_all=True)

Unnamed: 0,x,y
0,4,4
1,3,3
2,4,1
3,4,5
4,5,2
8,9,0
12,8,7
14,2,8
18,0,9
30,0,6


In [9]:
# A keyword argument defines a computed column (diff = |x - y|) and the rows
# are de-duplicated on it; only diff appears in the output.
df >> distinct(diff=abs(f.x-f.y))

Unnamed: 0,diff
0,0
2,3
3,1
5,2
8,9
10,6
11,4
20,8
25,5
71,7


In [10]:
# De-duplicate on every column whose name contains "color"
# (hair_color, skin_color and eye_color in the output below).
starwars >> distinct(across(contains("color")))

Unnamed: 0,hair_color,skin_color,eye_color
0,blond,fair,blue
1,,gold,yellow
2,,"white, blue",red
3,none,white,yellow
4,brown,light,brown
...,...,...,...
79,none,pale,white
81,black,dark,dark
82,brown,light,hazel
84,none,none,black


In [11]:
# Rebinds df (until now the 100-row sample frame) to a small four-row frame
# grouped by g.
df = tibble(
  g=[1, 1, 2, 2],
  x=[1, 1, 2, 1]
) >> group_by(f.g)

# On grouped data the grouping column g is kept next to x in the result, and
# the repeated (g=1, x=1) row collapses to a single row.
# The "[DataFrameGroupBy] Groups" log line is presumably emitted by display().
df >> distinct(f.x) >> display()

[2021-04-08 15:57:13][datar][   INFO] # [DataFrameGroupBy] Groups: ['g'] (2)


Unnamed: 0,g,x
0,1,1
1,2,2
2,2,1
