In [1]:
print("hello world")

hello world


# Selecting Random samples, Duplicate Data, Query method()

### Random samples

A random selection of rows or columns from a Series or DataFrame with the sample() method. The method will sample rows by default, and accepts a specific number of rows/columns to return, or a fraction of rows.


In [11]:
import pandas as pd
import numpy as np

In [3]:
s = pd.Series([0, 1, 2, 3, 4, 5])
s.sample()


2    2
dtype: int64

In [4]:
s.sample(n=3)


0    0
5    5
2    2
dtype: int64

In [6]:
s.sample(frac=0.5)


5    5
4    4
2    2
dtype: int64

#### By default, sample will return each row at most once, but one can also sample with replacement using the replace option:
#### Allow or disallow sampling of the same row more than once.


In [7]:
s = pd.Series([0, 1, 2, 3, 4, 5])
s.sample(n=6, replace=False)


2    2
3    3
4    4
5    5
1    1
0    0
dtype: int64

In [8]:
s.sample(n=6, replace=True)


0    0
0    0
4    4
4    4
1    1
0    0
dtype: int64

# The query() Method

DataFrame objects have a query() method that allows selection using an expression.

In [12]:
n = 10
df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))
df

# pure python
df[(df['a'] < df['b']) & (df['b'] < df['c'])]

# query
df.query('(a < b) & (b < c)')


Unnamed: 0,a,b,c
1,0.494571,0.671018,0.860636
3,0.144347,0.508456,0.871434
4,0.158685,0.426295,0.579296


In [13]:
f = pd.DataFrame(np.random.randint(n / 2, size=(n, 2)), columns=list('bc'))
df.index.name = 'a'
df
df.query('a < b and b < c')
#If instead you don’t want to or cannot name your index, you can use the name index in your query expression:
df = pd.DataFrame(np.random.randint(n, size=(n, 2)), columns=list('bc'))
df
df.query('index < b < c')


Unnamed: 0,b,c
3,4,7


# MultiIndex query() Syntax

You can also use the levels of a DataFrame with a MultiIndex as if they were columns in the frame:

In [14]:
n = 10
colors = np.random.choice(['red', 'green'], size=n)
foods = np.random.choice(['eggs', 'ham'], size=n)
index = pd.MultiIndex.from_arrays([colors, foods], names=['color', 'food'])
df = pd.DataFrame(np.random.randn(n, 2), index=index)
df
df.query('color == "red"')


Unnamed: 0_level_0,Unnamed: 1_level_0,0,1
color,food,Unnamed: 2_level_1,Unnamed: 3_level_1
red,eggs,-0.819383,1.569453
red,eggs,-0.426495,0.105737
red,ham,-0.509202,0.046725


# The in and not in operators

In [15]:
# query() also supports special use of Python’s in and not in comparison operators, providing a succinct syntax for calling the isin method of a Series or DataFrame.
df = pd.DataFrame({'a': list('aabbccddeeff'), 'b': list('aaaabbbbcccc'),
                   'c': np.random.randint(5, size=12),
                   'd': np.random.randint(9, size=12)})
df
df.query('a in b')
# How you'd do it in pure Python
df[df['a'].isin(df['b'])]
df.query('a not in b')
# pure Python
df[~df['a'].isin(df['b'])]

Unnamed: 0,a,b,c,d
6,d,b,2,2
7,d,b,4,2
8,e,c,0,6
9,e,c,0,6
10,f,c,0,3
11,f,c,3,4


In [16]:
# You can combine this with other expressions for very succinct queries:
# rows where cols a and b have overlapping values
# and col c's values are less than col d's
df.query('a in b and c < d')
# pure Python
df[df['b'].isin(df['a']) & (df['c'] < df['d'])]


Unnamed: 0,a,b,c,d
0,a,a,1,6
2,b,a,0,8
3,b,a,2,5
8,e,c,0,6
9,e,c,0,6
10,f,c,0,3
11,f,c,3,4


# Special use of the == operator with list objects

In [17]:
# Comparing a list of values to a column using ==/!= works similarly to in/not in.
df.query('b == ["a", "b", "c"]')
# pure Python
df[df['b'].isin(["a", "b", "c"])]
df.query('c == [1, 2]')
df.query('c != [1, 2]')
# using in/not in
df.query('[1, 2] in c')
df.query('[1, 2] not in c')
# pure Python
df[df['c'].isin([1, 2])]


Unnamed: 0,a,b,c,d
0,a,a,1,6
1,a,a,2,0
3,b,a,2,5
4,c,b,2,2
6,d,b,2,2


# Duplicate data

If you want to identify and remove duplicate rows in a DataFrame, there are two methods that will help: duplicated and drop_duplicates. Each takes as an argument the columns to use to identify duplicated rows.

duplicated returns a boolean vector whose length is the number of rows, and which indicates whether a row is duplicated.

drop_duplicates removes duplicate rows.

By default, the first observed row of a duplicate set is considered unique, but each method has a keep parameter to specify targets to be kept.

keep='first' (default): mark / drop duplicates except for the first occurrence.

keep='last': mark / drop duplicates except for the last occurrence.

keep=False: mark / drop all duplicates.


In [18]:
df2 = pd.DataFrame({'a': ['one', 'one', 'two', 'two', 'two', 'three', 'four'],
                    'b': ['x', 'y', 'x', 'y', 'x', 'x', 'x'],
                    'c': np.random.randn(7)})
df2
df2.duplicated('a')
df2.duplicated('a', keep='last')
df2.duplicated('a', keep=False)



0     True
1     True
2     True
3     True
4     True
5    False
6    False
dtype: bool

In [19]:
# Also, you can pass a list of columns to identify duplications.
df2.duplicated(['a', 'b'])
df2.drop_duplicates(['a', 'b'])

# To drop duplicates by index value, use Index.duplicated then perform slicing. The same set of options are available for the keep parameter.
df3 = pd.DataFrame({'a': np.arange(6),
                    'b': np.random.randn(6)},
                   index=['a', 'a', 'b', 'c', 'b', 'a'])
df3
df3.index.duplicated()
df3[~df3.index.duplicated()]
df3[~df3.index.duplicated(keep='last')]
df3[~df3.index.duplicated(keep=False)]


Unnamed: 0,a,b
c,3,0.629188
