# In [2]:
import pandas as pd
import numpy as np

#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ REINDEX @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

# Series with explicit, unordered labels
obj = pd.Series(
    data = [4.5, 7.2, -5.3, 3.6],
    index = list('dbac'),
)
"""
d    4.5
b    7.2
a   -5.3
c    3.6"""

# reindex() builds a new Series aligned to the requested labels;
# a label that is absent from the source ('e') gets NaN
obj2 = obj.reindex(index = list('abcde'))
"""
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN"""

obj3 = pd.Series(
    data = ['blue', 'purple', 'yellow'],
    index = [0, 2, 4],
)
"""
0      blue
2    purple
4    yellow"""

# method='ffill' fills the new labels from the previous existing label
obj3.reindex(index = np.arange(6), method = 'ffill')
"""
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow"""

#@@@@ reindex DATA FRAME @@@@

frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                    columns=['Ohio', 'Texas', 'California'],
                    index=['a', 'c', 'd'])
"""Ohio 	Texas 	California
a 	0 	1 	2
c 	3 	4 	5
d 	6 	7 	8"""

# reindex rows in DF
frame2 = frame.reindex(index = ['a', 'b', 'c', 'd'])
""" Ohio 	Texas 	California
a 	0.0 	1.0 	2.0
b 	NaN 	NaN 	NaN
c 	3.0 	4.0 	5.0
d 	6.0 	7.0 	8.0"""

# reindex columns in DF (column 'Ohio' deleted, 'Utah' - added)
# method 1
states = ['Texas', 'Utah', 'California']
frame.reindex(columns = states)

# method 2 (column 'Ohio' deleted, 'Utah' - added)
frame.reindex(states, axis = 'columns')
"""Texas   Utah   California
a 	1 	   NaN 	   2
c 	4 	   NaN 	   5
d 	7 	   NaN 	   8"""

frame
""" 	
  Ohio 	Texas 	California
a 	0 	1 	2
c 	3 	4 	5
d 	6 	7 	8"""

# reindexing with 'loc' ----> every requested label must already exist in the frame (column 'Ohio' is left out of the selection)
frame.loc[['a', 'd', 'c'], ['California', 'Texas']]
"""
    California 	Texas
a 	2 	1
d 	8 	7
c 	5 	4"""

#@@@@@@@ DELETING ELEMENTS OF AXIS @@@@@@@@@@@

obj = pd.Series(np.arange(5.), index = ['a', 'b', 'c', 'd', 'e'])
"""
a    0.0
b    1.0
c    2.0
d    3.0
e    4.0"""

new_obj = obj.drop('c')
new_obj
"""
a    0.0
b    1.0
d    3.0
e    4.0"""

data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                   index = ['Ohio', 'Colorado', 'Utah', 'New York'],
                   columns = ['one', 'two', 'three', 'four'])
data
"""
         one 	two 	three 	four
Ohio 	 0 	      1 	  2 	3
Colorado 4 	      5 	  6 	7
Utah 	 8 	      9 	  10 	11
New York 12 	  13      14 	15"""

# delete row
data.drop(index = ['Colorado', 'Utah'])
""" 	one 	two 	three 	four
Ohio 	0 	1 	2 	3
New York 	12 	13 	14 	15"""

# delete column var.1
data.drop(columns = 'two')
"""  one 	three 	four
Ohio 	0 	2 	3
Colorado 	4 	6 	7
Utah 	8 	10 	11
New York 	12 	14 	15"""

# delete column var.2
data.drop('two', axis = 1)

# delete column var.3
data.drop(['two', 'four'], axis = 'columns')
"""       one 	three
Ohio 	    0 	2
Colorado 	4 	6
Utah 	    8 	10
New York 	12 	14"""

#@@@@@@@@@@@@@@@@@@@@@@@ INDEX ACCESS @@@@@@@@@@@@

obj = pd.Series(np.arange(4.), index = ['a', 'b', 'c', 'd'])
"""
a    0.0
b    1.0
c    2.0
d    3.0"""

obj['b'] # 1.0 (by label)
# positional access is spelled with iloc: plain obj[1] on a non-integer
# index is deprecated since pandas 2.1
obj.iloc[1] # 1.0
obj[2:4] # integer slice through [] selects by position here
"""
c    2.0
d    3.0"""

obj[['b', 'a', 'd']]
"""
b    1.0
a    0.0
d    3.0"""

# list of positions ---> iloc (obj[[1, 3]] relied on the same deprecated fallback)
obj.iloc[[1, 3]]
"""
b    1.0
d    3.0"""

obj[obj < 2]
"""
a    0.0
b    1.0"""

# label-based selection is best written explicitly with loc
obj.loc[['b', 'a', 'c']]
"""
b    1.0
a    0.0
c    2.0"""

obj1 = pd.Series([1, 2, 3], index = [2, 0, 1])
"""
2    1
0    2
1    3"""
obj2 = pd.Series([1, 2, 3], index = ['a', 'b', 'c'])
"""
a    1
b    2
c    3"""

# integer index: a list of integers through [] is matched against the LABELS
obj1[[0, 1, 2]]
"""
0    2
1    3
2    1"""

# string index: select by position with iloc
# (obj2[[0, 1, 2]] would rely on the deprecated positional fallback of [])
obj2.iloc[[0, 1, 2]]
"""
a    1
b    2
c    3"""

# 'loc' works only with labels, so integers fail on a string index
# obj2.loc[[0, 1, 2]] # ERROR !!!

# 'iloc' works only with integer positions, so labels fail
# obj2.iloc[['b', 'a']] # ERROR!

# !!! slicing with labels ---> the end label is INCLUDED, unlike ordinary Python (integer) slicing
obj2.loc['b':'c']
"""
b    2
c    3"""

# assigning through a label slice modifies obj2 itself
obj2.loc['b':'c'] = 2
obj2
"""
a    1
b    2
c    2"""

#@@@@@@@@@ 

data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                   index = ['Ohio', 'Colorado', 'Utah', 'New York'],
                   columns = ['one', 'two', 'three', 'four'])
data
"""
        one 	two 	three 	four
Ohio 	 0 	     1  	2 	    3
Colorado 4 	     5  	6 	    7
Utah 	 8 	     9 	    10 	    11
New York 12 	13 	    14 	    15"""

data['two'] # Series object
"""
Ohio         1
Colorado     5
Utah         9
New York    13"""

data[['three', 'one']]
"""
 	    three 	one
Ohio 	 2 	    0
Colorado 6 	    4
Utah 	 10 	8
New York 14 	12"""

# slicing ROWS
data[:2]
"""
 	       one 	two 	three 	four
Ohio 	    0 	1 	      2 	3
Colorado 	4 	5 	      6 	7"""

# selecting ROWS by condition
cond = data['three'] > 5
"""
Ohio        False
Colorado     True
Utah         True
New York     True"""

data[cond]
"""
 	        one 	two 	three 	four
Colorado 	4 	     5 	      6 	7
Utah 	    8 	     9 	      10 	11
New York 	12 	     13 	  14 	15"""

# select COLUMNS and ROWS
data[['one', 'two']][1:]
"""
 	       one 	two
Colorado 	4 	5
Utah 	    8 	9
New York 	12 	13"""

# bool DataFrame
data < 5
"""
 	     one 	two 	three 	four
Ohio 	 True 	True 	True 	True
Colorado True 	False 	False 	False
Utah 	 False 	False 	False 	False
New York False 	False 	False 	False"""

# modify DF through bool DF
data[data < 5] = 0
data
"""
 	     one 	two 	three 	four
Ohio 	 0 	    0 	    0 	    0
Colorado 0 	    5 	    6 	    7
Utah 	 8 	    9     	10 	    11
New York 12 	13 	    14 	    15"""

#@@@@@@@@@ SELECTION FROM DATA FRAME WITH LOC / ILOC @@@@@@@@@@@@@@@@

data
"""
 	     one 	two 	three 	four
Ohio 	 0 	    0 	    0 	    0
Colorado 0 	    5 	    6 	    7
Utah 	 8 	    9     	10 	    11
New York 12 	13 	    14 	    15"""

# Series object from DF
data.loc['Colorado'] 
"""
one      0
two      5
three    6
four     7
Name: Colorado, dtype: int64"""

# new DF-object from DF
data.loc[['Colorado', 'Utah']]
""" 	    one 	two 	three 	four
Colorado 	0 	     5 	      6 	7
Utah 	    8 	     9 	      10 	11"""

# selecting rows and columns at the same time
data.loc[['Colorado', 'Ohio'], ['two', 'three']]
"""
 	        two 	three
Colorado 	5 	     6
Ohio 	    0 	     0"""

# selecting by integer position
data.iloc[2] #          ---> Series, 1 full rows
"""
one       8
two       9
three    10
four     11
Name: Utah, dtype: int64"""

data.iloc[[2, 3]] #      ---> DataFrame, 2 full rows
"""
 	       one 	two 	three 	four
Utah 	    8 	 9 	     10 	11
New York 	12 	13 	     14 	15"""

data.iloc[2, [3, 0, 1]] # ---> Series, 2 row, 3 columns
"""
four    11
one      8
two      9
Name: Utah, dtype: int64"""

data.iloc[[2, 3], [3, 0, 1]]
"""
 	       four 	one 	two
Utah 	    11 	     8 	     9
New York 	15 	     12 	13"""

# slice
data.loc[:'Utah', ['one', 'four']]
"""
 	    one 	four
Ohio 	 0 	     0
Colorado 0 	     7
Utah 	 8 	     11"""

# all rows and the first 3 columns ---> then only the rows where data.three > 5
data.iloc[:, :3][data.three > 5]
"""
 	       one 	two 	three
Colorado 	0 	 5 	     6
Utah 	    8 	 9 	     10
New York 	12 	13 	     14"""

#@@@@@@@@@@@@@@  PITFALLS OF INTEGER INDEXING @@@@@@@@@@@@@@

ser = pd.Series(np.arange(3.))
"""
0    0.0
1    1.0
2    2.0"""

# ser[-1] # ERROR! with an integer index, [] looks for the LABEL -1, which does not exist
ser.iloc[-1] # 2.0

ser2 = pd.Series(np.arange(3.), index = ['a', 'b', 'c'])
"""
a    0.0
b    1.0
c    2.0"""

# with a non-integer index, ser2[-1] falls back to positions; that fallback
# is deprecated since pandas 2.1, so ask for the position explicitly
ser2.iloc[-1] # 2.0
ser[:2] # an integer slice through [] selects by position here
"""
0    0.0
1    1.0"""

# SUMMARY: for indexing use LOC/ILOC

#@@@@@@@@@@@@@ PITFALLS OF CHAIN INDEXING @@@@@@@@@@@@@@@@@@@@@

data
"""
 	     one 	two 	three 	four
Ohio 	 0 	    0 	    0 	    0
Colorado 0 	    5 	    6 	    7
Utah 	 8 	    9     	10 	    11
New York 12 	13 	    14 	    15"""

data.loc[:, 'one'] = 1
data
"""
 	    one two three four
Ohio 	 1 	0 	0 	0
Colorado 1 	5 	6 	7
Utah 	 1 	9 	10 	11
New York 1 	13 	14 	15"""



'\n \t    one two three four\nOhio \t 1 \t0 \t0 \t0\nColorado 1 \t5 \t6 \t7\nUtah \t 1 \t9 \t10 \t11\nNew York 1 \t13 \t14 \t15'

# In [6]:
import pandas as pd
import numpy as np

#@@@@@@@@@@@@@@@@@ OPERATION BETWEEN DATAFRAME AND SERIES @@@@@@@@@@@@@@@@@@@

arr = np.arange(12.).reshape((3, 4))
"""array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])"""

arr[0] # array([0., 1., 2., 3.])

arr - arr[0]
"""
array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])"""


frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                    columns = list('bde'),
                    index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
"""
 	     b 	     d  	e
Utah 	0.0 	1.0 	2.0
Ohio 	3.0 	4.0 	5.0
Texas 	6.0 	7.0 	8.0
Oregon 	9.0 	10.0 	11.0"""

series = frame.iloc[0]
"""
b    0.0
d    1.0
e    2.0"""

# default: the Series index is matched against the COLUMNS and the operation is broadcast down the ROWS
frame - series
""" 	b 	     d 	     e
Utah 	0.0 	0.0 	0.0
Ohio 	3.0 	3.0 	3.0
Texas 	6.0 	6.0 	6.0
Oregon 	9.0 	9.0 	9.0"""

series2 = pd.Series(np.arange(3), index = ['b', 'e', 'f'])
# reindex result
frame + series2
"""

	       b 	d 	e 	f
Utah 	0.0 	NaN 	3.0 	NaN
Ohio 	3.0 	NaN 	6.0 	NaN
Texas 	6.0 	NaN 	9.0 	NaN
Oregon 	9.0 	NaN 	12.0 	NaN"""

# to match the Series index against the ROW labels and broadcast across the COLUMNS, use an arithmetic method with axis = 'index'
series3 = frame['b']
"""
Utah      0.0
Ohio      3.0
Texas     6.0
Oregon    9.0"""

frame.sub(series3, axis = 'index')
"""
 	    b 	    d 	    e
Utah 	0.0 	1.0 	2.0
Ohio 	0.0 	1.0 	2.0
Texas 	0.0 	1.0 	2.0
Oregon 	0.0 	1.0 	2.0"""


#@@@@@@@@@@@@@@@@@@ FUNCTION APPLICATION AND MAPPING @@@@@@@@@@@@@@@@@@

rng = np.random.default_rng(seed=12345)

frame = pd.DataFrame(rng.standard_normal((4, 3)),
                    columns = list('bde'),
                    index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
frame
"""
          b 	     d 	          e
Utah 	-1.423825 	1.263728 	-0.870662
Ohio 	-0.259173 	-0.075343 	-0.740885
Texas 	-1.367793 	0.648893 	0.361058
Oregon 	-1.952863 	2.347410 	0.968497"""

np.abs(frame) # all working !

# using function for COLUMNS
def f1(x):
    """Return the spread of ``x``: its largest value minus its smallest.

    ``x`` must provide ``max()`` and ``min()`` methods (e.g. a Series).
    """
    highest = x.max()
    lowest = x.min()
    return highest - lowest

frame.apply(f1)
"""
b    1.693690
d    2.422753
e    1.839159"""

# using function for ROWS (!)
frame.apply(f1, axis = 'columns')
"""
Utah      2.687553
Ohio      0.665541
Texas     2.016686
Oregon    4.300273"""

# return Series-object
def f2(x):
    """Return a two-element Series holding the minimum and maximum of ``x``.

    The result is labelled 'min' and 'max', in that order.
    """
    labels = ['min', 'max']
    extremes = [x.min(), x.max()]
    return pd.Series(extremes, index = labels)

frame.apply(f2)
"""
          b 	     d 	         e
min 	-1.952863 	-0.075343 	-0.870662
max 	-0.259173 	2.347410 	0.968497"""

def my_format(x):
    """Render ``x`` as a string with exactly two digits after the decimal point."""
    return format(x, ".2f")

# DataFrame.map() applies a function to every element (available since pandas 2.1, formerly applymap())
frame.map(my_format)
"""
         b 	    d 	     e
Utah 	-1.42 	1.26 	-0.87
Ohio 	-0.26 	-0.08 	-0.74
Texas 	-1.37 	0.65 	0.36
Oregon 	-1.95 	2.35 	0.97"""

# map() for Series
frame['e'].map(my_format)
"""
Utah      -0.87
Ohio      -0.74
Texas      0.36
Oregon     0.97"""


#@@@@@@@@@@@@@@@@@@@@@@@@@@ SORTING @@@@@@@@@@@@@@@@@@@@

obj = pd.Series(np.arange(4), index = ['d', 'a', 'b', 'c'])
obj.sort_index()

frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                    index = ['three', 'one'],
                    columns = ['d', 'a', 'c', 'b'])
"""
 	    d 	a 	c 	b
three 	0 	1 	2 	3
one 	4 	5 	6 	7"""

frame.sort_index() # by ROWS (by default)
"""
       d 	a 	c 	b
one 	4 	5 	6 	7
three 	0 	1 	2 	3"""

frame.sort_index(axis = "columns") # by COLUMNS

frame.sort_index(axis = "columns", ascending = False) # sort in descending order

# sorting by values
obj = pd.Series(data = [4, 7, -3, 2])
"""
0    4
1    7
2   -3
3    2"""

# ascending order is the default, spelled out here
obj.sort_values(ascending = True)
"""
2   -3
3    2
0    4
1    7"""

# missing values are placed at the end (this is the default position)
obj = pd.Series(data = [4, np.nan, 7, np.nan, -3, 2])
obj.sort_values(na_position = "last")
"""
4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN"""

# missing values placed at the beginning instead
obj.sort_values(na_position = "first")
"""
1    NaN
3    NaN
4   -3.0
5    2.0
0    4.0
2    7.0"""

frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
"""
    b 	a
0 	4 	0
1 	7 	1
2 	-3 	0
3 	2 	1"""

frame.sort_values('b')
"""
    b 	a
2 	-3 	0
3 	2 	1
0 	4 	0
1 	7 	1"""

# sort by several columns
frame.sort_values(['a', 'b'])
"""
    b 	a
2 	-3 	0
0 	4 	0
3 	2 	1
1 	7 	1"""

#@@@ RANK() @@@@

obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
"""
0    7
1   -5
2    7
3    4
4    2
5    0
6    4"""

obj.rank()
"""
0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5"""

obj.rank(method = 'first')
"""
0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0"""

obj.rank(ascending = False)
"""
0    1.5
1    7.0
2    1.5
3    3.5
4    5.0
5    6.0
6    3.5"""

frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],  'c': [-2, 5, 8, -2.5]})
"""
    b 	    a 	 c
0 	4.3 	0 	-2.0
1 	7.0 	1 	 5.0
2 	-3.0 	0 	 8.0
3 	2.0 	1 	-2.5"""

# rank by ROWS (!)
frame.rank(axis = 'columns')

# for understanding
obj.sort_values()
"""
1   -5
5    0
4    2
3    4
6    4
0    7
2    7"""

obj.sort_values().rank()
"""
1    1.0
5    2.0
4    3.0
3    4.5
6    4.5
0    6.5
2    6.5"""

obj.sort_values().rank(method = 'dense')
"""
1    1.0
5    2.0
4    3.0
3    4.0
6    4.0
0    6.0
2    6.0"""

#@@@@@@@@@@@ AXIS INDEXES WITH DUPLICATE LABELS @@@@@@@@@@@

obj = pd.Series(np.arange(5), index = ['a', 'a', 'b', 'b', 'c'])
obj.index.is_unique # False

obj['a'] # Series
"""
a    0
a    1"""

obj['c'] # scalar

df = pd.DataFrame(np.random.standard_normal((5, 3)),
                 index = ['a', 'a', 'b', 'b', 'c'])

df.loc['b']
"""
     0 	        1 	         2
b 	-0.335421 	0.666714 	-1.947580
b 	2.188999 	0.409918 	1.038869"""

df.loc['c'] # Series-object
"""
0   -0.191999
1    0.974275
2    0.993109"""


# Out:
# 0   -0.191999
# 1    0.974275
# 2    0.993109
# Name: c, dtype: float64

# In [44]:
import numpy as np
import pandas as pd

#@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 5.3. REDUCTION AND CALCULATION OF DESCRIPTIVE STATISTICS @@@@@@@@@@@@@@@@@

df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
                 index = ['a', 'b', 'c', 'd'], 
                 columns = ['one', 'two'])
"""
 	one 	two
a 	1.40 	NaN
b 	7.10 	-4.5
c 	NaN 	NaN
d 	0.75 	-1.3"""

# NaN values are skipped by default (i.e. treated as 0 in the sum)
df.sum() # Series-object
"""
one    9.25
two   -5.80"""

# sum() by ROWS
df.sum(axis = "columns")
"""
a    1.40
b    2.60
c    0.00
d   -0.55"""

# sum() by ROWS (also)
df.sum(axis = 1)

# skipna = False: NaN values are no longer skipped, so any NaN makes the result NaN
df.sum(axis = "index", skipna = False)
"""
one   NaN
two   NaN"""

df.sum(axis = "columns", skipna = False)
"""
a     NaN
b    2.60
c     NaN
d   -0.55"""

df.mean(axis = "columns")
"""
a    1.400
b    1.300
c      NaN
d   -0.275"""

# return the index label of the maximum value in each column (idxmin() for the minimum)
df.idxmax() # idxmin()
"""
one    b
two    d"""

# cumulative sum along each ROW (across the columns)
df.cumsum(axis = 1)
"""
 	one 	two
a 	1.40 	NaN
b 	7.10 	2.60
c 	NaN 	NaN
d 	0.75 	-0.55"""

# cumulative sum down each COLUMN (default axis)
df.cumsum()
"""
 	one 	two
a 	1.40 	NaN
b 	8.50 	-4.5
c 	NaN 	NaN
d 	9.25 	-5.8"""

df.describe()
"""
           one 	     two
count 	3.000000 	2.000000
mean 	3.083333 	-2.900000
std 	3.493685 	2.262742
min 	0.750000 	-4.500000
25% 	1.075000 	-3.700000
50% 	1.400000 	-2.900000
75% 	4.250000 	-2.100000
max 	7.100000 	-1.300000"""

obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
"""
0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c"""

obj.describe()
"""
count     16
unique     3
top        a
freq       8"""

#@@@@@@@@@@@@@@@@@@@ CORRELATION AND COVARIANCE @@@@@@@@@@@@@@@@

# Load the price and volume tables used by the correlation examples below.
# NOTE: both paths are absolute and specific to one machine; point them at the
# local copy of the pydata-book "examples" directory before running.
# NOTE(security): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
price = pd.read_pickle("/home/vk/Python_Source/pydata-book/examples/yahoo_price.pkl")
volume = pd.read_pickle("/home/vk/Python_Source/pydata-book/examples/yahoo_volume.pkl")

price
"""
               AAPL 	GOOG 	       IBM 	    MSFT
Date 				
2010-01-04 	27.990226 	313.062468 	113.304536 	25.884104
2010-01-05 	28.038618 	311.683844 	111.935822 	25.892466
2010-01-06 	27.592626 	303.826685 	111.208683 	25.733566
2010-01-07 	27.541619 	296.753749 	110.823732 	25.465944
2010-01-08 	27.724725 	300.709808 	111.935822 	25.641571
... 	... 	... 	... 	...
2016-10-17 	117.550003 	779.960022 	154.770004 	57.220001
2016-10-18 	117.470001 	795.260010 	150.720001 	57.660000
2016-10-19 	117.120003 	801.500000 	151.259995 	57.529999
2016-10-20 	117.059998 	796.969971 	151.520004 	57.250000
2016-10-21 	116.599998 	799.369995 	149.630005 	59.660000"""

results = price.pct_change()
results.tail()
"""
                AAPL 	GOOG 	       IBM 	      MSFT
Date 				
2016-10-17 	-0.000680 	0.001837 	0.002072 	-0.003483
2016-10-18 	-0.000681 	0.019616 	-0.026168 	0.007690
2016-10-19 	-0.002979 	0.007846 	0.003583 	-0.002255
2016-10-20 	-0.000512 	-0.005652 	0.001719 	-0.004867
2016-10-21 	-0.003930 	0.003011 	-0.012474 	0.042096"""

results['MSFT']
"""
2016-10-17   -0.003483
2016-10-18    0.007690
2016-10-19   -0.002255
2016-10-20   -0.004867
2016-10-21    0.042096"""

results['MSFT'].corr(results['IBM'])  # 0.4997636114415108

results.corr()
"""
 	       AAPL 	GOOG 	IBM 	MSFT
AAPL 	1.000000 	0.407919 	0.386817 	0.389695
GOOG 	0.407919 	1.000000 	0.405099 	0.465919
IBM 	0.386817 	0.405099 	1.000000 	0.499764
MSFT 	0.389695 	0.465919 	0.499764 	1.000000"""

results.cov()
"""
 	       AAPL 	GOOG 	IBM 	MSFT
AAPL 	0.000277 	0.000107 	0.000078 	0.000095
GOOG 	0.000107 	0.000251 	0.000078 	0.000108
IBM 	0.000078 	0.000078 	0.000146 	0.000089
MSFT 	0.000095 	0.000108 	0.000089 	0.000215"""

# correlation each column with series-object (column 'IBM')
results.corrwith(results['IBM'])
"""
AAPL    0.386817
GOOG    0.405099
IBM     1.000000
MSFT    0.499764"""

# by columns
results.corrwith(volume)

# by rows
results.corrwith(volume, axis = 1)
"""
Date
2010-01-04         NaN
2010-01-05    0.737298
2010-01-06    0.017069
2010-01-07    0.507614
2010-01-08   -0.779646
                ...   
2016-10-17   -0.881606
2016-10-18   -0.303369
2016-10-19   -0.970723
2016-10-20   -0.304414
2016-10-21    0.927824"""

#@@@@@@@@@@@@@@@@@@@ UNIQUE VALUES, VALUE COUNTERS AND MEMBERSHIP @@@@@@@@@@@

obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()
uniques # array(['c', 'a', 'd', 'b'], dtype=object)

# number of occurrences of each value, sorted by count (descending)
obj.value_counts()
"""
c    3
a    3
b    2
d    1"""

# the top-level pd.value_counts(obj.to_numpy(), sort = False) is deprecated;
# obj is already a Series, so the method is called on it directly
obj.value_counts(sort = False)
"""
c    3
a    3
d    1
b    2"""

# FILTER WITH 'isin'
obj
"""
0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c"""

mask = obj.isin(['b', 'c'])
mask
"""
0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True"""

obj[mask]
"""
0    c
5    b
6    b
7    c
8    c"""

#@@
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])
unique_vals = pd.Series(['c', 'b', 'a'])

pd.Index(unique_vals) # Index(['c', 'b', 'a'], dtype='object')
# c = 0, b = 1, a = 2 ---> for each element of to_match, get its position among the unique values
pd.Index(unique_vals).get_indexer(to_match) # array([0, 2, 1, 1, 0, 2])

#@@ COUNTERS
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                     'Qu2': [2, 3, 1, 2, 3],
                     'Qu3': [1, 5, 2, 4, 4]})
data
"""
   Qu1 	Qu2 Qu3
0 	1 	2 	1
1 	3 	3 	5
2 	4 	1 	2
3 	3 	2 	4
4 	4 	3 	4"""

data['Qu1'].value_counts().sort_index() # pair: value - count
"""
Qu1
1    1
3    2
4    2"""

# the top-level pd.value_counts function is deprecated (pandas 2.1), so the
# Series method is applied to each column instead; values that do not occur
# in a column come out as NaN and are replaced with 0
data.apply(pd.Series.value_counts).fillna(0) # pair: value - count by each column
"""
 	Qu1 	Qu2 	Qu3
1 	1.0 	1.0 	1.0
2 	0.0 	2.0 	1.0
3 	2.0 	2.0 	0.0
4 	2.0 	0.0 	2.0
5 	0.0 	0.0 	1.0"""

data = pd.DataFrame({'a': [1, 1, 1, 2, 2], 'b': [0, 0, 1, 0, 0]})
data
"""
    a 	b
0 	1 	0
1 	1 	0
2 	1 	1
3 	2 	0
4 	2 	0"""

data.value_counts()

# --- notebook output (residue of the cell above) ---
#   data.apply(pd.value_counts).fillna(0) # pair: value - count by each column
#
#
# a  b
# 1  0    2
# 2  0    2
# 1  1    1
# Name: count, dtype: int64