# In [2]:
import pandas as pd
import numpy as np

#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ REINDEX @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# Demonstrates Series.reindex(): the result is arranged to match the requested
# labels. The bare triple-quoted strings record the output of the statement
# that precedes them.

# source Series with an unsorted string index
obj = pd.Series([4.5, 7.2, -5.3,  3.6], index = ['d', 'b', 'a', 'c'])
"""
d    4.5
b    7.2
a   -5.3
c    3.6"""

# values are rearranged to follow the new label order; the label 'e' does not
# exist in obj, so it appears as NaN in the result
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e']) # copy of object
"""
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN"""

# Series whose integer index has gaps (1, 3 and 5 are missing)
obj3 = pd.Series(['blue', 'purple', 'yellow'], index = [0, 2, 4])
"""
0      blue
2    purple
4    yellow"""

# method='ffill' fills each missing label with the last preceding value
obj3.reindex(np.arange(6), method='ffill')
"""
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow"""

#@@@@ reindex DATA FRAME @@@@
# DataFrame.reindex() can change the row index, the columns, or both.

# 3x3 frame; note that the row label 'b' is absent
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                    columns=['Ohio', 'Texas', 'California'],
                    index=['a', 'c', 'd'])
"""Ohio 	Texas 	California
a 	0 	1 	2
c 	3 	4 	5
d 	6 	7 	8"""

# reindex rows in DF: the new row 'b' is filled with NaN
# (the recorded output shows the values as floats once NaN is introduced)
frame2 = frame.reindex(index = ['a', 'b', 'c', 'd'])
""" Ohio 	Texas 	California
a 	0.0 	1.0 	2.0
b 	NaN 	NaN 	NaN
c 	3.0 	4.0 	5.0
d 	6.0 	7.0 	8.0"""

# reindex columns in DF ('Ohio' is not requested, so it is dropped; 'Utah' is new, so it is added as NaN)
# method 1: the `columns` keyword
states = ['Texas', 'Utah', 'California']
frame.reindex(columns = states)

# method 2: positional labels plus `axis` (same result: 'Ohio' dropped, 'Utah' added)
frame.reindex(states, axis = 'columns')
"""Texas   Utah   California
a 	1 	   NaN 	   2
c 	4 	   NaN 	   5
d 	7 	   NaN 	   8"""

# the original frame is unchanged by the reindex calls above
frame
""" 	
  Ohio 	Texas 	California
a 	0 	1 	2
c 	3 	4 	5
d 	6 	7 	8"""

# reordering with 'loc' ----> every requested label must already exist in the source
# (the column 'Ohio' is simply not selected here)
frame.loc[['a', 'd', 'c'], ['California', 'Texas']]
"""
    California 	Texas
a 	2 	1
d 	8 	7
c 	5 	4"""

#@@@@@@@ DELETING ELEMENTS OF AXIS @@@@@@@@@@@
# drop() returns a new object without the listed labels; the recorded outputs
# below show that `data` keeps all of its rows and columns between calls.

obj = pd.Series(np.arange(5.), index = ['a', 'b', 'c', 'd', 'e'])
"""
a    0.0
b    1.0
c    2.0
d    3.0
e    4.0"""

# remove a single label from a Series
new_obj = obj.drop('c')
new_obj
"""
a    0.0
b    1.0
d    3.0
e    4.0"""

# 4x4 frame used for the row / column examples
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                   index = ['Ohio', 'Colorado', 'Utah', 'New York'],
                   columns = ['one', 'two', 'three', 'four'])
data
"""
         one 	two 	three 	four
Ohio 	 0 	      1 	  2 	3
Colorado 4 	      5 	  6 	7
Utah 	 8 	      9 	  10 	11
New York 12 	  13      14 	15"""

# delete rows (by row label)
data.drop(index = ['Colorado', 'Utah'])
""" 	one 	two 	three 	four
Ohio 	0 	1 	2 	3
New York 	12 	13 	14 	15"""

# delete a column, variant 1: the `columns` keyword
data.drop(columns = 'two')
"""  one 	three 	four
Ohio 	0 	2 	3
Colorado 	4 	6 	7
Utah 	8 	10 	11
New York 	12 	14 	15"""

# delete a column, variant 2: label plus axis = 1
data.drop('two', axis = 1)

# delete several columns, variant 3: list of labels plus axis = 'columns'
data.drop(['two', 'four'], axis = 'columns')
"""       one 	three
Ohio 	    0 	2
Colorado 	4 	6
Utah 	    8 	10
New York 	12 	14"""

#@@@@@@@@@@@@@@@@@@@@@@@ INDEX ACCESS @@@@@@@@@@@@
# Selecting Series elements with [], .loc (labels) and .iloc (integer positions).

obj = pd.Series(np.arange(4.), index = ['a', 'b', 'c', 'd'])
"""
a    0.0
b    1.0
c    2.0
d    3.0"""

obj['b'] # 1.0
#obj[1]  # 1.0
obj[2:4] # slicing by integer position
"""
c    2.0
d    3.0"""

# a list of labels selects (and reorders) several elements
obj[['b', 'a', 'd']]
"""
b    1.0
a    0.0
d    3.0"""

#obj[[1, 3]]
"""
b    1.0
d    3.0"""

# boolean mask: keep only the elements where the condition holds
obj[obj < 2]
"""
a    0.0
b    1.0"""

# prefer .loc like this for label-based indexing
obj.loc[['b', 'a', 'c']]
"""
b    1.0
a    0.0
c    2.0"""

# obj1 has INTEGER labels, obj2 has STRING labels
obj1 = pd.Series([1, 2, 3], index = [2, 0, 1])
"""
2    1
0    2
1    3"""
obj2 = pd.Series([1, 2, 3], index = ['a', 'b', 'c'])
"""
a    1
b    2
c    3"""

# with an integer index, [] treats the integers as labels, not positions
obj1[[0, 1, 2]]
"""
0    2
1    3
2    1"""

# obj2[[0, 1, 2]]
"""
a    1
b    2
c    3"""

# but here ERROR, because the index of obj2 holds no integers ---> 'loc' works only with labels
# obj2.loc[[0, 1, 2]] # ERROR !!!

obj2.iloc[[0, 1, 2]] # works: 'iloc' selects by integer position
"""
a    1
b    2
c    3"""

# obj2.iloc[['b', 'a']] # ERROR! 'iloc' does not accept labels

# !!! slicing by labels ---> the end label is INCLUDED, unlike ordinary Python slicing with integers
obj2.loc['b':'c']
"""
b    2
c    3"""

# assigning through a label slice modifies obj2 itself
obj2.loc['b':'c'] = 2
obj2
"""
a    1
b    2
c    2"""

#@@@@@@@@@ INDEXING A DATA FRAME WITH [] @@@@@@@@@
# The bare triple-quoted strings record the output of the statement that
# precedes them.

data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                   index = ['Ohio', 'Colorado', 'Utah', 'New York'],
                   columns = ['one', 'two', 'three', 'four'])
data
"""
        one 	two 	three 	four
Ohio 	 0 	     1  	2 	    3
Colorado 4 	     5  	6 	    7
Utah 	 8 	     9 	    10 	    11
New York 12 	13 	    14 	    15"""

# a single column label ---> Series object
data['two']
"""
Ohio         1
Colorado     5
Utah         9
New York    13"""

# a list of column labels ---> DataFrame with the columns in the requested order
data[['three', 'one']]
"""
 	    three 	one
Ohio 	 2 	    0
Colorado 6 	    4
Utah 	 10 	8
New York 14 	12"""

# slicing ROWS: a slice inside [] works on rows, not columns
data[:2]
"""
 	       one 	two 	three 	four
Ohio 	    0 	1 	      2 	3
Colorado 	4 	5 	      6 	7"""

# selecting ROWS by condition (boolean Series used as a mask)
cond = data['three'] > 5
"""
Ohio        False
Colorado     True
Utah         True
New York     True"""

data[cond]
"""
 	        one 	two 	three 	four
Colorado 	4 	     5 	      6 	7
Utah 	    8 	     9 	      10 	11
New York 	12 	     13 	  14 	15"""

# select COLUMNS, then slice ROWS of the intermediate result
data[['one', 'two']][1:]
"""
 	       one 	two
Colorado 	4 	5
Utah 	    8 	9
New York 	12 	13"""

# bool DataFrame
data < 5
"""
 	     one 	two 	three 	four
Ohio 	 True 	True 	True 	True
Colorado True 	False 	False 	False
Utah 	 False 	False 	False 	False
New York False 	False 	False 	False"""

# modify DF through bool DF: every element smaller than 5 becomes 0 (in place)
data[data < 5] = 0
data
"""
 	     one 	two 	three 	four
Ohio 	 0 	    0 	    0 	    0
Colorado 0 	    5 	    6 	    7
Utah 	 8 	    9     	10 	    11
New York 12 	13 	    14 	    15"""

#@@@@@@@@@ SELECTION FROM DATA FRAME WITH LOC / ILOC @@@@@@@@@@@@@@@@

# NOTE: data still carries the `data[data < 5] = 0` assignment made above,
# so the first row and Colorado's 'one' are 0 in every output below
data
"""
 	     one 	two 	three 	four
Ohio 	 0 	    0 	    0 	    0
Colorado 0 	    5 	    6 	    7
Utah 	 8 	    9     	10 	    11
New York 12 	13 	    14 	    15"""

# a single row label ---> Series object from DF
data.loc['Colorado']
"""
one      0
two      5
three    6
four     7
Name: Colorado, dtype: int64"""

# a list of row labels ---> new DF-object from DF
data.loc[['Colorado', 'Utah']]
""" 	    one 	two 	three 	four
Colorado 	0 	     5 	      6 	7
Utah 	    8 	     9 	      10 	11"""

# selecting rows and columns simultaneously
data.loc[['Colorado', 'Ohio'], ['two', 'three']]
"""
 	        two 	three
Colorado 	5 	     6
Ohio 	    0 	     0"""

# selecting by integer positions
data.iloc[2] #          ---> Series, one full row
"""
one       8
two       9
three    10
four     11
Name: Utah, dtype: int64"""

data.iloc[[2, 3]] #      ---> DataFrame, two full rows
"""
 	       one 	two 	three 	four
Utah 	    8 	 9 	     10 	11
New York 	12 	13 	     14 	15"""

data.iloc[2, [3, 0, 1]] # ---> Series, the row at position 2, three columns
"""
four    11
one      8
two      9
Name: Utah, dtype: int64"""

data.iloc[[2, 3], [3, 0, 1]] # ---> DataFrame, two rows, three columns
"""
 	       four 	one 	two
Utah 	    11 	     8 	     9
New York 	15 	     12 	13"""

# label slice: the end label 'Utah' is included
data.loc[:'Utah', ['one', 'four']]
"""
 	    one 	four
Ohio 	 0 	     0
Colorado 0 	     7
Utah 	 8 	     11"""

# first three columns by position, then only the rows where 'three' > 5
data.iloc[:, :3][data.three > 5]
"""
 	       one 	two 	three
Colorado 	0 	 5 	     6
Utah 	    8 	 9 	     10
New York 	12 	13 	     14"""

#@@@@@@@@@@@@@@  PITFALLS OF INTEGER INDEXING @@@@@@@@@@@@@@
# With an integer index, [] is ambiguous between "label" and "position".

# Series with the default integer index 0..2
ser = pd.Series(np.arange(3.))
"""
0    0.0
1    1.0
2    2.0"""

# ser[-1] # ERROR! because it is unclear whether we mean a position or a label
ser.iloc[-1] # 2.0 (iloc is always positional, so -1 is the last element)

# the same values with a string index
ser2 = pd.Series(np.arange(3.), index = ['a', 'b', 'c'])
"""
a    0.0
b    1.0
c    2.0"""

#ser2[-1] # 2.0
# an integer slice inside [] selects by position
ser[:2]
"""
0    0.0
1    1.0"""

# SUMMARY: for indexing use LOC (labels) / ILOC (integer positions)

#@@@@@@@@@@@@@ PITFALLS OF CHAIN INDEXING @@@@@@@@@@@@@@@@@@@@@
# Assignment is done with ONE .loc call (rows and column given together)
# instead of chaining two [] lookups.

# current state of data (values below 5 were set to 0 earlier in this file)
data
"""
 	     one 	two 	three 	four
Ohio 	 0 	    0 	    0 	    0
Colorado 0 	    5 	    6 	    7
Utah 	 8 	    9     	10 	    11
New York 12 	13 	    14 	    15"""

# all rows (:) of column 'one' are set to 1, directly on data
data.loc[:, 'one'] = 1
data
"""
 	    one two three four
Ohio 	 1 	0 	0 	0
Colorado 1 	5 	6 	7
Utah 	 1 	9 	10 	11
New York 1 	13 	14 	15"""



'\n \t    one two three four\nOhio \t 1 \t0 \t0 \t0\nColorado 1 \t5 \t6 \t7\nUtah \t 1 \t9 \t10 \t11\nNew York 1 \t13 \t14 \t15'

# In [39]:
import pandas as pd
import numpy as np

#@@@@@@@@@@@@@@@@@ OPERATION BETWEEN DATAFRAME AND SERIES @@@@@@@@@@@@@@@@@@@
# Arithmetic between a 2-D object and a 1-D object is broadcast: the 1-D
# operand is applied to every row (or, with axis = 'index', every column).

# NumPy analogy first: a 3x4 array minus its first row
arr = np.arange(12.).reshape((3, 4))
"""array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])"""

arr[0] # array([0., 1., 2., 3.])

# the first row is subtracted from every row
arr - arr[0]
"""
array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])"""


frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                    columns = list('bde'),
                    index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
"""
 	     b 	     d  	e
Utah 	0.0 	1.0 	2.0
Ohio 	3.0 	4.0 	5.0
Texas 	6.0 	7.0 	8.0
Oregon 	9.0 	10.0 	11.0"""

# first row of the frame as a Series indexed by the column labels
series = frame.iloc[0]
"""
b    0.0
d    1.0
e    2.0"""

# default: the Series index is matched against the frame COLUMNS and the
# operation is repeated for every ROW
frame - series
""" 	b 	     d 	     e
Utah 	0.0 	0.0 	0.0
Ohio 	3.0 	3.0 	3.0
Texas 	6.0 	6.0 	6.0
Oregon 	9.0 	9.0 	9.0"""

series2 = pd.Series(np.arange(3), index = ['b', 'e', 'f'])
# the result is reindexed to the union of labels; labels present on only one
# side ('d' and 'f') give NaN
frame + series2
"""

	       b 	d 	e 	f
Utah 	0.0 	NaN 	3.0 	NaN
Ohio 	3.0 	NaN 	6.0 	NaN
Texas 	6.0 	NaN 	9.0 	NaN
Oregon 	9.0 	NaN 	12.0 	NaN"""

# to match the Series against the frame ROWS and repeat the operation for
# every COLUMN, use the arithmetic method with axis = 'index'
series3 = frame['b']
"""
Utah      0.0
Ohio      3.0
Texas     6.0
Oregon    9.0"""

# column 'b' is subtracted from every column
frame.sub(series3, axis = 'index')
"""
 	    b 	    d 	    e
Utah 	0.0 	1.0 	2.0
Ohio 	0.0 	1.0 	2.0
Texas 	0.0 	1.0 	2.0
Oregon 	0.0 	1.0 	2.0"""


#@@@@@@@@@@@@@@@@@@ FUNCTIONS AND DISPLAY @@@@@@@@@@@@@@@@@@
# Applying functions to a DataFrame: element-wise, per column and per row.

# fixed seed so the random values below are reproducible
rng = np.random.default_rng(seed=12345)

# 4x3 frame of standard-normal random numbers
frame = pd.DataFrame(rng.standard_normal((4, 3)),
                    columns = list('bde'),
                    index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
frame
"""
          b 	     d 	          e
Utah 	-1.423825 	1.263728 	-0.870662
Ohio 	-0.259173 	-0.075343 	-0.740885
Texas 	-1.367793 	0.648893 	0.361058
Oregon 	-1.952863 	2.347410 	0.968497"""

np.abs(frame) # NumPy element-wise functions work directly on a DataFrame

# using function for COLUMNS
def f1(x):
    """Return the spread of *x*: its largest value minus its smallest."""
    highest = x.max()
    lowest = x.min()
    return highest - lowest

# default axis: f1 receives each COLUMN, giving one result per column label
frame.apply(f1)
"""
b    1.693690
d    2.422753
e    1.839159"""

# axis = 'columns': f1 receives each ROW (!), giving one result per row label
frame.apply(f1, axis = 'columns')
"""
Utah      2.687553
Ohio      0.665541
Texas     2.016686
Oregon    4.300273"""

# return Series-object
def f2(x):
    """Return the minimum and maximum of *x* as a Series labelled 'min' and 'max'."""
    lowest = x.min()
    highest = x.max()
    return pd.Series(data=[lowest, highest], index=['min', 'max'])

# because f2 returns a Series, apply() assembles the per-column results into
# a DataFrame whose rows are f2's labels ('min', 'max')
frame.apply(f2)
"""
          b 	     d 	         e
min 	-1.952863 	-0.075343 	-0.870662
max 	-0.259173 	2.347410 	0.968497"""

def my_format(x):
    """Return *x* rendered as a string with two digits after the decimal point."""
    return format(x, ".2f")

# DataFrame.map() is a pandas method: it calls the function on every single
# element of the frame (here each number becomes a formatted string)
frame.map(my_format)
"""
         b 	    d 	     e
Utah 	-1.42 	1.26 	-0.87
Ohio 	-0.26 	-0.08 	-0.74
Texas 	-1.37 	0.65 	0.36
Oregon 	-1.95 	2.35 	0.97"""

# Series.map() does the same for every element of a single column
frame['e'].map(my_format)
"""
Utah      -0.87
Ohio      -0.74
Texas      0.36
Oregon     0.97"""


#@@@@@@@@@@@@@@@@@@@@@@@@@@ SORTING @@@@@@@@@@@@@@@@@@@@
# sort_index() orders by labels, sort_values() orders by the data itself.

# sorting a Series by its index labels
obj = pd.Series(np.arange(4), index = ['d', 'a', 'b', 'c'])
obj.sort_index()

frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                    index = ['three', 'one'],
                    columns = ['d', 'a', 'c', 'b'])
"""
 	    d 	a 	c 	b
three 	0 	1 	2 	3
one 	4 	5 	6 	7"""

frame.sort_index() # by ROW labels (the default)
"""
       d 	a 	c 	b
one 	4 	5 	6 	7
three 	0 	1 	2 	3"""

frame.sort_index(axis = "columns") # by COLUMN labels

frame.sort_index(axis = "columns", ascending = False) # column labels in descending order

# sorting by values
obj = pd.Series([4, 7, -3, 2])
"""
0    4
1    7
2   -3
3    2"""

# ascending by value; every element keeps its original index label
obj.sort_values() 
"""
2   -3
3    2
0    4
1    7"""

# NaN at the end (by default)
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()
"""
4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN"""

# NaN at the beginning
obj.sort_values(na_position = "first")
"""
1    NaN
3    NaN
4   -3.0
5    2.0
0    4.0
2    7.0"""

frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
"""
    b 	a
0 	4 	0
1 	7 	1
2 	-3 	0
3 	2 	1"""

# sort the rows by the values of a single column
frame.sort_values('b')
"""
    b 	a
2 	-3 	0
3 	2 	1
0 	4 	0
1 	7 	1"""

# sort by several columns: first by 'a', ties broken by 'b'
frame.sort_values(['a', 'b'])
"""
    b 	a
2 	-3 	0
0 	4 	0
3 	2 	1
1 	7 	1"""

#@@@ RANK() @@@@
# rank() gives every value its position (starting at 1) in sorted order;
# the `method` argument decides how tied values are ranked.
# The bare triple-quoted strings record the output of the preceding statement.

obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
"""
0    7
1   -5
2    7
3    4
4    2
5    0
6    4"""

# default: tied values share the average of the ranks they occupy
obj.rank()
"""
0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5"""

# method = 'first': ties are ranked in the order they appear in the data
obj.rank(method = 'first')
"""
0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0"""

# descending: the largest value gets rank 1
obj.rank(ascending = False)
"""
0    1.5
1    7.0
2    1.5
3    3.5
4    5.0
5    6.0
6    3.5"""

frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],  'c': [-2, 5, 8, -2.5]})
"""
    b 	    a 	 c
0 	4.3 	0 	-2.0
1 	7.0 	1 	 5.0
2 	-3.0 	0 	 8.0
3 	2.0 	1 	-2.5"""

# rank within each ROW (!): the values of one row are compared across the columns
frame.rank(axis = 'columns')

# for understanding: sort first, then the ranks read top to bottom
obj.sort_values()
"""
1   -5
5    0
4    2
3    4
6    4
0    7
2    7"""

obj.sort_values().rank()
"""
1    1.0
5    2.0
4    3.0
3    4.5
6    4.5
0    6.5
2    6.5"""

# method = 'dense': ties share the lowest rank and the next distinct value gets
# the very next rank, so no ranks are skipped
# (method = 'min' would instead give 1, 2, 3, 4, 4, 6, 6 -- a gap after each tie)
obj.sort_values().rank(method = 'dense')
"""
1    1.0
5    2.0
4    3.0
3    4.0
6    4.0
0    5.0
2    5.0"""


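# Leftover notebook cell output (stale): these ranks, with a gap after each
# tie, are what rank(method='min') produces; rank(method='dense') yields 5.0
# for the last two rows.
# 1    1.0
# 5    2.0
# 4    3.0
# 3    4.0
# 6    4.0
# 0    6.0
# 2    6.0
# dtype: float64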