In [1]:
# Essential Functionality
#  Reindexing

In [1]:
from pandas import Series,DataFrame
import pandas as pd
import numpy as np

obj=Series([4.5,2.7,1.1,67.8],index=['d','b','a','c'])
obj

d     4.5
b     2.7
a     1.1
c    67.8
dtype: float64

In [2]:
obj.reindex(['a','b','c','d','e'])

a     1.1
b     2.7
c    67.8
d     4.5
e     NaN
dtype: float64

In [3]:
obj.reindex(['a','b','c','d','e'],fill_value=0)

a     1.1
b     2.7
c    67.8
d     4.5
e     0.0
dtype: float64

In [4]:
obj2=Series(['red','blue','yellow'],index=[0,2,4])
obj2.reindex(range(6), method='ffill')

0       red
1       red
2      blue
3      blue
4    yellow
5    yellow
dtype: object

In [5]:
frame1=DataFrame(np.arange(9).reshape(3,3),index=['a','c','d'],columns=['Ohio', 'Texas', 'California'])
frame1

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [6]:
frame1.reindex(['a','b','c','d'])

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [7]:
states=['Texas', 'Utah', 'California']
frame1.reindex(columns=states)
# As you’ll see soon, reindexing can be done more succinctly by label-indexing with ix:

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [8]:
frame1.ix[['a', 'b', 'c', 'd'], states] # not working

AttributeError: 'DataFrame' object has no attribute 'ix'

In [None]:
# Dropping entries from an axis
obj=Series(np.arange(5),index=["a","b","c","d","e"])
new_obj=obj.drop('c')
new_obj

In [None]:
obj.drop(['c','d'])

In [9]:
data=DataFrame(np.arange(16).reshape(4,4),index=['Ohio', 'Colorado', 'Utah', 'New York'],columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [10]:
data.drop(['Ohio'])

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [11]:
print(data.drop(['two'],axis=1))
data.drop(['two', 'four'], axis=1)

          one  three  four
Ohio        0      2     3
Colorado    4      6     7
Utah        8     10    11
New York   12     14    15


Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


In [12]:
# indexing, selection, and filtering

obj=Series(np.arange(4.),index=['a','b','c','d'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [13]:
print(obj[1])
print(obj['c'])
print(obj[2:4])

1.0
2.0
c    2.0
d    3.0
dtype: float64


In [14]:
print(obj[['a','b']])
print(obj[obj<2])

a    0.0
b    1.0
dtype: float64
a    0.0
b    1.0
dtype: float64


In [15]:
print(obj['b':'d'])
obj['b':'d']=5
obj

b    1.0
c    2.0
d    3.0
dtype: float64


a    0.0
b    5.0
c    5.0
d    5.0
dtype: float64

In [16]:
data=DataFrame(np.arange(16).reshape(4,4),index=['Ohio', 'Colorado', 'Utah', 'New York'],columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [17]:
print(data['two'])
print(data[['two','three']])

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32
          two  three
Ohio        1      2
Colorado    5      6
Utah        9     10
New York   13     14


In [18]:
print(data[2:4])
print(data[data['three']>5])

          one  two  three  four
Utah        8    9     10    11
New York   12   13     14    15
          one  two  three  four
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


In [19]:
print(data < 5)
data[data < 5] = 0
data

            one    two  three   four
Ohio       True   True   True   True
Colorado   True  False  False  False
Utah      False  False  False  False
New York  False  False  False  False


Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [20]:
data.ix['Colorado', ['two', 'three']] # ix not working

AttributeError: 'DataFrame' object has no attribute 'ix'

In [21]:
# Arithmetic and data alignment

s1=Series([7.3, -2.5, 3.4, 1.5],index=['a','c','d','e'])
s2 =Series([-2.1, 3.6, -1.5, 4, 3.1],index=['a', 'c', 'e', 'f', 'g'])
print(s2)

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64


In [22]:
 s1*s2 # The internal data alignment introduces NA values in the indices that don’t overlap.
 # Missing values propagate in arithmetic computations.

a   -15.33
c    -9.00
d      NaN
e    -2.25
f      NaN
g      NaN
dtype: float64

In [23]:
df1=DataFrame(np.arange(9).reshape((3,3)),columns=list('bcd'),index=['Ohio', 'Texas', 'Colorado'])
df2=DataFrame(np.arange(12).reshape((4,3)),columns=list('bde'),index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(df1)
print(df2)
df1+df2

          b  c  d
Ohio      0  1  2
Texas     3  4  5
Colorado  6  7  8
        b   d   e
Utah    0   1   2
Ohio    3   4   5
Texas   6   7   8
Oregon  9  10  11


Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [24]:
# Arithmetic methods with fill values (SUB-TOPIC)

df1=DataFrame(np.arange(12).reshape((3,4)),columns=list('abcd'))
df2=DataFrame(np.arange(20).reshape((4,5)),columns=list('abcde'))
df1+df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [25]:
df1.add(df2,fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [26]:
#  Relatedly, when reindexing a Series or DataFrame, you can also specify a different fill
df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,0
1,4,5,6,7,0
2,8,9,10,11,0


In [27]:
#Table 5-7. Flexible arithmetic methods
# Method Description
#  add Method for addition (+)
#  sub Method for subtraction (-)
#  div Method for division (/)
#  mul Method for multiplication (*)

In [28]:
# Operations between DataFrame and Series(SUB-TOPIC)
arr=np.arange(12).reshape((3,4))
print(arr)
arr[0]

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]


array([0, 1, 2, 3])

In [29]:
arr-arr[0] #This is referred to as broadcasting 
# Operations between a DataFrame and a Series are similar

array([[0, 0, 0, 0],
       [4, 4, 4, 4],
       [8, 8, 8, 8]])

In [30]:
series2 = Series(range(3), index=['b', 'e', 'f'])
frame + series2

NameError: name 'frame' is not defined

In [31]:
series3 = frame['d']
print(frame)
series3

NameError: name 'frame' is not defined

In [32]:
frame.sub(series3, axis=0)
# The axis number that you pass is the axis to match on. In this case we mean to match
# on the DataFrame’s row index and broadcast across.

NameError: name 'frame' is not defined

In [33]:
# Function application and mapping
import numpy as np
import pandas as pd

frame = pd.DataFrame(np.random.randn(4, 3), columns=list("bde"), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)

               b         d         e
Utah   -1.008549 -1.913869 -0.962713
Ohio   -2.873439  0.022412 -0.071981
Texas   2.771969 -0.321252 -0.334434
Oregon -1.181373 -0.592599  0.517629


In [34]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,1.008549,1.913869,0.962713
Ohio,2.873439,0.022412,0.071981
Texas,2.771969,0.321252,0.334434
Oregon,1.181373,0.592599,0.517629


In [35]:
# Another frequent operation is applying a function on 1D arrays to each column or row.
# DataFrame’s apply method does exactly this
f = lambda x: x.max() - x.min()

frame.apply(f)
frame.apply(f,axis=1)

Utah      0.951157
Ohio      2.895851
Texas     3.106403
Oregon    1.699002
dtype: float64

In [36]:
def f(x):
    return Series([x.min(),x.max()],index=['min','max'])

frame.apply(f)

Unnamed: 0,b,d,e
min,-2.873439,-1.913869,-0.962713
max,2.771969,0.022412,0.517629


In [37]:
format = lambda x: '%.2f' % x
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,-1.01,-1.91,-0.96
Ohio,-2.87,0.02,-0.07
Texas,2.77,-0.32,-0.33
Oregon,-1.18,-0.59,0.52


In [38]:
frame['e'].map(format)

Utah      -0.96
Ohio      -0.07
Texas     -0.33
Oregon     0.52
Name: e, dtype: object

In [39]:
# Sorting and ranking

In [50]:
obj=Series(range(4), index=['b','d','a','c'])
obj.sort_index()

a    2
b    0
c    3
d    1
dtype: int64

In [48]:
obj1=DataFrame(np.arange(8).reshape(2,4),index=['three','one'],columns=['a','b','d','c'])
print(obj1.sort_index())
print('')
print(obj1.sort_index(axis=1))
print('')
print(obj1.sort_index(axis=1,ascending=False))

       a  b  d  c
one    4  5  6  7
three  0  1  2  3

       a  b  c  d
three  0  1  3  2
one    4  5  7  6

       d  c  b  a
three  2  3  1  0
one    6  7  5  4


In [51]:
# To sort a Series by its values, use its order method

In [54]:
obj = Series([4, 7, -3, 2])
obj.order() #not working

AttributeError: 'Series' object has no attribute 'order'

In [64]:

# On DataFrame, you may want to sort by the values in one or more columns. To do so,
#  pass one or more column names to the "by" option
obj1=DataFrame({'a':[4,87,2,4],'b':[0,4,3,-5]})
obj1.sort_index(by='b')
obj1.sort_index(by=['a','b'])

# not working

SyntaxError: invalid syntax (3911551748.py, line 1)

In [None]:
# Ranking is closely related to sorting, assigning ranks from one through the number of
#  valid data points in an array. It is similar to the indirect sort indices produced by 
# numpy.argsort, except that ties are broken according to a rule. The rank methods for
#  Series and DataFrame are the place to look; by default rank breaks ties by assigning
#  each group the mean rank

In [65]:
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [66]:
# Ranks can also be assigned according to the order they’re observed in the data
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [67]:
# Naturally, you can rank in descending order, too
obj.rank(ascending=False,method='max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [73]:
frame = DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],'c': [-2, 5, 8, -2.5]})
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [75]:
frame.rank(axis=1)

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


In [None]:
# Methods      Description
# 'average' Default: assign the average rank to each entry in the equal group.
# 'min'     Use the minimum rank for the whole group.
# 'max'     Use the maximum rank for the whole group.
# 'first'   Assign ranks in the order the values appear in the data.

In [None]:
#  Axis indexes with duplicate values

# Up until now all of the examples I’ve showed you have had unique axis labels (index
#  values). While many pandas functions (like reindex) require that the labels be unique,
#  it’s not mandatory. Let’s consider a small Series with duplicate indices

In [84]:
obj=Series(range(5),index=['a', 'a', 'b', 'b', 'c'])
print(obj['a'])
print("")
print(obj.index.is_unique)

a    0
a    1
dtype: int64

False


In [91]:
frame=DataFrame(np.random.randn(12).reshape(4,3),index=['a', 'a', 'b', 'b'])
frame.index.is_unique

False