# Data Structures

## Series

In [1]:
import pandas as pd

In [2]:
sr = pd.Series([1, 4 ,2, 5, -2])

In [3]:
sr

0    1
1    4
2    2
3    5
4   -2
dtype: int64

In [4]:
sr.array

<NumpyExtensionArray>
[1, 4, 2, 5, -2]
Length: 5, dtype: int64

In [5]:
sr.index

RangeIndex(start=0, stop=5, step=1)

In [6]:
obj = pd.Series([1, 4, 2, 5, -3, 0], index= ['A', 'B', 'C', 'D', 'E', 'F'])

In [7]:
obj

A    1
B    4
C    2
D    5
E   -3
F    0
dtype: int64

In [8]:
sr[1]

4

In [9]:
obj['C']

2

In [10]:
obj[['A', 'B', 'E']]

A    1
B    4
E   -3
dtype: int64

In [11]:
obj + 5

A     6
B     9
C     7
D    10
E     2
F     5
dtype: int64

In [12]:
obj[obj > 2]

B    4
D    5
dtype: int64

In [13]:
import numpy as np

In [14]:
np.sqrt(obj)

  result = getattr(ufunc, method)(*inputs, **kwargs)


A    1.000000
B    2.000000
C    1.414214
D    2.236068
E         NaN
F    0.000000
dtype: float64

In [15]:
2 in obj

False

In [16]:
4 in obj

False

In [17]:
'A' in obj

True

In [18]:
dict_data = {"ohio": 35000, "texas": 71000, "oregon": 16000, "utah": 5000}

In [19]:
obj2 = pd.Series(dict_data)

In [20]:
obj2

ohio      35000
texas     71000
oregon    16000
utah       5000
dtype: int64

In [21]:
obj3 = pd.Series(dict_data, index= ["california", 'ohio', 'oregon', 'texas'])

In [22]:
obj3

california        NaN
ohio          35000.0
oregon        16000.0
texas         71000.0
dtype: float64

In [23]:
obj3.isna()

california     True
ohio          False
oregon        False
texas         False
dtype: bool

In [24]:
pd.isna(obj3)

california     True
ohio          False
oregon        False
texas         False
dtype: bool

In [25]:
obj3.notna()

california    False
ohio           True
oregon         True
texas          True
dtype: bool

In [26]:
obj2 + obj3

california         NaN
ohio           70000.0
oregon         32000.0
texas         142000.0
utah               NaN
dtype: float64

In [27]:
obj2 - obj3

california    NaN
ohio          0.0
oregon        0.0
texas         0.0
utah          NaN
dtype: float64

In [28]:
obj3.name = 'State'
obj3.index.name = 'Population'

In [29]:
obj3

Population
california        NaN
ohio          35000.0
oregon        16000.0
texas         71000.0
Name: State, dtype: float64

In [30]:
sr

0    1
1    4
2    2
3    5
4   -2
dtype: int64

In [31]:
sr.index = ['a', 'b', 'c', 'd', 'e']

In [32]:
sr

a    1
b    4
c    2
d    5
e   -2
dtype: int64

## DataFrame

In [33]:
data = {
    "state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada", "Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
    "year": [2000, 2001, 2002, 2001, 2002, 2003, 2000, 2001, 2002, 2001, 2002, 2003],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2, 1.5, 1.7, 3.6, 2.4, 2.9, 3.2]
}

In [34]:
df = pd.DataFrame(data)

In [35]:
df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2
6,Ohio,2000,1.5
7,Ohio,2001,1.7
8,Ohio,2002,3.6
9,Nevada,2001,2.4


In [36]:
df.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [37]:
df.tail()

Unnamed: 0,state,year,pop
7,Ohio,2001,1.7
8,Ohio,2002,3.6
9,Nevada,2001,2.4
10,Nevada,2002,2.9
11,Nevada,2003,3.2


In [38]:
pd.DataFrame(data, columns= ['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2
6,2000,Ohio,1.5
7,2001,Ohio,1.7
8,2002,Ohio,3.6
9,2001,Nevada,2.4


In [39]:
df.columns

Index(['state', 'year', 'pop'], dtype='object')

In [40]:
df2 = pd.DataFrame(data, columns= ['state', 'year', 'pop', 'debt'])

In [41]:
df2

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,
1,Ohio,2001,1.7,
2,Ohio,2002,3.6,
3,Nevada,2001,2.4,
4,Nevada,2002,2.9,
5,Nevada,2003,3.2,
6,Ohio,2000,1.5,
7,Ohio,2001,1.7,
8,Ohio,2002,3.6,
9,Nevada,2001,2.4,


In [42]:
df.year

0     2000
1     2001
2     2002
3     2001
4     2002
5     2003
6     2000
7     2001
8     2002
9     2001
10    2002
11    2003
Name: year, dtype: int64

In [43]:
df['year']

0     2000
1     2001
2     2002
3     2001
4     2002
5     2003
6     2000
7     2001
8     2002
9     2001
10    2002
11    2003
Name: year, dtype: int64

In [44]:
df.loc[1]

state    Ohio
year     2001
pop       1.7
Name: 1, dtype: object

In [45]:
df.iloc[3]

state    Nevada
year       2001
pop         2.4
Name: 3, dtype: object

In [46]:
df2.debt = 16.24

In [47]:
df2

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,16.24
1,Ohio,2001,1.7,16.24
2,Ohio,2002,3.6,16.24
3,Nevada,2001,2.4,16.24
4,Nevada,2002,2.9,16.24
5,Nevada,2003,3.2,16.24
6,Ohio,2000,1.5,16.24
7,Ohio,2001,1.7,16.24
8,Ohio,2002,3.6,16.24
9,Nevada,2001,2.4,16.24


In [48]:
df2.loc[11, 'debt'] = 11

In [49]:
df2

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,16.24
1,Ohio,2001,1.7,16.24
2,Ohio,2002,3.6,16.24
3,Nevada,2001,2.4,16.24
4,Nevada,2002,2.9,16.24
5,Nevada,2003,3.2,16.24
6,Ohio,2000,1.5,16.24
7,Ohio,2001,1.7,16.24
8,Ohio,2002,3.6,16.24
9,Nevada,2001,2.4,16.24


In [50]:
df2.debt = np.random.standard_normal(12)

In [51]:
df2

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,-0.139811
1,Ohio,2001,1.7,0.129747
2,Ohio,2002,3.6,0.260124
3,Nevada,2001,2.4,-0.352505
4,Nevada,2002,2.9,-0.474098
5,Nevada,2003,3.2,2.296398
6,Ohio,2000,1.5,1.464099
7,Ohio,2001,1.7,-0.045526
8,Ohio,2002,3.6,-2.821012
9,Nevada,2001,2.4,-1.561392


In [52]:
df2['val'] = 2

In [53]:
df2

Unnamed: 0,state,year,pop,debt,val
0,Ohio,2000,1.5,-0.139811,2
1,Ohio,2001,1.7,0.129747,2
2,Ohio,2002,3.6,0.260124,2
3,Nevada,2001,2.4,-0.352505,2
4,Nevada,2002,2.9,-0.474098,2
5,Nevada,2003,3.2,2.296398,2
6,Ohio,2000,1.5,1.464099,2
7,Ohio,2001,1.7,-0.045526,2
8,Ohio,2002,3.6,-2.821012,2
9,Nevada,2001,2.4,-1.561392,2


In [54]:
del df2['val']

In [55]:
df2

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,-0.139811
1,Ohio,2001,1.7,0.129747
2,Ohio,2002,3.6,0.260124
3,Nevada,2001,2.4,-0.352505
4,Nevada,2002,2.9,-0.474098
5,Nevada,2003,3.2,2.296398
6,Ohio,2000,1.5,1.464099
7,Ohio,2001,1.7,-0.045526
8,Ohio,2002,3.6,-2.821012
9,Nevada,2001,2.4,-1.561392


In [56]:
df2.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
state,Ohio,Ohio,Ohio,Nevada,Nevada,Nevada,Ohio,Ohio,Ohio,Nevada,Nevada,Nevada
year,2000,2001,2002,2001,2002,2003,2000,2001,2002,2001,2002,2003
pop,1.5,1.7,3.6,2.4,2.9,3.2,1.5,1.7,3.6,2.4,2.9,3.2
debt,-0.139811,0.129747,0.260124,-0.352505,-0.474098,2.296398,1.464099,-0.045526,-2.821012,-1.561392,-0.909285,1.466267


In [57]:
population = {
    'ohio': {2000: 1.5, 2001: 3.4, 2002: 3.6},
    'texas': {2001: 2.7, 2002: 3.1}
}

In [58]:
df3 = pd.DataFrame(population, index= [2000, 2001, 2002])

In [59]:
df3

Unnamed: 0,ohio,texas
2000,1.5,
2001,3.4,2.7
2002,3.6,3.1


In [60]:
df3.index.name = 'year'

In [61]:
df3

Unnamed: 0_level_0,ohio,texas
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,3.4,2.7
2002,3.6,3.1


In [62]:
df3.columns.name = 'states'

In [63]:
df3

states,ohio,texas
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,3.4,2.7
2002,3.6,3.1


In [64]:
df3.to_dict()

{'ohio': {2000: 1.5, 2001: 3.4, 2002: 3.6},
 'texas': {2000: nan, 2001: 2.7, 2002: 3.1}}

In [65]:
df3.to_numpy()

array([[1.5, nan],
       [3.4, 2.7],
       [3.6, 3.1]])

In [66]:
df2.to_numpy()

array([['Ohio', 2000, 1.5, -0.13981088886029305],
       ['Ohio', 2001, 1.7, 0.12974674692675572],
       ['Ohio', 2002, 3.6, 0.26012353110926073],
       ['Nevada', 2001, 2.4, -0.35250469697983855],
       ['Nevada', 2002, 2.9, -0.4740976618734413],
       ['Nevada', 2003, 3.2, 2.2963984545687066],
       ['Ohio', 2000, 1.5, 1.4640989745662447],
       ['Ohio', 2001, 1.7, -0.04552645576886454],
       ['Ohio', 2002, 3.6, -2.821011681355999],
       ['Nevada', 2001, 2.4, -1.561391536196985],
       ['Nevada', 2002, 2.9, -0.9092854153518424],
       ['Nevada', 2003, 3.2, 1.4662672523385807]], dtype=object)

## Index 

In [67]:
labels = pd.Index(np.arange(4))

In [68]:
labels

Index([0, 1, 2, 3], dtype='int32')

In [69]:
#labels[1] = 'a'     #TypeError: Index does not support mutable operations

In [70]:
df3

states,ohio,texas
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,3.4,2.7
2002,3.6,3.1


In [71]:
'ohio' in df3.columns

True

In [72]:
2004 in df3.index

False

In [73]:
lable2 = pd.Index(np.arange(2, 8))

In [74]:
labels.append(lable2)

Index([0, 1, 2, 3, 2, 3, 4, 5, 6, 7], dtype='int32')

In [75]:
labels.difference(lable2)

Index([0, 1], dtype='int32')

In [76]:
labels.union(lable2)

Index([0, 1, 2, 3, 4, 5, 6, 7], dtype='int32')

In [77]:
labels.intersection(lable2)

Index([2, 3], dtype='int32')

In [78]:
labels.isin(lable2)

array([False, False,  True,  True])

In [79]:
labels.insert?

[1;31mSignature:[0m [0mlabels[0m[1;33m.[0m[0minsert[0m[1;33m([0m[0mloc[0m[1;33m:[0m [1;34m'int'[0m[1;33m,[0m [0mitem[0m[1;33m)[0m [1;33m->[0m [1;34m'Index'[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Make new Index inserting new item at location.

Follows Python numpy.insert semantics for negative values.

Parameters
----------
loc : int
item : object

Returns
-------
Index

Examples
--------
>>> idx = pd.Index(['a', 'b', 'c'])
>>> idx.insert(1, 'x')
Index(['a', 'x', 'b', 'c'], dtype='object')
[1;31mFile:[0m      c:\users\harshit\appdata\local\programs\python\python312\lib\site-packages\pandas\core\indexes\base.py
[1;31mType:[0m      method

In [80]:
labels.insert(0, -1)

Index([-1, 0, 1, 2, 3], dtype='int32')

In [81]:
labels.insert(4, 8)

Index([0, 1, 2, 3, 8], dtype='int32')

In [82]:
labels.is_unique

True

In [83]:
labels.unique()

Index([0, 1, 2, 3], dtype='int32')

# Essential Functionalities

In [84]:
obj = pd.Series([6, 5 ,4, 3], index= ['b', 'd', 'a', 'c'])

In [85]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])

In [86]:
obj2

a    4.0
b    6.0
c    3.0
d    5.0
e    NaN
dtype: float64

In [87]:
obj3 = pd.Series(['blue', 'green', 'red'])

In [88]:
obj3

0     blue
1    green
2      red
dtype: object

In [89]:
obj4 = obj3.reindex(np.arange(6), method= 'ffill')

In [90]:
obj4

0     blue
1    green
2      red
3      red
4      red
5      red
dtype: object

In [91]:
obj4 = obj3.reindex(np.arange(9), method= 'bfill')

In [92]:
obj4

0     blue
1    green
2      red
3      NaN
4      NaN
5      NaN
6      NaN
7      NaN
8      NaN
dtype: object

In [93]:
frame = df3.reindex(index= [2000, 2001, 2002, 2003], columns= ['california', 'texas'])

In [94]:
frame

states,california,texas
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,
2001,,2.7
2002,,3.1
2003,,


In [95]:
frame.reindex(['california', 'texas', 'ohio'], axis= 'columns')

states,california,texas,ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,,,
2001,,2.7,
2002,,3.1,
2003,,,


In [96]:
frame.reindex?

[1;31mSignature:[0m
[0mframe[0m[1;33m.[0m[0mreindex[0m[1;33m([0m[1;33m
[0m    [0mlabels[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mindex[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mcolumns[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0maxis[0m[1;33m:[0m [1;34m'Axis | None'[0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmethod[0m[1;33m:[0m [1;34m'ReindexMethod | None'[0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mcopy[0m[1;33m:[0m [1;34m'bool | None'[0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mlevel[0m[1;33m:[0m [1;34m'Level | None'[0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mfill_value[0m[1;33m:[0m [1;34m'Scalar | None'[0m [1;33m=[0m [0mnan[0m[1;33m,[0m[1;33m
[0m    [0mlimit[0m[1;33m:[0m [1;34m'int | None'[0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mtolera

## Dropping Entries from an Axis

In [97]:
obj = pd.Series(np.arange(5), index= [chr(i) for i in range(65, 70)])

In [98]:
obj

A    0
B    1
C    2
D    3
E    4
dtype: int32

In [99]:
obj.drop('A')     #returns the object doesn't change the actual object

B    1
C    2
D    3
E    4
dtype: int32

In [100]:
obj.drop(['D', 'E'])

A    0
B    1
C    2
dtype: int32

In [101]:
new_object = obj.drop('A')

In [102]:
new_object

B    1
C    2
D    3
E    4
dtype: int32

In [103]:
data = pd.DataFrame(np.arange(16).reshape(4, 4), index= ['ohio', 'colorado', 'texas', 'new york'], columns= ['one', 'two', 'three', 'four'])

In [104]:
data

Unnamed: 0,one,two,three,four
ohio,0,1,2,3
colorado,4,5,6,7
texas,8,9,10,11
new york,12,13,14,15


In [105]:
data.drop(index= 'ohio')

Unnamed: 0,one,two,three,four
colorado,4,5,6,7
texas,8,9,10,11
new york,12,13,14,15


In [106]:
data.drop(columns= ['one', 'two'])

Unnamed: 0,three,four
ohio,2,3
colorado,6,7
texas,10,11
new york,14,15


In [107]:
data.drop('ohio', axis= 0)

Unnamed: 0,one,two,three,four
colorado,4,5,6,7
texas,8,9,10,11
new york,12,13,14,15


In [108]:
data.drop('one', axis= 1)

Unnamed: 0,two,three,four
ohio,1,2,3
colorado,5,6,7
texas,9,10,11
new york,13,14,15


## Indexing, Selection, and Filtering

In [109]:
obj

A    0
B    1
C    2
D    3
E    4
dtype: int32

In [110]:
# obj[0] 
# FutureWarning: Series.__getitem__ treating keys as positions is deprecated. 
# In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). 
# To access a value by position, use `ser.iloc[pos]`

In [111]:
obj['A']

0

In [112]:
obj[2:4]

C    2
D    3
dtype: int32

In [113]:
obj.iloc[1]    #use this instead of obj[1]

1

In [114]:
obj[['A', 'B', 'E']]

A    0
B    1
E    4
dtype: int32

In [115]:
obj[::-1]

E    4
D    3
C    2
B    1
A    0
dtype: int32

In [116]:
obj[obj > 2]

D    3
E    4
dtype: int32

### preferred way

In [117]:
obj.loc[['A', 'B', 'E']]

A    0
B    1
E    4
dtype: int32

In [118]:
obj.loc['B' : 'D']    #endpoint is also inclusive in this type of slicing

B    1
C    2
D    3
dtype: int32

In [119]:
obj.iloc[1 : 4]

B    1
C    2
D    3
dtype: int32

In [120]:
obj.iloc[::-1]

E    4
D    3
C    2
B    1
A    0
dtype: int32

In [121]:
obj.loc[obj > 2]

D    3
E    4
dtype: int32

### in DataFrame loc and iloc are prefered as well

In [122]:
data

Unnamed: 0,one,two,three,four
ohio,0,1,2,3
colorado,4,5,6,7
texas,8,9,10,11
new york,12,13,14,15


In [123]:
data['two']

ohio         1
colorado     5
texas        9
new york    13
Name: two, dtype: int32

In [124]:
data[['three', 'four']]

Unnamed: 0,three,four
ohio,2,3
colorado,6,7
texas,10,11
new york,14,15


In [125]:
data[:2]

Unnamed: 0,one,two,three,four
ohio,0,1,2,3
colorado,4,5,6,7


In [126]:
data > 5

Unnamed: 0,one,two,three,four
ohio,False,False,False,False
colorado,False,False,True,True
texas,True,True,True,True
new york,True,True,True,True


In [127]:
data['three'] > 5

ohio        False
colorado     True
texas        True
new york     True
Name: three, dtype: bool

In [128]:
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
colorado,4,5,6,7
texas,8,9,10,11
new york,12,13,14,15


### using loc and iloc

In [129]:
data.loc['colorado']

one      4
two      5
three    6
four     7
Name: colorado, dtype: int32

In [130]:
data.loc['colorado', ['two', 'four']]

two     5
four    7
Name: colorado, dtype: int32

In [131]:
data.loc[data['three'] > 5]

Unnamed: 0,one,two,three,four
colorado,4,5,6,7
texas,8,9,10,11
new york,12,13,14,15


In [132]:
data.loc['colorado':'texas']

Unnamed: 0,one,two,three,four
colorado,4,5,6,7
texas,8,9,10,11


In [133]:
data.loc[['ohio', 'texas'], 'two':]

Unnamed: 0,two,three,four
ohio,1,2,3
texas,9,10,11


In [134]:
data.iloc[2] = 5

In [135]:
data

Unnamed: 0,one,two,three,four
ohio,0,1,2,3
colorado,4,5,6,7
texas,5,5,5,5
new york,12,13,14,15


In [136]:
data.iloc[:, :3][data['three'] >= 5]    #chained indexing

Unnamed: 0,one,two,three
colorado,4,5,6
texas,5,5,5
new york,12,13,14


In [137]:
data.at['ohio', 'one']

0

In [138]:
data.iat[2, 0]

5

### Integers indexing pitfalls

In [139]:
ser = pd.Series(np.arange(5.))

In [140]:
ser

0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

In [141]:
# ser[-1]   it will throw KeyError: -1
ser.iloc[-1]

4.0

### Pitfalls with chained indexing

In [142]:
# data.loc[data.three == 5]['three'] = 6      #SettingWithCopyWarning
data.loc[data.three == 5, 'three'] = 6

In [143]:
data

Unnamed: 0,one,two,three,four
ohio,0,1,2,3
colorado,4,5,6,7
texas,5,5,6,5
new york,12,13,14,15


## Arithmetic and Data Alignment

In [144]:
s1 = pd.Series(np.random.standard_normal(5), index= [chr(i) for i in range(65, 70)])

In [145]:
s1

A    0.028137
B   -0.273896
C    1.799695
D   -0.785012
E    0.380279
dtype: float64

In [146]:
s2 = pd.Series(np.random.standard_normal(4), index= [chr(i) for i in range(67, 71)])

In [147]:
s2

C    0.205992
D   -1.100362
E    0.429413
F   -0.210545
dtype: float64

In [148]:
s1 + s2

A         NaN
B         NaN
C    2.005687
D   -1.885373
E    0.809692
F         NaN
dtype: float64

In [149]:
df1 = pd.DataFrame(np.arange(9).reshape(3, 3), index= list('abc'))

In [150]:
df2 = pd.DataFrame(np.arange(16).reshape(4, 4), index= list('bcde'))

In [151]:
df1

Unnamed: 0,0,1,2
a,0,1,2
b,3,4,5
c,6,7,8


In [152]:
df2

Unnamed: 0,0,1,2,3
b,0,1,2,3
c,4,5,6,7
d,8,9,10,11
e,12,13,14,15


In [153]:
df1 + df2

Unnamed: 0,0,1,2,3
a,,,,
b,3.0,5.0,7.0,
c,10.0,12.0,14.0,
d,,,,
e,,,,


### Arithmetic methods with fill values

In [154]:
df1.add(df2, fill_value= 0)     
#if doesn't get value from both data frames then it will place 0 else if it get value from one frame then it will place that

Unnamed: 0,0,1,2,3
a,0.0,1.0,2.0,
b,3.0,5.0,7.0,3.0
c,10.0,12.0,14.0,7.0
d,8.0,9.0,10.0,11.0
e,12.0,13.0,14.0,15.0


In [155]:
df1.reindex(columns= df2.columns, fill_value= 0)

Unnamed: 0,0,1,2,3
a,0,1,2,0
b,3,4,5,0
c,6,7,8,0


### arithmetic funcs

In [156]:
1 / df1

Unnamed: 0,0,1,2
a,inf,1.0,0.5
b,0.333333,0.25,0.2
c,0.166667,0.142857,0.125


In [157]:
df1.rdiv(1)

Unnamed: 0,0,1,2
a,inf,1.0,0.5
b,0.333333,0.25,0.2
c,0.166667,0.142857,0.125


In [158]:
df1.add(df2)

Unnamed: 0,0,1,2,3
a,,,,
b,3.0,5.0,7.0,
c,10.0,12.0,14.0,
d,,,,
e,,,,


In [159]:
df1.radd(df2)

Unnamed: 0,0,1,2,3
a,,,,
b,3.0,5.0,7.0,
c,10.0,12.0,14.0,
d,,,,
e,,,,


In [160]:
df1.sub(df2)

Unnamed: 0,0,1,2,3
a,,,,
b,3.0,3.0,3.0,
c,2.0,2.0,2.0,
d,,,,
e,,,,


In [161]:
df1.rsub(df2)

Unnamed: 0,0,1,2,3
a,,,,
b,-3.0,-3.0,-3.0,
c,-2.0,-2.0,-2.0,
d,,,,
e,,,,


### Operations between DataFrame and Series

In [162]:
data

Unnamed: 0,one,two,three,four
ohio,0,1,2,3
colorado,4,5,6,7
texas,5,5,6,5
new york,12,13,14,15


In [163]:
series = data.iloc[0]

In [164]:
series

one      0
two      1
three    2
four     3
Name: ohio, dtype: int32

In [165]:
data - series

Unnamed: 0,one,two,three,four
ohio,0,0,0,0
colorado,4,4,4,4
texas,5,4,4,2
new york,12,12,12,12


In [166]:
series2 = pd.Series(np.arange(3), index= ['one', 'two', 'five'])

In [167]:
series2

one     0
two     1
five    2
dtype: int32

In [168]:
data + series2

Unnamed: 0,five,four,one,three,two
ohio,,,0.0,,2.0
colorado,,,4.0,,6.0
texas,,,5.0,,6.0
new york,,,12.0,,14.0


In [169]:
series3 = data.loc[:, 'one']

In [170]:
series3

ohio         0
colorado     4
texas        5
new york    12
Name: one, dtype: int32

In [171]:
data.sub(series3, axis= 'index')

Unnamed: 0,one,two,three,four
ohio,0,1,2,3
colorado,0,1,2,3
texas,0,0,1,0
new york,0,1,2,3


## Function Application and Mapping

In [172]:
frame = pd.DataFrame(np.random.standard_normal((4, 3)), index= ['ohio', 'texas', 'utas', 'yankton'], columns= list('bde'))

In [173]:
frame

Unnamed: 0,b,d,e
ohio,-0.75771,-0.144361,-0.490735
texas,-1.959289,-1.325049,0.492364
utas,-0.74467,0.868626,-0.956877
yankton,0.751075,0.827658,0.393315


In [174]:
frame = np.abs(frame)

In [175]:
frame

Unnamed: 0,b,d,e
ohio,0.75771,0.144361,0.490735
texas,1.959289,1.325049,0.492364
utas,0.74467,0.868626,0.956877
yankton,0.751075,0.827658,0.393315


In [176]:
def func(x):
    return x.max() - x.min()

In [177]:
frame.apply(func)    #passes the columns of DataFrame 1 by 1 to the func and returns the value as a series

b    1.214619
d    1.180688
e    0.563562
dtype: float64

In [178]:
frame.apply(func, axis= 'columns')

ohio       0.613349
texas      1.466924
utas       0.212207
yankton    0.434344
dtype: float64

In [179]:
def func2(x):
    return pd.Series([x.max(), x.min()], index= ['max', 'min'])

In [180]:
frame.apply(func2)

Unnamed: 0,b,d,e
max,1.959289,1.325049,0.956877
min,0.74467,0.144361,0.393315


In [181]:
def round_val(x):
    return round(x, 2)

In [182]:
frame.apply(round_val)

Unnamed: 0,b,d,e
ohio,0.76,0.14,0.49
texas,1.96,1.33,0.49
utas,0.74,0.87,0.96
yankton,0.75,0.83,0.39


In [183]:
frame.map(round_val)

Unnamed: 0,b,d,e
ohio,0.76,0.14,0.49
texas,1.96,1.33,0.49
utas,0.74,0.87,0.96
yankton,0.75,0.83,0.39


## sorting and ranking

In [184]:
obj = pd.Series(np.arange(4), index= list('dbca'))

In [185]:
obj

d    0
b    1
c    2
a    3
dtype: int32

In [186]:
obj.sort_index()

a    3
b    1
c    2
d    0
dtype: int32

In [187]:
obj.sort_values()

d    0
b    1
c    2
a    3
dtype: int32

In [188]:
obj.sort_values(ascending= False)

a    3
c    2
b    1
d    0
dtype: int32

In [189]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)), index= ['three', 'one'], columns= list('dabc'))

In [190]:
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [191]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [192]:
frame.sort_index(axis= 'columns')

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [193]:
obj = pd.Series([1, -4, np.nan, 5, 0, np.nan])

In [194]:
obj.sort_values()

1   -4.0
4    0.0
0    1.0
3    5.0
2    NaN
5    NaN
dtype: float64

In [195]:
obj.sort_values(na_position= 'first')

2    NaN
5    NaN
1   -4.0
4    0.0
0    1.0
3    5.0
dtype: float64

In [196]:
frame = pd.DataFrame({'b': [4, -2, 3, 0], 'a': [1, 4, -3, 9]})

In [197]:
frame

Unnamed: 0,b,a
0,4,1
1,-2,4
2,3,-3
3,0,9


In [198]:
frame.sort_values('b')

Unnamed: 0,b,a
1,-2,4
3,0,9
2,3,-3
0,4,1


In [199]:
frame.sort_values('a', ascending= False)

Unnamed: 0,b,a
3,0,9
1,-2,4
0,4,1
2,3,-3


In [200]:
frame.sort_values(['b', 'a'])   
#prioritize the first column passed and sort accordingly if the next column has a smaller value at same index then sorts according to the next column

Unnamed: 0,b,a
1,-2,4
3,0,9
2,3,-3
0,4,1


### Ranking

In [202]:
obj = pd.Series([7, -5, 7, 8, 9, 0, 6])

In [203]:
obj

0    7
1   -5
2    7
3    8
4    9
5    0
6    6
dtype: int64

In [204]:
obj.rank()

0    4.5
1    1.0
2    4.5
3    6.0
4    7.0
5    2.0
6    3.0
dtype: float64