Essential Functionallity

In [2]:
import numpy as np
import pandas as pd

data = {"State": ["Delhi","MP","Bihar","UP"],
        "Name": ["Ananya","Muskan","Sristi","Utkarsh"],
        "Year": [2003,2001,2002,2000]}
df = pd.DataFrame(data,index=[1,2,3,4])
df

Unnamed: 0,State,Name,Year
1,Delhi,Ananya,2003
2,MP,Muskan,2001
3,Bihar,Sristi,2002
4,UP,Utkarsh,2000


In [2]:
df.columns.name = "info"
df.index.name = "S.no"
df

info,State,Name,Year
S.no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Delhi,Ananya,2003
2,MP,Muskan,2001
3,Bihar,Sristi,2002
4,UP,Utkarsh,2000


In [3]:
# 1. Reindexing
df.reindex([2,1,4,3,5])

info,State,Name,Year
S.no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,MP,Muskan,2001.0
1,Delhi,Ananya,2003.0
4,UP,Utkarsh,2000.0
3,Bihar,Sristi,2002.0
5,,,


In [4]:
# When working with time series data, we avoid NaN values and thus use method="ffil" option to fill
# It replaces the NULL values with the value from the previous row.

df.reindex(np.arange(7), method="ffill")[1:]

info,State,Name,Year
S.no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Delhi,Ananya,2003.0
2,MP,Muskan,2001.0
3,Bihar,Sristi,2002.0
4,UP,Utkarsh,2000.0
5,UP,Utkarsh,2000.0
6,UP,Utkarsh,2000.0


In [5]:
# In DataFrame, reindex can alter the index of rows(index=) and names of columns(columns=)
frame = pd.DataFrame(np.arange(9).reshape((3,3)), index=["a","c","d"],columns=["col1","col2","col3"])
frame

Unnamed: 0,col1,col2,col3
a,0,1,2
c,3,4,5
d,6,7,8


In [6]:
frame.reindex(index=["a","b","c","d"],columns=["col1","col4","col3"])
#because col2 wasn't present the data from it is dropped from the result

Unnamed: 0,col1,col4,col3
a,0.0,,2.0
b,,,
c,3.0,,5.0
d,6.0,,8.0


In [7]:
frame.reindex(labels=["A","c","C","D"])
# reindex enables using new indexes whereas loc doesn't

Unnamed: 0,col1,col2,col3
A,,,
c,3.0,4.0,5.0
C,,,
D,,,


Dropping Entries from an Axis


In [8]:
# drop method
obj = pd.DataFrame(np.arange(16).reshape((4,4)), index=[22,33,44,55], columns=["a","b","c","d"])
obj

Unnamed: 0,a,b,c,d
22,0,1,2,3
33,4,5,6,7
44,8,9,10,11
55,12,13,14,15


In [9]:
obj2 = obj.drop(22)
obj2

Unnamed: 0,a,b,c,d
33,4,5,6,7
44,8,9,10,11
55,12,13,14,15


In [10]:
obj2.drop(columns=["a","b"])

Unnamed: 0,c,d
33,6,7
44,10,11
55,14,15


In [11]:
obj2[33] = 33.33333 #creates copy
obj

Unnamed: 0,a,b,c,d
22,0,1,2,3
33,4,5,6,7
44,8,9,10,11
55,12,13,14,15


In [12]:
obj2

Unnamed: 0,a,b,c,d,33
33,4,5,6,7,33.33333
44,8,9,10,11,33.33333
55,12,13,14,15,33.33333


Indexing, Selection and Filtering

In [13]:
# Simliar to sequence, only that indexes can be values insted of only using integers
series = pd.Series(np.arange(4.),index=["a","b","c","d"])
series

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [14]:
series["b"]

1.0

In [15]:
# or
series[1]

  series[1]


1.0

In [16]:
# slicing is applicable in Series
series[2:4]

c    2.0
d    3.0
dtype: float64

In [17]:
# To get the values from series by speifying the indexes in certain way
series[["d","a","b"]] # Notice: [[]] the indices must be passed in the form of list

d    3.0
a    0.0
b    1.0
dtype: float64

In [18]:
series[[3,0,1]] # Zero based indexing, Notice here how 3rd index corresponds to "d" in the Series

  series[[3,0,1]] # Zero based indexing, Notice here how 3rd index corresponds to "d" in the Series


d    3.0
a    0.0
b    1.0
dtype: float64

In [19]:
# Boolean indexing
series[series<2]

a    0.0
b    1.0
dtype: float64

In [5]:
# Another way of indexing: Using "loc" and "iloc"

# loc: it can be used to select/retrieve the data, when the labels/indexes in Series are integers and not labels
series1 = pd.Series(["Hey","Hi","Hello"], index=[1,0,2])
series1

1      Hey
0       Hi
2    Hello
dtype: object

In [6]:
series2 = pd.Series(["Ya","Yayaya","yayyy"], index=["a","b","d"])
series2

a        Ya
b    Yayaya
d     yayyy
dtype: object

In [7]:
series1.loc[[1,2]]

1      Hey
2    Hello
dtype: object

In [9]:
# series2.loc[[0,1]] -> this will not work
series2.loc[["a","b"]]

a        Ya
b    Yayaya
dtype: object

In [24]:
# To be able to use numerical indexing even for Series with indexes as string/labels we use: iloc
series2.iloc[[1,2]]

b    Yayaya
d     yayyy
dtype: object

In [25]:
# when slicing with strings, then end points are also inclusive
series2["a":"d"]

a        Ya
b    Yayaya
d     yayyy
dtype: object

In [10]:
## Indexing of DataFrame
data = pd.DataFrame(np.arange(16).reshape((4,4)),
                    index=["Ohio", "Colorado", "Utah", "New York"], 
                    columns=["one", "two", "three", "four"])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [13]:
# retrieval of single element, Only coumns can be retrieved this way
# data.iloc[0] 
data["one"]

Ohio         0
Colorado     4
Utah         8
New York    12
Name: one, dtype: int32

In [28]:
# For multiple column retrieval, use[[]]
data[["one","two"]]

Unnamed: 0,one,two
Ohio,0,1
Colorado,4,5
Utah,8,9
New York,12,13


In [29]:
# For selecting rows we can use slicing
data[:]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [30]:
# Boolean DataFrame
data > 5

Unnamed: 0,one,two,three,four
Ohio,False,False,False,False
Colorado,False,False,True,True
Utah,True,True,True,True
New York,True,True,True,True


In [31]:
# Selection of DataFrame using loc: allows easy label based retrieval and iloc: allows easy index(integer) based retrieval
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [17]:
data.loc["Ohio"]

one      0
two      1
three    2
four     3
Name: Ohio, dtype: int32

In [33]:
data.loc[["New York","Utah"]]

Unnamed: 0,one,two,three,four
New York,12,13,14,15
Utah,8,9,10,11


In [34]:
data.loc[["New York","Utah"],["one","two"]]

Unnamed: 0,one,two
New York,12,13
Utah,8,9


In [35]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [36]:
# iloc
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [37]:
data.iloc[[2,3]]

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [38]:
data.iloc[[2,3,1],[1,1,1,2]]

Unnamed: 0,two,two.1,two.2,three
Utah,9,9,9,10
New York,13,13,13,14
Colorado,5,5,5,6


In [39]:
# Both indexing functions(loc or iloc) work with slices in addition to single labels or lists of labels:
data.loc[:"Utah","two"]

Ohio        1
Colorado    5
Utah        9
Name: two, dtype: int32

In [40]:
data.loc[:"Utah",["two","one"]]

Unnamed: 0,two,one
Ohio,1,0
Colorado,5,4
Utah,9,8


In [41]:
# NOTICE AN IMPORTANT DIFFERENCE BELOW
data.iloc[:,1:] #col remove-wise

Unnamed: 0,two,three,four
Ohio,1,2,3
Colorado,5,6,7
Utah,9,10,11
New York,13,14,15


In [42]:
data.iloc[1:] #row remove-wise

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [43]:
# Boolean array can be used with iloc and loc wlong with dot notation to access columns
data.loc[data.three>3]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [44]:
# For further-a-do
# https://wesmckinney.com/book/pandas-basics#tbl-table_dataframe_loc_iloc

Integer Indexing Pitfalls

In [45]:
# The main conclusion is that use loc (for labels) or iloc (for integers)
# Below are the errors that we can get if we dont follow this (loc-iloc) convection
ser=pd.Series(np.arange(5),index=[1,2,3,6,7])
ser[1]

0

In [46]:
ser[-1] # This will not work and thus will generate an error.
# This would have worked if indixes of Series would be strings/labels


KeyError: -1

In [None]:
series2[-1]

  series2[-1]


'yayyy'

In [47]:
# solution: use iloc
ser.iloc[-1]

4

Pitfalls with chained indexing

In [48]:
# Using loc and iloc as assignment
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [49]:
data.loc["Ohio"] = 88
data

Unnamed: 0,one,two,three,four
Ohio,88,88,88,88
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [50]:
data.iloc[:1] = 99
data

Unnamed: 0,one,two,three,four
Ohio,99,99,99,99
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [51]:
data.iloc[:,:1]=100 # ":" ke baad means working on colums
data

Unnamed: 0,one,two,three,four
Ohio,100,99,99,99
Colorado,100,5,6,7
Utah,100,9,10,11
New York,100,13,14,15


In [52]:
data.loc[:,"one"] = 101
data

Unnamed: 0,one,two,three,four
Ohio,101,99,99,99
Colorado,101,5,6,7
Utah,101,9,10,11
New York,101,13,14,15


In [53]:
data.iloc[2] = 5
data

Unnamed: 0,one,two,three,four
Ohio,101,99,99,99
Colorado,101,5,6,7
Utah,5,5,5,5
New York,101,13,14,15


In [54]:
data # original data is modified

Unnamed: 0,one,two,three,four
Ohio,101,99,99,99
Colorado,101,5,6,7
Utah,5,5,5,5
New York,101,13,14,15


Arithmetic and Data Alignment

In [74]:
# simpler to work with objects that have different indexes. eg, in case of adding the resulting indexes will the union
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],index=["a", "c", "e", "f", "g"])
s1+s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [103]:
df1 = pd.DataFrame(np.arange(9.).reshape((3,3)), columns=list("bcd"), index=["Delhi","Mumbai","Indore"])
df2 = pd.DataFrame(np.arange(12.).reshape((4,3)), columns=list("abc"), index=["Pune","Indore","Delhi","Mumbai"])
df1+df2 #intersection wale gets added while other remaining in union becomes NaN    

Unnamed: 0,a,b,c,d
Delhi,,7.0,9.0,
Indore,,10.0,12.0,
Mumbai,,13.0,15.0,
Pune,,,,


In [104]:
# if you add DataFrames/Series with no column or row label, then resulting df/ser will contain NaNs
d1 = pd.DataFrame({"muki":[1,2]})
d1

Unnamed: 0,muki
0,1
1,2


In [105]:
d2 = pd.DataFrame({"lolko":[9,8]})
d2

Unnamed: 0,lolko
0,9
1,8


In [106]:
d1+d2 # indexes common hote hue bhi nhi, Thus adds only when indexes are manually provided

Unnamed: 0,lolko,muki
0,,
1,,


In [107]:
# Adjusting the null NaN values
# to make a value a NaN: np.nan
d2[:1] = np.nan
d2

Unnamed: 0,lolko
0,
1,8.0


In [108]:
# To remove NaN value: fill_value=val
df1.add(df2,fill_value = 0.0)

Unnamed: 0,a,b,c,d
Delhi,6.0,7.0,9.0,2.0
Indore,3.0,10.0,12.0,8.0
Mumbai,9.0,13.0,15.0,5.0
Pune,0.0,1.0,2.0,


In [109]:
df1.subtract(df2,fill_value = 0.0)
df1
# For further a-do: https://wesmckinney.com/book/pandas-basics#tbl-table_flex_arith

Unnamed: 0,b,c,d
Delhi,0.0,1.0,2.0
Mumbai,3.0,4.0,5.0
Indore,6.0,7.0,8.0


In [111]:
# arithematic operations between different dimensions. Here DataFrame and Series
df = pd.DataFrame(np.arange(12).reshape((4,3)), index=["A","B","C","D"], columns=list("abc"))
df

Unnamed: 0,a,b,c
A,0,1,2
B,3,4,5
C,6,7,8
D,9,10,11


In [112]:
ser = df.iloc[0]
ser

a    0
b    1
c    2
Name: A, dtype: int32

In [113]:
df - ser

Unnamed: 0,a,b,c
A,0,0,0
B,3,3,3
C,6,6,6
D,9,9,9


In [118]:
# if series contain different row/column then NaN aajayega
ser = np.array([0,9,8,4])
ser

array([0, 9, 8, 4])

In [120]:
df.add(ser,axis="index") # here we have added column-wise.

Unnamed: 0,a,b,c
A,0,1,2
B,12,13,14
C,14,15,16
D,13,14,15
