Function Application and Mapping

In [1]:
import pandas as pd
import numpy as np

df = pd.DataFrame(np.random.standard_normal((4,3)), columns=list("abc"), index=["India","Bharat","Indraprastha","Aryavarta"])
df

Unnamed: 0,a,b,c
India,-1.150834,0.766867,0.67049
Bharat,-2.397443,1.319416,-0.360726
Indraprastha,-0.187029,0.112712,-0.340618
Aryavarta,-1.819753,-0.160073,-0.172333


In [4]:
df.abs()

Unnamed: 0,a,b,c
India,1.150834,0.766867,0.67049
Bharat,2.397443,1.319416,0.360726
Indraprastha,0.187029,0.112712,0.340618
Aryavarta,1.819753,0.160073,0.172333


In [8]:
def f1(x):
    return x.max() - x.min()

# To apply funtions to the DataFrames/Series we use: "apply"
df.apply(f1) # default: apply across row, i.e, compution rows   

a    2.210414
b    1.479489
c    1.031216
dtype: float64

In [10]:
df.apply(f1, axis="columns") # To apply across columns we mention the axis

India           1.917701
Bharat          3.716859
Indraprastha    0.453330
Aryavarta       1.659679
dtype: float64

In [18]:
# It can also return dataFrame instead of series(in above exampples)
def f2(x):
    return pd.Series([x.max(), x.min()], index=["max","min"])
df

Unnamed: 0,a,b,c
India,-1.150834,0.766867,0.67049
Bharat,-2.397443,1.319416,-0.360726
Indraprastha,-0.187029,0.112712,-0.340618
Aryavarta,-1.819753,-0.160073,-0.172333


In [21]:
df.apply(f2)

Unnamed: 0,a,b,c
max,-0.187029,1.319416,0.67049
min,-2.397443,-0.160073,-0.360726


In [23]:
df.apply(f2, axis="columns")

Unnamed: 0,max,min
India,0.766867,-1.150834
Bharat,1.319416,-2.397443
Indraprastha,0.112712,-0.340618
Aryavarta,-0.160073,-1.819753


Sorting and Ranking

In [29]:
# Sorting by indexing sorting: sort_index() for DataFrame and Series
df

Unnamed: 0,a,b,c
India,-1.150834,0.766867,0.67049
Bharat,-2.397443,1.319416,-0.360726
Indraprastha,-0.187029,0.112712,-0.340618
Aryavarta,-1.819753,-0.160073,-0.172333


In [30]:
df.sort_index() # across rows, sorts lexicographically

Unnamed: 0,a,b,c
Aryavarta,-1.819753,-0.160073,-0.172333
Bharat,-2.397443,1.319416,-0.360726
India,-1.150834,0.766867,0.67049
Indraprastha,-0.187029,0.112712,-0.340618


In [32]:
# To sort in descending order we make: "ascending=False"
df.sort_index(axis="columns", ascending=False) # across column

Unnamed: 0,c,b,a
India,0.67049,0.766867,-1.150834
Bharat,-0.360726,1.319416,-2.397443
Indraprastha,-0.340618,0.112712,-0.187029
Aryavarta,-0.172333,-0.160073,-1.819753


In [38]:
# To sort by values: sort_values method
ser = pd.Series([-1,-9,7,5,np.nan]) # any NaN values are sorted to the end
ser.sort_values()

1   -9.0
0   -1.0
3    5.0
2    7.0
4    NaN
dtype: float64

In [40]:
# To place the missing value to the starting
ser.sort_values(na_position="first")

4    NaN
1   -9.0
0   -1.0
3    5.0
2    7.0
dtype: float64

In [42]:
# In dataframe sorting we can sort only one column
df

Unnamed: 0,a,b,c
India,-1.150834,0.766867,0.67049
Bharat,-2.397443,1.319416,-0.360726
Indraprastha,-0.187029,0.112712,-0.340618
Aryavarta,-1.819753,-0.160073,-0.172333


In [46]:
df.sort_values("c")

Unnamed: 0,a,b,c
Bharat,-2.397443,1.319416,-0.360726
Indraprastha,-0.187029,0.112712,-0.340618
Aryavarta,-1.819753,-0.160073,-0.172333
India,-1.150834,0.766867,0.67049


In [47]:
df.sort_values(["c","b"])

Unnamed: 0,a,b,c
Bharat,-2.397443,1.319416,-0.360726
Indraprastha,-0.187029,0.112712,-0.340618
Aryavarta,-1.819753,-0.160073,-0.172333
India,-1.150834,0.766867,0.67049


In [50]:
# Ranking assigns the rank from 1.0 to no. of valid points
ser = pd.Series([2,-1,3,2,1]) #when similiar values occur their "ranks average" is assigned and count rank is skipped
ser.rank()

0    4.0
1    1.0
2    6.0
3    4.0
4    2.0
5    4.0
dtype: float64

In [52]:
# to assign rank acc to the order they have appeared instead of average
ser.rank(method="first") 
# For decending: ,asending = false

0    3.0
1    1.0
2    6.0
3    4.0
4    2.0
5    5.0
dtype: float64

In [55]:
df.rank(axis="columns")

Unnamed: 0,a,b,c
India,1.0,3.0,2.0
Bharat,1.0,3.0,2.0
Indraprastha,2.0,3.0,1.0
Aryavarta,1.0,3.0,2.0


Axis Indexes with Duplicate Labels

In [57]:
# series with duplicate indexes
obj = pd.Series(np.arange(5), index=["a", "a", "b", "b", "c"])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int32

In [58]:
# is_unique property allows us to find if given series/df have unique indexes or duplicate ones
obj.index.is_unique

False

In [60]:
obj["a"] # data selection is a series/dataFrame when duplicates are present

a    0
a    1
dtype: int32

Summarizing and Computing Descriptive Statistics

In [61]:
# Pandas way of dealing with NaN
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],[np.nan, np.nan], [0.75, -1.3]],
                  index=["a", "b", "c", "d"],
                  columns=["one", "two"])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [62]:
df.sum() #NaN values are computed to 0 

one    9.25
two   -5.80
dtype: float64

In [63]:
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [65]:
# if all values are NaN then corresponding(here sum) result is 0.
# some functions like mean needs atleast one non-NaN to yeild answer nhi to ye NaN output krta h

In [66]:
# by default "skipna=True", means we skip NaN values.
# "skipna=False" even one value in row/col is NaN then corresponding result is NaN
df.sum(axis=1,skipna=False)

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

In [67]:
# idxmax/idxmin: returns index value where max/min values are attained
df.idxmax()

one    b
two    d
dtype: object

In [68]:
df.idxmin

<bound method DataFrame.idxmin of     one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3>

In [69]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [70]:
df.cumsum

<bound method DataFrame.cumsum of     one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3>

In [71]:
# descibe produce multiple statics
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [73]:
# in non-numeric data, describe produces alternative summary statistics
obj = pd.Series(["a", "a", "b", "c"] * 4)
obj.describe()
# Others: https://wesmckinney.com/book/pandas-basics#tbl-table_descriptive_stats

count     16
unique     3
top        a
freq       8
dtype: object

Correlation and Covariance

In [78]:
# corr computes the correlation
# cov computes the covariance
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [85]:
df.iloc[1].corr(df.iloc[3])

0.9999999999999999

In [88]:
df.iloc[1].cov(df.iloc[3])

11.889999999999999

In [90]:
df.corr()

Unnamed: 0,one,two
one,1.0,-1.0
two,-1.0,1.0


In [91]:
df.cov()

Unnamed: 0,one,two
one,12.205833,-10.16
two,-10.16,5.12


In [98]:
# corrwith method, compute pair-wise correlations between a DataFrame’s columns or rows with another Series or DataFrame
df.corrwith(df["one"])

one    1.0
two   -1.0
dtype: float64

In [101]:
# unique: gives you an array of the unique values in a Series. not neccesarily appear in sorted/in which they appear
ser = pd.Series(["c", "a", "d", "a", "a", "b", "b", "c", "c"])
uniques = ser.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [102]:
# value_counts(): value frequencies(konsi value kitne baar present h)
ser.value_counts()

c    3
a    3
b    2
d    1
Name: count, dtype: int64

In [114]:
# value counts for df
data = pd.DataFrame({"Qu1": [1, 3, 4, 3, 4],"Qu2": [2, 3, 1, 2, 3],"Qu3": [1, 5, 2, 4, 4]})
data.apply(pd.value_counts).fillna(0)
# data.value_counts()

  data.apply(pd.value_counts).fillna(0)


Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


In [105]:
# isin(): checks for value id present or not
obj.isin(["b", "c"])

0     False
1     False
2      True
3      True
4     False
5     False
6      True
7      True
8     False
9     False
10     True
11     True
12    False
13    False
14     True
15     True
dtype: bool

In [107]:
# indexes ko bhi compare kr sakte
to_match = pd.Series(["c", "a", "b", "b", "c", "a"])
unique_vals = pd.Series(["c", "b", "a"])
pd.Index(unique_vals).get_indexer(to_match)

array([0, 2, 1, 1, 0, 2], dtype=int64)