In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets

In [2]:
df = datasets.load_iris(return_X_y = True, as_frame=True)[0]

In [3]:
# check if any columns have nan values
for item in df:
    print(df[item].isnull().values.any())

False
False
False
False


In [4]:
# Check for datatypes
df.dtypes

sepal length (cm)    float64
sepal width (cm)     float64
petal length (cm)    float64
petal width (cm)     float64
dtype: object

In [5]:
# Changing data type to and fro str and float / int
df['petal width (cm)'] = df['petal width (cm)'].astype('str')
df['petal width (cm)'] = df['petal width (cm)'].astype('float')

In [6]:
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [7]:
# Getting the quantiles
quantiles = np.arange(0,1.1,0.1)
df.quantile(quantiles)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0.0,4.3,2.0,1.0,0.1
0.1,4.8,2.5,1.4,0.2
0.2,5.0,2.7,1.5,0.2
0.3,5.27,2.8,1.7,0.4
0.4,5.6,3.0,3.9,1.16
0.5,5.8,3.0,4.35,1.3
0.6,6.1,3.1,4.64,1.5
0.7,6.3,3.2,5.0,1.8
0.8,6.52,3.4,5.32,1.9
0.9,6.9,3.61,5.8,2.2


In [8]:
df['sepal length (cm)'].mean()
df['sepal length (cm)'].median()
df['sepal length (cm)'].min()
df['sepal length (cm)'].max()

7.9

In [9]:
# Multi-criteria filtering
# And
df[(df['sepal length (cm)'] > 5) & (df['sepal width (cm)'] > 3)]
# Or
df[(df['sepal length (cm)'] > 5) | (df['sepal width (cm)'] > 3)]

# Alternatively, if filtering string we can use
# filter_criterions = ['A', 'B', 'C']
# df[df.column.isin(filter_criterions)]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [10]:
# Select only two columns
df[['sepal length (cm)', 'sepal width (cm)']]

Unnamed: 0,sepal length (cm),sepal width (cm)
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6
...,...,...
145,6.7,3.0
146,6.3,2.5
147,6.5,3.0
148,6.2,3.4


In [11]:
df.fillna(method = 'bfill') # or ffill for forward filling

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [12]:
df.dropna() # axis: 0 = rows, axis: 1 = columns, how: any & all, subset: which columns to drop na

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [13]:
df.isna()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
145,False,False,False,False
146,False,False,False,False
147,False,False,False,False
148,False,False,False,False


In [14]:
# For normal row indexing
# iloc gives you the nth row of data, not the index "n"
print(type(df.iloc[1]))
print(type(df.iloc[[1]]))
print(type(df.iloc[1:10]))
print(type(df.iloc[1:10, 1]))
print(type(df.iloc[1:10, 0:2]))

<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>


In [15]:
# For boolean based indexing or label based indexing we use .loc
df.loc[df['sepal length (cm)'] > 4.5]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3
