Using Pandas with NumPy: Handling Missing Values (dropna, replace, isna, fillna, astype)

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy records mixing several kinds of "missing" markers: real missing values
# (np.nan, None) and placeholder strings ('NA', 'Missing').
# Note that 'Age' holds digit strings, not numbers.
data = {
    'first': ['Shin', 'Doreamon', 'Mottu', 'Tom', np.nan, None, 'NA'],
    'last': ['chan', 'Nobita', 'Pattlu', 'Jerry', np.nan, np.nan, 'Missing'],
    'email': [
        'shinchan@gmail.com',
        'doreamon@gmail.com',
        'mottu@gmail.com',
        None,
        np.nan,
        'some@gmail.com',
        'NA',
    ],
    'Age': ['5', '9', '6', '7', None, None, 'Missing'],
}

In [3]:
# Build the DataFrame from the dict: each key becomes a column name and each
# list becomes that column's values.
df = pd.DataFrame(data)

In [4]:
# Display the frame as constructed (7 rows, 4 columns).
df

Unnamed: 0,first,last,email,Age
0,Shin,chan,shinchan@gmail.com,5
1,Doreamon,Nobita,doreamon@gmail.com,9
2,Mottu,Pattlu,mottu@gmail.com,6
3,Tom,Jerry,,7
4,,,,
5,,,some@gmail.com,
6,,Missing,,Missing


In [5]:
# Drop every row that contains at least one missing value (NaN/None).
# Rows 3, 4 and 5 are removed; row 6 survives because 'NA' and 'Missing'
# are still ordinary strings at this point, not missing values.
df.dropna()

Unnamed: 0,first,last,email,Age
0,Shin,chan,shinchan@gmail.com,5
1,Doreamon,Nobita,doreamon@gmail.com,9
2,Mottu,Pattlu,mottu@gmail.com,6
6,,Missing,,Missing


In [6]:
# Same call as df.dropna() with the default arguments spelled out:
# axis='index' drops rows, how='any' drops a row if any value in it is missing.

df.dropna(axis='index' , how='any')

Unnamed: 0,first,last,email,Age
0,Shin,chan,shinchan@gmail.com,5
1,Doreamon,Nobita,doreamon@gmail.com,9
2,Mottu,Pattlu,mottu@gmail.com,6
6,,Missing,,Missing


In [7]:
# how='all': drop a row only when every value in it is missing.
# Only row 4 qualifies.
df.dropna(axis='index' , how='all')

Unnamed: 0,first,last,email,Age
0,Shin,chan,shinchan@gmail.com,5
1,Doreamon,Nobita,doreamon@gmail.com,9
2,Mottu,Pattlu,mottu@gmail.com,6
3,Tom,Jerry,,7
5,,,some@gmail.com,
6,,Missing,,Missing


In [8]:
# axis='columns' with how='any': drop every column that has at least one
# missing value. Every column has one, so only the index is left.
df.dropna(axis='columns' , how='any')

0
1
2
3
4
5
6


In [9]:
# axis='columns' with how='all': drop only columns whose values are all
# missing. No column qualifies, so the frame comes back unchanged.
df.dropna(axis='columns' , how='all')

Unnamed: 0,first,last,email,Age
0,Shin,chan,shinchan@gmail.com,5
1,Doreamon,Nobita,doreamon@gmail.com,9
2,Mottu,Pattlu,mottu@gmail.com,6
3,Tom,Jerry,,7
4,,,,
5,,,some@gmail.com,
6,,Missing,,Missing


In [10]:
# subset restricts the check to the listed columns: only 'email' is inspected,
# so rows 3 and 4 (no email) are dropped while row 5 is kept.
df.dropna(axis='index' , how='any', subset = ['email'])

Unnamed: 0,first,last,email,Age
0,Shin,chan,shinchan@gmail.com,5
1,Doreamon,Nobita,doreamon@gmail.com,9
2,Mottu,Pattlu,mottu@gmail.com,6
5,,,some@gmail.com,
6,,Missing,,Missing


In [11]:
# With axis='columns', subset lists row labels instead of column names:
# keep only the columns that have a value in row 5. Only 'email' does.
df.dropna(axis='columns' , how='any' , subset = [5])

Unnamed: 0,email
0,shinchan@gmail.com
1,doreamon@gmail.com
2,mottu@gmail.com
3,
4,
5,some@gmail.com
6,


In [12]:
# how='any' with a subset: drop a row if ANY of the listed columns
# ('last', 'email') is NaN/None. Rows 3, 4 and 5 are removed.

df.dropna(axis='index' , how='any', subset = ['last','email'])

Unnamed: 0,first,last,email,Age
0,Shin,chan,shinchan@gmail.com,5
1,Doreamon,Nobita,doreamon@gmail.com,9
2,Mottu,Pattlu,mottu@gmail.com,6
6,,Missing,,Missing


In [13]:
# how='all' with a subset: drop a row only if ALL of the listed columns
# ('last', 'email') are NaN/None. Only row 4 is removed.

df.dropna(axis='index' , how='all', subset = ['last','email'])

Unnamed: 0,first,last,email,Age
0,Shin,chan,shinchan@gmail.com,5
1,Doreamon,Nobita,doreamon@gmail.com,9
2,Mottu,Pattlu,mottu@gmail.com,6
3,Tom,Jerry,,7
5,,,some@gmail.com,
6,,Missing,,Missing


In [14]:
# Turn the placeholder strings into real missing values so that dropna(),
# isna() and fillna() recognise them. A single replace() call handles both
# placeholders, and rebinding the result avoids inplace=True.
df = df.replace(['NA', 'Missing'], np.nan)

In [15]:
# Show the frame now that 'NA' and 'Missing' have been replaced with NaN.
df

Unnamed: 0,first,last,email,Age
0,Shin,chan,shinchan@gmail.com,5.0
1,Doreamon,Nobita,doreamon@gmail.com,9.0
2,Mottu,Pattlu,mottu@gmail.com,6.0
3,Tom,Jerry,,7.0
4,,,,
5,,,some@gmail.com,
6,,,,


In [16]:
# With the placeholders converted to NaN, row 6 is now dropped as well;
# only the fully populated rows 0-2 remain.
df.dropna()

Unnamed: 0,first,last,email,Age
0,Shin,chan,shinchan@gmail.com,5
1,Doreamon,Nobita,doreamon@gmail.com,9
2,Mottu,Pattlu,mottu@gmail.com,6


In [17]:
# Boolean mask with the same shape as df: True wherever a value is missing.
df.isna()

Unnamed: 0,first,last,email,Age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [18]:
# Show the frame with every missing value replaced by the string 'MISSING'.
# The result is only displayed, not assigned back, so df keeps its NaNs.
df.fillna('MISSING')

Unnamed: 0,first,last,email,Age
0,Shin,chan,shinchan@gmail.com,5
1,Doreamon,Nobita,doreamon@gmail.com,9
2,Mottu,Pattlu,mottu@gmail.com,6
3,Tom,Jerry,MISSING,7
4,MISSING,MISSING,MISSING,MISSING
5,MISSING,MISSING,some@gmail.com,MISSING
6,MISSING,MISSING,MISSING,MISSING


In [19]:
# Inspect the column dtypes: 'Age' is object because it still holds strings.
df.dtypes

first    object
last     object
email    object
Age      object
dtype: object

In [20]:
# np.nan is a float, which is why 'Age' is cast to float (not int) below.
type(np.nan)

float

In [21]:
# Selecting a single column returns a pandas Series.
type(df['Age'])

pandas.core.series.Series

In [22]:
# Convert 'Age' from digit strings to floats so numeric operations work;
# the missing entries become NaN.
df['Age'] = df['Age'].astype(float)

In [23]:
# Confirm the cast: 'Age' should now be float64.
df.dtypes

first     object
last      object
email     object
Age      float64
dtype: object

In [24]:
# Mean of the non-missing ages; NaN entries are skipped: (5 + 9 + 6 + 7) / 4.
df['Age'].mean()

6.75

In [25]:
# Calling astype on the DataFrame converts every column at once. The result is
# only displayed, not assigned back: the frame mixes text and numeric columns,
# so a single dtype for the whole frame is not what we want here.
df.astype(str)

Unnamed: 0,first,last,email,Age
0,Shin,chan,shinchan@gmail.com,5.0
1,Doreamon,Nobita,doreamon@gmail.com,9.0
2,Mottu,Pattlu,mottu@gmail.com,6.0
3,Tom,Jerry,,7.0
4,,,,
5,,,some@gmail.com,
6,,,,
