In [1]:
#Handling null data
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
from numpy.random import randn, rand

#importing the necessary libraries

In [2]:
# Small example Series whose last entry is missing (NaN)
series = Series([1, 2, 3, 4, np.nan])
series

0    1.0
1    2.0
2    3.0
3    4.0
4    NaN
dtype: float64

In [3]:
# Tally missing vs. present entries: True counts the NaNs, False the valid values
series.isna().value_counts()

False    4
True     1
dtype: int64

In [4]:
# Only one value is missing; dropna() returns the Series without that entry
series.dropna()


0    1.0
1    2.0
2    3.0
3    4.0
dtype: float64

In [10]:
# The same applies to DataFrames.
# Row 'b' has one missing value, row 'c' is entirely missing.
df = DataFrame(
    [
        [1, 2, 3],
        [4, np.nan, 6],
        [np.nan, np.nan, np.nan],
    ],
    index=['a', 'b', 'c'],
    columns=['a', 'b', 'c'],
)
df

Unnamed: 0,a,b,c
a,1.0,2.0,3.0
b,4.0,,6.0
c,,,


In [11]:
# Element-wise null check: True marks each cell that holds NaN
df.isnull()

Unnamed: 0,a,b,c
a,False,False,False
b,False,True,False
c,True,True,True


In [12]:
# Drop every row that contains at least one NaN; only row 'a' survives here
df.dropna()

Unnamed: 0,a,b,c
a,1.0,2.0,3.0


In [13]:
# Re-display df: it still holds all three rows, so dropna() above did not modify it
df

Unnamed: 0,a,b,c
a,1.0,2.0,3.0
b,4.0,,6.0
c,,,


In [14]:
# how='all' drops a row only when every column in it is NaN (row 'c' here)
df.dropna(how='all')

Unnamed: 0,a,b,c
a,1.0,2.0,3.0
b,4.0,,6.0


In [16]:
# df still shows all three rows: dropna(how='all') returned a new frame
df

Unnamed: 0,a,b,c
a,1.0,2.0,3.0
b,4.0,,6.0
c,,,


In [18]:
# Drop columns (instead of rows) that contain any NaN.
# Every column of df has at least one NaN, so only the index is left.
df.dropna(axis='columns')

a
b
c


In [19]:
# A second frame with a growing number of NaNs per row (1, 0, 2 and 3)
df_2 = DataFrame(
    [
        [1, 2, 3, np.nan],
        [4, 5, 6, 7],
        [8, 9, np.nan, np.nan],
        [12, np.nan, np.nan, np.nan],
    ]
)
df_2

Unnamed: 0,0,1,2,3
0,1,2.0,3.0,
1,4,5.0,6.0,7.0
2,8,9.0,,
3,12,,,


In [20]:
# Keep only rows holding at least 3 non-NaN values (rows 0 and 1 here)
df_2.dropna(thresh=3)

Unnamed: 0,0,1,2,3
0,1,2.0,3.0,
1,4,5.0,6.0,7.0


The `thresh` parameter sets the minimum number of non-null values a row must contain to be kept; with `thresh = 3`, every row that has fewer than three non-null values is dropped.

In [29]:
# fillna(0) replaces every NaN in the frame with 0
df_2.fillna(0)

Unnamed: 0,0,1,2,3
0,1,2.0,3.0,0.0
1,4,5.0,6.0,7.0
2,8,9.0,0.0,0.0
3,12,0.0,0.0,0.0


In [32]:
# To use a different fill value per column, pass a dict of
# {column label: replacement}. Column 1 is not listed, so its NaN stays.
df_2.fillna(value={0: 900, 2: 10000, 3: 59990})

Unnamed: 0,0,1,2,3
0,1,2.0,3.0,59990.0
1,4,5.0,6.0,7.0
2,8,9.0,10000.0,59990.0
3,12,,10000.0,59990.0


In [34]:
# Column labels as a NumPy array; to_numpy() is the accessor pandas
# recommends over the older .values attribute.
df_2.columns.to_numpy()

array([0, 1, 2, 3], dtype=int64)

**Selecting and modifying entries**


In [35]:
# A Series with explicit string labels instead of the default 0..n-1 index
series1 = Series(data=[100, 200, 300], index=['A', 'B', 'C'])
series1

A    100
B    200
C    300
dtype: int64

In [37]:
# Label-based lookup: fetch the value stored under index label 'A'
series1['A']

100

In [38]:
# Pass a list of labels to select several elements at once; the result is a Series
series1[['A', 'B']]

A    100
B    200
dtype: int64

In [40]:
# First two elements by position. The index holds string labels, so use
# .iloc to make it explicit that this slice is positional, not label-based.
series1.iloc[:2]

A    100
B    200
dtype: int64

In [43]:
# Boolean-mask indexing: keep only the entries whose value is >= 150
series1[series1 >= 150]

B    200
C    300
dtype: int64

In [45]:
# Same boolean-mask technique with an equality test: entries equal to 300
series1[series1 == 300]

C    300
dtype: int64

In [46]:
# Accessing data in DataFrames.
# 5x5 frame of uniform random numbers in [0, 1); no seed is set, so the
# values differ on every run.
aff = DataFrame(
    rand(5, 5),
    index=list('abcde'),
    columns=list('defgh'),
)
aff

Unnamed: 0,d,e,f,g,h
a,0.076423,0.511554,0.00353,0.23962,0.415846
b,0.856815,0.547743,0.248673,0.187238,0.209304
c,0.643101,0.791435,0.419886,0.137632,0.419946
d,0.634462,0.839874,0.257267,0.489041,0.466611
e,0.226367,0.295296,0.180388,0.645439,0.624693


In [47]:
# Select a single column by label; the result is a Series named 'h'
aff['h']

a    0.415846
b    0.209304
c    0.419946
d    0.466611
e    0.624693
Name: h, dtype: float64

In [48]:
# Accessing multiple columns: pass a list of labels; columns come back in the order given
aff[['h', 'g']]

Unnamed: 0,h,g
a,0.415846,0.23962
b,0.209304,0.187238
c,0.419946,0.137632
d,0.466611,0.489041
e,0.624693,0.645439


In [49]:
# Element-wise comparison: yields a boolean frame, True where the value exceeds 0.5
aff > 0.5

Unnamed: 0,d,e,f,g,h
a,False,True,False,False,False
b,True,True,False,False,False
c,True,True,False,False,False
d,True,True,False,False,False
e,False,False,False,True,True


In [50]:
# Index with the boolean frame: values above 0.5 are kept, all others show as NaN
aff[aff > 0.5]

Unnamed: 0,d,e,f,g,h
a,,0.511554,,,
b,0.856815,0.547743,,,
c,0.643101,0.791435,,,
d,0.634462,0.839874,,,
e,,,,0.645439,0.624693


In [52]:
# Positional slicing with iloc: all rows, first three columns (d, e, f)
aff.iloc[:, :3]

Unnamed: 0,d,e,f
a,0.076423,0.511554,0.00353
b,0.856815,0.547743,0.248673
c,0.643101,0.791435,0.419886
d,0.634462,0.839874,0.257267
e,0.226367,0.295296,0.180388
