In [2]:
import numpy as np
from pandas import Series, DataFrame
import pandas as pd

In [3]:
# Small Series of string values with one missing entry (NaN), used below to
# demonstrate the missing-data helpers.
data = Series(["1", "2", np.nan, "10"])
data

0      1
1      2
2    NaN
3     10
dtype: object

In [4]:
data.isnull()                                           # Element-wise mask: True where the value is missing (NaN), False otherwise

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
data.dropna()                                           # Returns a new Series without the missing entries; `data` is not modified (result is not assigned)

0     1
1     2
3    10
dtype: object

In [7]:
# 4x3 frame with NaNs scattered over the rows; the last row is entirely NaN.
df = DataFrame(
    [
        [2, 4, 6],
        [np.nan, 12, 14],
        [18, np.nan, 20],
        [np.nan, np.nan, np.nan],
    ]
)
df

Unnamed: 0,0,1,2
0,2.0,4.0,6.0
1,,12.0,14.0
2,18.0,,20.0
3,,,


In [11]:
# Default behaviour spelled out: drop every row that contains at least one NaN.
clean_df = df.dropna(how="any")
clean_df

Unnamed: 0,0,1,2
0,2.0,4.0,6.0


In [14]:
# Drop only the rows in which *every* value is NaN.
df.dropna(how="all")

Unnamed: 0,0,1,2
0,2.0,4.0,6.0
1,,12.0,14.0
2,18.0,,20.0


In [15]:
# Drop columns (instead of rows) that contain any NaN. Every column of `df`
# has a NaN in the all-NaN last row, so no column survives.
df.dropna(axis="columns")

0
1
2
3


In [128]:
# Short alias for NaN keeps the row literals readable.
npn = np.nan
# 4x4 frame whose rows hold 3, 4, 2 and 1 non-null values respectively.
df2 = DataFrame(
    [
        [3, 6, 9, npn],
        [12, 15, 18, 21],
        [npn, 21, npn, 24],
        [1, npn, npn, npn],
    ]
)
df2

Unnamed: 0,0,1,2,3
0,3.0,6.0,9.0,
1,12.0,15.0,18.0,21.0
2,,21.0,,24.0
3,1.0,,,


In [129]:
df2.dropna(thresh = 2)                       #Keeps only the rows that have AT LEAST 2 non-null values (drops the rest)

Unnamed: 0,0,1,2,3
0,3.0,6.0,9.0,
1,12.0,15.0,18.0,21.0
2,,21.0,,24.0


In [130]:
# Keep only the rows that have at least 3 non-null values.
df2.dropna(thresh=3)

Unnamed: 0,0,1,2,3
0,3.0,6.0,9.0,
1,12.0,15.0,18.0,21.0


In [131]:
# Replace every NaN with the same value. The result is a new frame; df2 is
# left unchanged because the result is not assigned.
df2.fillna(value="x")

Unnamed: 0,0,1,2,3
0,3,6,9,x
1,12,15,18,21
2,x,21,x,24
3,1,x,x,x


In [132]:
# Per-column fill values: each dict key is a column label and the NaNs of
# that column are replaced with the mapped value.
df2.fillna(value={0: "A", 1: "B", 2: "C", 3: "D"})

Unnamed: 0,0,1,2,3
0,3,6,9,D
1,12,15,18,21
2,A,21,C,24
3,1,B,C,D


In [136]:
# Rebind df2 to the filled frame so the change persists for later cells.
df2 = df2.fillna(
    value={
        0: "A",
        1: "B",
        2: "C",
        3: "D",
    }
)
df2

Unnamed: 0,0,1,2,3
0,3,6,9,D
1,12,15,18,21
2,A,21,C,24
3,1,B,C,D


In [134]:
# Display df2 again: the name was rebound to the filled frame, so the fill values persist.
df2

Unnamed: 0,0,1,2,3
0,3,6,9,D
1,12,15,18,21
2,A,21,C,24
3,1,B,C,D


In [32]:
#Index hierarchy
from numpy.random import randn

In [92]:
# Ten random values under a two-level (hierarchical) index:
# outer labels 1/2, inner labels 'a'..'e'.
# A locally seeded Generator replaces the unseeded global randn(10), so the
# values (and everything derived from `ser`) are identical on every run.
rng = np.random.default_rng(0)
ser = Series(
    rng.standard_normal(10),
    index=[
        [1, 1, 1, 1, 1, 2, 2, 2, 2, 2],
        ["a", "b", "c", "d", "e", "a", "b", "c", "d", "e"],
    ],
)
ser

1  a   -0.004966
   b   -0.052718
   c   -1.357634
   d    0.011641
   e   -1.083691
2  a    0.351138
   b   -0.829573
   c    1.119131
   d   -1.292881
   e    0.017762
dtype: float64

In [93]:
# Inspect the index: the two parallel label lists passed to Series produced a two-level MultiIndex.
ser.index

MultiIndex(levels=[[1, 2], ['a', 'b', 'c', 'd', 'e']],
           labels=[[0, 0, 0, 0, 0, 1, 1, 1, 1, 1], [0, 1, 2, 3, 4, 0, 1, 2, 3, 4]])

In [94]:
# Partial selection on the outer index level: each label returns the
# sub-Series keyed by the inner labels.
for outer_label in (1, 2):
    print(ser.loc[outer_label])

a   -0.004966
b   -0.052718
c   -1.357634
d    0.011641
e   -1.083691
dtype: float64
a    0.351138
b   -0.829573
c    1.119131
d   -1.292881
e    0.017762
dtype: float64


In [95]:
# Selection on the inner index level: take every outer label (":") and a
# single inner label.
for inner_label in ("a", "e"):
    print(ser.loc[:, inner_label])

1   -0.004966
2    0.351138
dtype: float64
1   -1.083691
2    0.017762
dtype: float64


In [96]:
# Pivot the innermost index level ('a'..'e') into columns, turning the
# two-level Series into a DataFrame indexed by the outer level.
df3 = ser.unstack(level=-1)
df3

Unnamed: 0,a,b,c,d,e
1,-0.004966,-0.052718,-1.357634,0.011641,-1.083691
2,0.351138,-0.829573,1.119131,-1.292881,0.017762


In [112]:
# 4x4 frame of 0..15 with a two-level index on both axes.
# Note: the row index repeats the pairs ('a', '10') and ('b', '20'), and the
# second row level holds strings, not integers.
df4 = DataFrame(
    np.arange(16).reshape(4, 4),
    index=[
        ["a", "b", "a", "b"],
        ["10", "20", "10", "20"],
    ],
    columns=[
        ["LA", "NYC", "SF", "PHX"],
        ["Good", "Great", "Good", "Great"],
    ],
)
df4

Unnamed: 0_level_0,Unnamed: 1_level_0,LA,NYC,SF,PHX
Unnamed: 0_level_1,Unnamed: 1_level_1,Good,Great,Good,Great
a,10,0,1,2,3
b,20,4,5,6,7
a,10,8,9,10,11
b,20,12,13,14,15


In [113]:
# Label the levels of both axes so later calls can refer to them by name
# instead of by position.
df4.index.names = ["INDEX1", "INDEX2"]
df4.columns.names = ["CITIES", "RATINGS"]
df4

Unnamed: 0_level_0,CITIES,LA,NYC,SF,PHX
Unnamed: 0_level_1,RATINGS,Good,Great,Good,Great
INDEX1,INDEX2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,10,0,1,2,3
b,20,4,5,6,7
a,10,8,9,10,11
b,20,12,13,14,15


In [114]:
# Exchange the two row-index levels (axis=0); the data values are unchanged.
df4.swaplevel(i="INDEX1", j="INDEX2", axis=0)

Unnamed: 0_level_0,CITIES,LA,NYC,SF,PHX
Unnamed: 0_level_1,RATINGS,Good,Great,Good,Great
INDEX2,INDEX1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
10,a,0,1,2,3
20,b,4,5,6,7
10,a,8,9,10,11
20,b,12,13,14,15


In [120]:
# Sort the rows by index level 1 (INDEX2).
df4.sort_index(level=1)

Unnamed: 0_level_0,CITIES,LA,NYC,SF,PHX
Unnamed: 0_level_1,RATINGS,Good,Great,Good,Great
INDEX1,INDEX2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,10,0,1,2,3
a,10,8,9,10,11
b,20,4,5,6,7
b,20,12,13,14,15


In [118]:
df4.sort_index(level = 0)                                  #Sorts rows by index level 0 (INDEX1); columns are only sorted when axis = 1 is passed

Unnamed: 0_level_0,CITIES,LA,NYC,SF,PHX
Unnamed: 0_level_1,RATINGS,Good,Great,Good,Great
INDEX1,INDEX2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
a,10,0,1,2,3
a,10,8,9,10,11
b,20,4,5,6,7
b,20,12,13,14,15


In [117]:
# Sum the city columns within each RATINGS group, giving one column per rating.
# DataFrame.sum(level=...) was deprecated in pandas 1.3 and removed in 2.0.
# Transposing, grouping on the (now row) level and transposing back gives the
# same result without the removed `level` argument. sort=False keeps the
# groups in order of first appearance, as the old call did.
df4.T.groupby(level="RATINGS", sort=False).sum().T

Unnamed: 0_level_0,RATINGS,Good,Great
INDEX1,INDEX2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,10,2,4
b,20,10,12
a,10,18,20
b,20,26,28
