Handling Missing Data

In [8]:
# Reasons fpr handling missing data:-
# 1. To identify data collection problems
# 2. Potential biases in data caused by missing data

# NaN and None in python is NA:"not applicable" values. These occur when data doesn't exists or is not read

# isna() method returns boolean value indicating which values are missing 

import pandas as pd
import numpy as np

ser = pd.Series([2,np.nan,None,3,4])
ser

0    2.0
1    NaN
2    NaN
3    3.0
4    4.0
dtype: float64

In [9]:
ser.isna()

0    False
1     True
2     True
3    False
4    False
dtype: bool

In [10]:
ser.fillna(value="filled null values") # fill null values with the specified value

0                   2.0
1    filled null values
2    filled null values
3                   3.0
4                   4.0
dtype: object

In [11]:
ser.ffill() # fills the value with prev value

0    2.0
1    2.0
2    2.0
3    3.0
4    4.0
dtype: float64

In [12]:
ser.bfill() # fills the value with next value

0    2.0
1    3.0
2    3.0
3    3.0
4    4.0
dtype: float64

In [13]:
ser.notna() # negation of isna

0     True
1    False
2    False
3     True
4     True
dtype: bool

Filtering Out Missing Data

In [14]:
ser.dropna() # On series it returns the non-null values and indexes

0    2.0
3    3.0
4    4.0
dtype: float64

In [15]:
ser[ser.notna()]

0    2.0
3    3.0
4    4.0
dtype: float64

In [16]:
# In DataFrames dropna() drops any row(not col) containing null values
data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],[np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [17]:
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [18]:
# Passing how="all" will drop only rows that are all NA
data.dropna(how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [19]:
# for columns pass: axis = "columns"
data[4] = np.nan
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [20]:
data.dropna(axis=1, how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [21]:
# If we want to mention the certain(number of null) values contained should only be allowed: use "thresh"
data.dropna(thresh=2)

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
3,,6.5,3.0,


Filling In Missing Data

In [22]:
# Instead of removing the valuable data using dropna(), we use "fillna(value_jisse_null_replace)"
data.fillna(0)

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,0.0
1,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,6.5,3.0,0.0


In [23]:
# fillna({}) with dictionary allows us to fill different column's null values with different values
data.fillna({0:"sristi",1:"muskan"})

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,muskan,,
2,sristi,muskan,,
3,sristi,6.5,3.0,


In [24]:
data.fillna(0,limit=1) # limit: kitne null hatana h rows/cols se

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,0.0
1,1.0,0.0,0.0,
2,0.0,,,
3,,6.5,3.0,


In [25]:
data.fillna(data.mean())

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,6.5,3.0,
2,1.0,6.5,3.0,
3,1.0,6.5,3.0,


Removing Duplicates

In [26]:
# duplicated returns a Boolean Series indicating whether each row is a duplicate (its column values are exactly equal to those in an earlier row) or not
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [27]:
data[4].duplicated()

0    False
1     True
2     True
3     True
Name: 4, dtype: bool

In [28]:
data[2].duplicated()

0    False
1    False
2     True
3     True
Name: 2, dtype: bool

In [29]:
data.duplicated()

0    False
1    False
2    False
3    False
dtype: bool

In [30]:
# drop_duplicates return the rows where duplicates = false
data.drop_duplicates()

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [31]:
data.drop_duplicates(subset=[4])

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,


In [32]:
data[3] = np.arange(4)
data

Unnamed: 0,0,1,2,4,3
0,1.0,6.5,3.0,,0
1,1.0,,,,1
2,,,,,2
3,,6.5,3.0,,3


In [33]:
data.drop_duplicates(subset=[3])

Unnamed: 0,0,1,2,4,3
0,1.0,6.5,3.0,,0
1,1.0,,,,1
2,,,,,2
3,,6.5,3.0,,3


In [34]:
data.drop_duplicates(subset=[0])

Unnamed: 0,0,1,2,4,3
0,1.0,6.5,3.0,,0
2,,,,,2


In [35]:
data[4].drop_duplicates(keep="last")

3   NaN
Name: 4, dtype: float64

Transforming Data Using a Function or Mapping

In [36]:
data = pd.DataFrame({"food":["bacon", "pulled pork", "bacon","pastrami", "corned beef", "bacon","pastrami", "honey ham", "nova lox"],
          "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [37]:
# To add data another column, based on the previous col. Here animal names with food
meat_to_animal = {
  "bacon": "pig",
  "pulled pork": "pig",
  "pastrami": "cow",
  "corned beef": "cow",
  "honey ham": "pig",
  "nova lox": "salmon"
}

In [38]:
data["animal"] = data["food"].map(meat_to_animal) # using mapping 
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [39]:
def get_animal(x):
    return meat_to_animal[x]

data["animal"] = data["food"].map(get_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


# Replacing Values

In [40]:
ser = pd.Series([1,2,1,21,21,2,1,2,1,2,1,2,12,12,2])

In [41]:
ser.isna()

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
dtype: bool

In [42]:
# considering 12 as missing value above, so we want to replace it with nan
ser.replace(12,np.nan)

0      1.0
1      2.0
2      1.0
3     21.0
4     21.0
5      2.0
6      1.0
7      2.0
8      1.0
9      2.0
10     1.0
11     2.0
12     NaN
13     NaN
14     2.0
dtype: float64

In [43]:
data = pd.DataFrame({"A":[0,1,2], "B":[2,4,6]})
data

Unnamed: 0,A,B
0,0,2
1,1,4
2,2,6


In [44]:
data.replace(2,np.nan)

Unnamed: 0,A,B
0,0.0,
1,1.0,4.0
2,,6.0


In [45]:
# multiple replacement
data.replace([1,2], np.nan)

Unnamed: 0,A,B
0,0.0,
1,,4.0
2,,6.0


In [46]:
# replacing with different values
data.replace([1,2],["mm","ss"])

Unnamed: 0,A,B
0,0,ss
1,mm,4
2,ss,6


In [47]:
# specifying replacing values using maps
data.replace({"A":np.nan, 0:"muskan"})

Unnamed: 0,A,B
0,muskan,2
1,1,4
2,2,6


Renaming Axis Indexes

In [48]:
# Like values, indexes can also be modified by a function or mapping
data = pd.DataFrame(np.arange(12).reshape((3,4)), index=["Ohio", "Colorado", "New York"],
                    columns=["one", "two", "three", "four"])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [49]:
def transform(x):
    return x.upper()

data.index = data.index.map(transform)

In [50]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


In [51]:
data.columns = data.columns.map(transform)

In [52]:
data

Unnamed: 0,ONE,TWO,THREE,FOUR
OHIO,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


In [53]:
# If want to create a transformed version of a dataset without modifying the original, a useful method is "rename"
data.rename(index=str.lower, columns=str.lower)

Unnamed: 0,one,two,three,four
ohio,0,1,2,3
colorado,4,5,6,7
new york,8,9,10,11


In [54]:
data # origianl dat aremains unchanged

Unnamed: 0,ONE,TWO,THREE,FOUR
OHIO,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


In [55]:
# rename can be used in conjunction with dictionary
data = data.rename({"OHIO":"Muskan", "NEW YORK":"Sristi"})
data # assigning changes the data

Unnamed: 0,ONE,TWO,THREE,FOUR
Muskan,0,1,2,3
COLORADO,4,5,6,7
Sristi,8,9,10,11


Discretization and Binning

In [56]:
#  Suppose you have data about a group of people in a study, and you want to group them into discrete age buckets
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [57]:
# Lets divide these into bins 18 to 25, 26 to 35, 36 to 60, and finally 61 and older. To do so us "pd.cut()"
bins = [18,25,35,60,100]
age_categories = pd.cut(ages,bins)

In [58]:
# so the bins are created as ()parenthesis means not including this value, []sq bracket means including this value
# cut() method cuts ages by creating bins between bins[0] and bin[1], bin[1] and bin[2]...
# the returned list shows the range in which the respective value lies
list(age_categories)

[Interval(18, 25, closed='right'),
 Interval(18, 25, closed='right'),
 Interval(18, 25, closed='right'),
 Interval(25, 35, closed='right'),
 Interval(18, 25, closed='right'),
 Interval(18, 25, closed='right'),
 Interval(35, 60, closed='right'),
 Interval(25, 35, closed='right'),
 Interval(60, 100, closed='right'),
 Interval(35, 60, closed='right'),
 Interval(35, 60, closed='right'),
 Interval(25, 35, closed='right')]

In [59]:
age_categories.codes # 0-> 1st group, 1->2nd group..

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [60]:
age_categories.categories # bin categories created

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [61]:
pd.value_counts(age_categories)

  pd.value_counts(age_categories)


(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
Name: count, dtype: int64

In [62]:
pd.cut(ages,bins,right=False) # (] -> [)

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [63]:
data = np.random.standard_normal(10)
data

array([ 1.47388769, -0.70389048,  0.62875029, -1.1083621 , -0.44095829,
        0.02083762, -0.99816144,  0.48797898, -0.88304686, -1.79695871])

In [64]:
# cutting without giving the end-points, Bas mentioning ki kitne intervals banao
quartiles = pd.qcut(data,4)
list(quartiles.categories)

[Interval(-1.7979999999999998, -0.969, closed='right'),
 Interval(-0.969, -0.572, closed='right'),
 Interval(-0.572, 0.371, closed='right'),
 Interval(0.371, 1.474, closed='right')]

In [65]:
pd.value_counts(quartiles)

  pd.value_counts(quartiles)


(-1.7979999999999998, -0.969]    3
(0.371, 1.474]                   3
(-0.969, -0.572]                 2
(-0.572, 0.371]                  2
Name: count, dtype: int64

Detecting and Filtering Outliers

In [71]:
data = pd.DataFrame(np.random.standard_normal((9,3)))
data

Unnamed: 0,0,1,2
0,-0.779771,0.164031,0.72234
1,0.306143,-0.731195,1.010843
2,-2.420167,-0.005691,-0.08065
3,1.532065,-1.588738,0.350389
4,1.15035,1.451945,0.931638
5,0.730338,0.826255,0.436345
6,-1.979011,0.954086,-1.492829
7,0.121101,1.176431,-0.065563
8,-0.458212,2.163214,0.144182


In [75]:
data[data[2] > 0] # filtering on the basis of columns 2

Unnamed: 0,0,1,2
0,-0.779771,0.164031,0.72234
1,0.306143,-0.731195,1.010843
3,1.532065,-1.588738,0.350389
4,1.15035,1.451945,0.931638
5,0.730338,0.826255,0.436345
8,-0.458212,2.163214,0.144182


Random sampling and Permutation

In [76]:
data.take(np.random.permutation(9))

Unnamed: 0,0,1,2
8,-0.458212,2.163214,0.144182
0,-0.779771,0.164031,0.72234
2,-2.420167,-0.005691,-0.08065
7,0.121101,1.176431,-0.065563
4,1.15035,1.451945,0.931638
3,1.532065,-1.588738,0.350389
5,0.730338,0.826255,0.436345
6,-1.979011,0.954086,-1.492829
1,0.306143,-0.731195,1.010843


In [77]:
data.iloc[np.random.permutation(9)]

Unnamed: 0,0,1,2
6,-1.979011,0.954086,-1.492829
2,-2.420167,-0.005691,-0.08065
5,0.730338,0.826255,0.436345
3,1.532065,-1.588738,0.350389
1,0.306143,-0.731195,1.010843
0,-0.779771,0.164031,0.72234
8,-0.458212,2.163214,0.144182
4,1.15035,1.451945,0.931638
7,0.121101,1.176431,-0.065563


In [79]:
# By invoking take with axis="columns", we could also select a permutation of the columns
column_sampler = np.arange(3)
data.take(column_sampler,axis=1)

Unnamed: 0,0,1,2
0,-0.779771,0.164031,0.72234
1,0.306143,-0.731195,1.010843
2,-2.420167,-0.005691,-0.08065
3,1.532065,-1.588738,0.350389
4,1.15035,1.451945,0.931638
5,0.730338,0.826255,0.436345
6,-1.979011,0.954086,-1.492829
7,0.121101,1.176431,-0.065563
8,-0.458212,2.163214,0.144182


In [89]:
data.sample(n=5) # any 5 random rows appears. No replacement of rows

Unnamed: 0,0,1,2
7,0.121101,1.176431,-0.065563
5,0.730338,0.826255,0.436345
8,-0.458212,2.163214,0.144182
4,1.15035,1.451945,0.931638
0,-0.779771,0.164031,0.72234


In [92]:
data.sample(11,replace=True)

Unnamed: 0,0,1,2
4,1.15035,1.451945,0.931638
0,-0.779771,0.164031,0.72234
8,-0.458212,2.163214,0.144182
8,-0.458212,2.163214,0.144182
8,-0.458212,2.163214,0.144182
7,0.121101,1.176431,-0.065563
5,0.730338,0.826255,0.436345
8,-0.458212,2.163214,0.144182
5,0.730338,0.826255,0.436345
2,-2.420167,-0.005691,-0.08065
