# Handling Missing Data

Filtering out Missing Data

1 -isnull
2 -notnull
3- dropna

In [84]:
# Filtering out Missing Data:

In [85]:
import numpy as np
import pandas as pd

In [86]:
string_data = pd.Series(['aardvark', 'artichoke',np.nan, None])

In [87]:
string_data.isnull()

0    False
1    False
2     True
3     True
dtype: bool

In [88]:
string_data[string_data.isnull()]

2     NaN
3    None
dtype: object

In [89]:
string_data

0     aardvark
1    artichoke
2          NaN
3         None
dtype: object

In [90]:
string_data.notnull()

0     True
1     True
2    False
3    False
dtype: bool

In [91]:
string_data[string_data.notnull()]

0     aardvark
1    artichoke
dtype: object

In [92]:
data = pd.Series([1, np.nan, 3.5, np.nan, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [93]:
test = data.dropna()
test

0    1.0
2    3.5
4    7.0
dtype: float64

In [94]:
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [95]:
data.dropna(inplace=True)
data

0    1.0
2    3.5
4    7.0
dtype: float64

In [96]:
data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],[np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [97]:
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [98]:
data.dropna(axis = 1)

0
1
2
3


In [99]:
data.dropna(axis = 0, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [100]:
# Filling in Missing Data
df = pd.DataFrame(np.random.randn(7, 3))
df

Unnamed: 0,0,1,2
0,0.274228,-3.273905,0.836252
1,-0.433406,-0.507653,0.181486
2,-2.343382,0.015463,1.004919
3,0.313719,1.239515,0.714325
4,0.576823,1.778408,0.875229
5,-1.000664,0.202333,1.53196
6,0.503071,-1.958094,-1.391976


In [101]:
df.iloc[:4, 1] = np.nan
df

Unnamed: 0,0,1,2
0,0.274228,,0.836252
1,-0.433406,,0.181486
2,-2.343382,,1.004919
3,0.313719,,0.714325
4,0.576823,1.778408,0.875229
5,-1.000664,0.202333,1.53196
6,0.503071,-1.958094,-1.391976


In [102]:
df.iloc[:2, 2] = np.nan
df

Unnamed: 0,0,1,2
0,0.274228,,
1,-0.433406,,
2,-2.343382,,1.004919
3,0.313719,,0.714325
4,0.576823,1.778408,0.875229
5,-1.000664,0.202333,1.53196
6,0.503071,-1.958094,-1.391976


In [103]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.274228,0.0,0.0
1,-0.433406,0.0,0.0
2,-2.343382,0.0,1.004919
3,0.313719,0.0,0.714325
4,0.576823,1.778408,0.875229
5,-1.000664,0.202333,1.53196
6,0.503071,-1.958094,-1.391976


In [104]:
df.fillna({1:0.5, 2:0.7})

Unnamed: 0,0,1,2
0,0.274228,0.5,0.7
1,-0.433406,0.5,0.7
2,-2.343382,0.5,1.004919
3,0.313719,0.5,0.714325
4,0.576823,1.778408,0.875229
5,-1.000664,0.202333,1.53196
6,0.503071,-1.958094,-1.391976


# Data Transformation

Removing Duplicated
        
    1.duplicated
    2.drop_duplicated
    
Transforming Data Using a Function or Mapping

           1.map

Replacing values
       
           1.repace

Renaming Axis Indexes

           1. rename
    
Discretization And Binning

          1. cut
Detecting And Filtering Outliers

In [105]:
# Removing Duplicated

In [106]:
data = pd.DataFrame({'k1': ['one', 'two'] *3 + ['two'], 'k2': [1,1,1,3,3,4,4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,1
3,two,3
4,one,3
5,two,4
6,two,4


In [107]:
data.duplicated()

0    False
1    False
2     True
3    False
4    False
5    False
6     True
dtype: bool

In [108]:
data[data.duplicated()]

Unnamed: 0,k1,k2
2,one,1
6,two,4


In [109]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
3,two,3
4,one,3
5,two,4


In [110]:
data['v1'] = range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,1,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [111]:
data.duplicated(['k1'])

0    False
1    False
2     True
3     True
4     True
5     True
6     True
dtype: bool

In [112]:
data.drop_duplicates(['k1', 'k2'],  keep = 'last')

Unnamed: 0,k1,k2,v1
1,two,1,1
2,one,1,2
3,two,3,3
4,one,3,4
6,two,4,6


In [113]:
#Transforming Data using A Function or Mapping

In [120]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'pastrami', 'corned beef', 'Bacon', 'pastrami', 'honey ham', 
                              'nova lox'], 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [121]:
meat_to_animal= {'bacon': 'pig', 'pulled pork': 'pig', 'bacon': 'pig' ,'pastrami': 'cow', 'corned beef':'cow', 'hony ham': 'pig', 'nova lox': 'salmon'}


In [122]:
lowercased = data['food'].str.lower()

In [123]:
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [124]:
data['animal1'] = lowercased.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal1
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,
8,nova lox,6.0,salmon


In [125]:
data

Unnamed: 0,food,ounces,animal1
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,
8,nova lox,6.0,salmon


In [126]:
data['animal']= data['food'].map(lambda x : meat_to_animal[x.lower()])# one line code

KeyError: 'honey ham'

In [127]:
# Replacig values

In [129]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])

In [130]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [131]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [132]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [133]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [134]:
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

# Discretization and Binning

In [135]:
ages =[20, 22, 25, 27, 21, 37, 31, 61, 45, 41, 32]

In [136]:
bins = [18, 25, 35, 60, 100]

In [137]:
cats = pd.cut(ages, bins)

In [138]:
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 11
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [139]:
cats.codes

array([0, 0, 0, 1, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [140]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [141]:
pd.value_counts(cats)

(18, 25]     4
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

# Detecting and Filtering outliers


In [142]:
data = pd.DataFrame(np.random.randn(1000,4))

In [143]:
data.head(20)

Unnamed: 0,0,1,2,3
0,0.836693,0.091614,-1.780452,0.824216
1,0.701048,-0.328678,-0.545548,1.920707
2,-0.584275,0.674557,-0.099361,0.179497
3,-0.910676,0.384085,2.093288,2.28466
4,1.09239,-0.444024,2.038663,-2.397056
5,-0.952872,-1.256704,-0.504284,0.213812
6,-0.008987,0.363893,-0.760053,-0.349349
7,0.430233,-0.938893,-1.153373,-0.788969
8,-0.283824,-0.884075,1.092247,-0.044129
9,-0.148586,1.17384,1.554136,0.925818


In [144]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.027461,0.007676,-0.005906,0.038729
std,1.047036,1.033702,0.975343,0.980753
min,-3.131381,-3.426417,-2.915286,-3.013012
25%,-0.712664,-0.661576,-0.735093,-0.628181
50%,0.018208,0.055208,-0.028285,0.042175
75%,0.776729,0.673019,0.66071,0.653283
max,3.515974,3.313023,3.542548,2.912986


In [145]:
col = data[1]

In [146]:
col.head(15)

0     0.091614
1    -0.328678
2     0.674557
3     0.384085
4    -0.444024
5    -1.256704
6     0.363893
7    -0.938893
8    -0.884075
9     1.173840
10   -0.317884
11    1.142456
12    0.368107
13   -0.132004
14   -0.099741
Name: 1, dtype: float64

In [147]:
col[np.abs(col)> 3]

66    -3.105967
198   -3.022743
499   -3.426417
500    3.017067
572    3.094989
831   -3.016281
884    3.313023
901   -3.253721
Name: 1, dtype: float64

In [149]:
np.sign(data)

Unnamed: 0,0,1,2,3
0,1.0,1.0,-1.0,1.0
1,1.0,-1.0,-1.0,1.0
2,-1.0,1.0,-1.0,1.0
3,-1.0,1.0,1.0,1.0
4,1.0,-1.0,1.0,-1.0
...,...,...,...,...
995,1.0,1.0,-1.0,-1.0
996,-1.0,1.0,-1.0,1.0
997,-1.0,1.0,1.0,-1.0
998,-1.0,1.0,-1.0,-1.0


In [150]:
data[(np.abs(data) > 3).any(1)]

Unnamed: 0,0,1,2,3
66,-0.70846,-3.105967,2.402684,-0.634572
198,-1.478342,-3.022743,-0.404787,0.380532
310,3.34165,-0.120281,-0.213631,0.473708
335,-0.309063,1.085752,3.542548,2.114587
393,1.100847,-1.041369,3.234954,-0.405145
499,-1.354936,-3.426417,1.225515,0.113692
500,2.28519,3.017067,0.066265,0.092379
572,0.202133,3.094989,1.010496,0.755624
580,3.157759,1.396914,1.315677,1.118378
594,-3.131381,-1.103891,0.720536,-0.252114


In [151]:
data[np.abs(data) >3] = np.sign(data)*3

In [152]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.02662,0.008076,-0.006683,0.038742
std,1.043432,1.029883,0.972762,0.980713
min,-3.0,-3.0,-2.915286,-3.0
25%,-0.712664,-0.661576,-0.735093,-0.628181
50%,0.018208,0.055208,-0.028285,0.042175
75%,0.776729,0.673019,0.66071,0.653283
max,3.0,3.0,3.0,2.912986


# String Manipulation 
Regular Expressions

1.regex


In [153]:
# Regular Expressions

In [None]:
import re

text = " " "Dave dave@google.com
      Steve steve@gmail.com
    Rob  rob@gmail.com
    Ryan ryan@yahoo.com"""
    
pattern = r'[A-ZO-9._+-]+@[A-Z0-9.-]+.[A-Z]{2,4} #text@text.pk
