# Detecting and Filtring Outliers

In [20]:
import numpy as np
import pandas as pd
data = pd.DataFrame(np.random.randn(1000,4))
data

Unnamed: 0,0,1,2,3
0,-0.093739,0.974169,-1.281681,0.053438
1,-0.455338,-0.138183,-2.755917,-1.261144
2,-0.785848,0.227128,-1.733175,-1.036422
3,-0.835041,-0.229679,0.060045,1.785553
4,-0.174291,1.125869,0.060332,1.394722
5,0.688984,1.834316,-0.615416,1.411104
6,-0.613603,-1.077562,0.200132,-0.106402
7,1.132071,0.612086,-0.226160,-1.270938
8,1.137712,1.196686,-1.176821,-0.667388
9,-1.816634,-0.738539,0.473185,0.913239


In [21]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.064162,-0.021467,-0.048984,-0.06003
std,1.021014,1.021416,1.008645,1.037858
min,-3.154317,-2.981095,-3.437337,-3.47839
25%,-0.625036,-0.692572,-0.723909,-0.729096
50%,0.080384,-0.031425,-0.022031,-0.077059
75%,0.744082,0.714321,0.663552,0.630288
max,3.587284,3.404492,2.746814,3.768711


In [22]:
col = data[2]
col[np.abs(col) > 3]

390   -3.023819
415   -3.065116
804   -3.437337
Name: 2, dtype: float64

In [26]:
data[(np.abs(data) > 3).any(1)]

Unnamed: 0,0,1,2,3
86,0.199201,0.111445,-0.815087,-3.01163
237,0.735792,0.042585,0.491915,3.235168
364,3.316262,0.556505,0.134485,-1.558876
390,0.347322,0.14241,-3.023819,-0.205387
415,0.166619,1.990423,-3.065116,0.812612
430,3.160631,-1.767062,-1.674482,-1.202358
538,3.352829,0.683239,-0.301908,-1.534508
558,0.654702,-0.242483,0.184242,3.247403
655,-3.154317,-1.18884,-0.492331,-0.874631
804,0.377463,0.571001,-3.437337,-0.480391


In [15]:
np.sign(data) * 3

Unnamed: 0,0,1,2,3
0,3.0,-3.0,-3.0,-3.0
1,-3.0,3.0,3.0,3.0
2,-3.0,-3.0,-3.0,3.0
3,3.0,3.0,3.0,-3.0
4,-3.0,-3.0,-3.0,3.0
5,3.0,-3.0,3.0,3.0
6,3.0,3.0,-3.0,-3.0
7,-3.0,-3.0,-3.0,-3.0
8,-3.0,-3.0,3.0,3.0
9,-3.0,3.0,3.0,3.0


In [16]:
data [np.abs(data) > 3] = np.sign(data) * 3 #any value thats greater than 3 will be replaced by 3

In [17]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.034615,-0.019452,0.031773,-0.015347
std,1.013193,0.996488,1.026993,0.967375
min,-3.0,-3.0,-3.0,-2.724123
25%,-0.643153,-0.695519,-0.663419,-0.652548
50%,0.03969,-0.053244,0.042847,-0.022883
75%,0.738862,0.662544,0.717964,0.625691
max,3.0,3.0,2.925055,3.0


In [27]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,-1.0,1.0,-1.0,1.0
1,-1.0,-1.0,-1.0,-1.0
2,-1.0,1.0,-1.0,-1.0
3,-1.0,-1.0,1.0,1.0
4,-1.0,1.0,1.0,1.0


# Permutation and Random Sampling

In [29]:
df = pd.DataFrame(np.arange(5*4).reshape((5,4)))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [44]:
sampler = np.random.permutation(5)
sampler

array([1, 3, 2, 0, 4])

In [46]:
df.take(sampler) #shuffles the rows of df

Unnamed: 0,0,1,2,3
1,4,5,6,7
3,12,13,14,15
2,8,9,10,11
0,0,1,2,3
4,16,17,18,19


In [79]:
df.sample(n=3) # returns 3 random samples from df

Unnamed: 0,0,1,2,3
3,12,13,14,15
2,8,9,10,11
0,0,1,2,3


# Computing Indicator / Dummy Variables

In [80]:
df = pd.DataFrame({'key' : ['b', 'b', 'a', 'c', 'a', 'b'], 'data1' : range(6)})

df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [86]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [87]:
dummies = pd.get_dummies(df['key'], prefix = 'key')
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [88]:
df_with_dummie = df[['data1']].join(dummies)
df_with_dummie

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


# String Manuplation

In [96]:
val = "a,b, guide"
val

'a,b, guide'

In [90]:
val.split(',') # split the dat and place , b/w 

['a', 'b', ' guide']

In [91]:
pieces = [x.strip() for x in val.split(',')] # removing sapce b/w ''
pieces

['a', 'b', 'guide']

In [92]:
first , second , third = pieces 
first + '::' + second + '::' + third

'a::b::guide'

In [93]:
val.replace(',' , '::')

'a::b:: guide'

# Locating Substring

In [100]:
val = "a,b, guide"
'guide' in val

True

In [101]:
'f' in val

False

In [102]:
print(val)

a,b, guide


In [106]:
val.index(',')

1

In [110]:
val.index('c')

ValueError: substring not found

In [111]:
val.find('c') #if val is not found returns -1 instead of throwing exception

-1

In [112]:
val.find('b') #returns index

2

# Regular Expressions (RE)

In [115]:
import re
text = 'foo bar\t baz \tqux' # \t = space
re.split('\s+', text) #split the text based on spaces, + indicates multiple spaces to be treated as single space

['foo', 'bar', 'baz', 'qux']

In [117]:
regex = re.compile('\s+') #saving the RE pattern in a variable
regex.split(text) #split the text according to regex

['foo', 'bar', 'baz', 'qux']

In [118]:
regex.findall(text)

[' ', '\t ', ' \t']

In [133]:
info = "My Name is Mehmood and my CNIC number is 42201-5000601-5"
pattern = r'([0-9]{5}-[-0-9]{7}-[0-9]{1})'
CNIC = re.compile(pattern, flags =re.IGNORECASE)
CNIC.findall(info)

['42201-5000601-5']