In [2]:
# Handling Missing Data, Filling and Replacing Values, Removing Duplicates, Detecting and Removing Outliers. Decision Trees (ch 6) 


### Handling Missing Data 

In [3]:


# Build a small Series in which one entry (np.nan) is missing.

import pandas as pd
import numpy as np

words = ['aardvark', 'artichoke', np.nan, 'avocado']
string_data = pd.Series(words)
print(string_data)

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object


In [4]:
# .isnull() returns a boolean Series indicating which values are missing. 
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
# Python's None is also treated as a missing value by pandas:
# after this assignment, isnull() in the next cell reports True for index 0.
string_data[0]=None
print(string_data)

0         None
1    artichoke
2          NaN
3      avocado
dtype: object


In [6]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [7]:
# Filtering Out Missing Data
# NA is a short alias for numpy's nan; later cells reuse it to mark missing entries.
from numpy import nan as NA

raw_values = [1, NA, 3.5, NA, 7]
data = pd.Series(raw_values)
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [8]:
# .dropna() returns a new Series with the NaN values removed. 
print(data.dropna())

0    1.0
2    3.5
4    7.0
dtype: float64


In [9]:
# data[data.notnull()] achieves the same result by filtering out NaN values using boolean indexing.

print(data[data.notnull()])

0    1.0
2    3.5
4    7.0
dtype: float64


In [10]:
# dropna() on a DataFrame: with default arguments, every row that holds
# at least one NaN is removed.

rows = [
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
]
data = pd.DataFrame(rows)
cleaned = data.dropna()
print(data)
print("clean data")
print(cleaned)

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0
clean data
     0    1    2
0  1.0  6.5  3.0


In [11]:
# Drop rows with all NA values
print(data.dropna(how='all'))

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
3  NaN  6.5  3.0


In [12]:
# Drop columns with all NA values
# Add a new column labelled 4 that contains only NaN (this mutates `data`),
# then show that dropna(axis=1, how='all') removes just that all-NaN column.
data[4] = NA
print(data)

print("\ndrop the column with all NaN")
print(data.dropna(axis=1, how='all'))

     0    1    2   4
0  1.0  6.5  3.0 NaN
1  1.0  NaN  NaN NaN
2  NaN  NaN  NaN NaN
3  NaN  6.5  3.0 NaN

drop the column with all NaN
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0


In [13]:
# Using thresh Argument
# 7x3 frame of random draws; blank out the first four rows of column 1 and the
# first two rows of column 2 so that rows differ in how many values they hold.
random_values = np.random.randn(7, 3)
df = pd.DataFrame(random_values)
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
print(df)

          0         1         2
0 -0.343480       NaN       NaN
1  2.055216       NaN       NaN
2 -1.032309       NaN  1.094308
3 -0.289346       NaN  2.358526
4 -0.702229  1.932726 -0.380210
5 -0.287307 -0.651143  1.279905
6 -0.194770 -0.367476 -0.553625


In [14]:
# df.dropna() removes rows that contain any NaN values. 

print(df.dropna())

          0         1         2
4 -0.702229  1.932726 -0.380210
5 -0.287307 -0.651143  1.279905
6 -0.194770 -0.367476 -0.553625


In [15]:
# df.dropna(thresh=2) keeps rows with at least 2 non-NA values.
print(df.dropna(thresh=2))

          0         1         2
2 -1.032309       NaN  1.094308
3 -0.289346       NaN  2.358526
4 -0.702229  1.932726 -0.380210
5 -0.287307 -0.651143  1.279905
6 -0.194770 -0.367476 -0.553625


### Filling In Missing Data

In [16]:
df

Unnamed: 0,0,1,2
0,-0.34348,,
1,2.055216,,
2,-1.032309,,1.094308
3,-0.289346,,2.358526
4,-0.702229,1.932726,-0.38021
5,-0.287307,-0.651143,1.279905
6,-0.19477,-0.367476,-0.553625


In [17]:
# df.fillna(0) fills all NaN values in the DataFrame with 0.
print(df.fillna(0))

          0         1         2
0 -0.343480  0.000000  0.000000
1  2.055216  0.000000  0.000000
2 -1.032309  0.000000  1.094308
3 -0.289346  0.000000  2.358526
4 -0.702229  1.932726 -0.380210
5 -0.287307 -0.651143  1.279905
6 -0.194770 -0.367476 -0.553625


In [18]:
# Using fillna with Dictionary
# df.fillna({1: 0.5, 2: 0}) fills NaN values in column 1 with 0.5 and in column 2 with 0.
print(df.fillna({1: 0.5, 2: 0}))

          0         1         2
0 -0.343480  0.500000  0.000000
1  2.055216  0.500000  0.000000
2 -1.032309  0.500000  1.094308
3 -0.289346  0.500000  2.358526
4 -0.702229  1.932726 -0.380210
5 -0.287307 -0.651143  1.279905
6 -0.194770 -0.367476 -0.553625


In [19]:
# fillna returns a new object, but you can modify the existing object in-place
# With inplace=True the call mutates df itself, so df holds no NaN after this
# cell runs (the frame displayed below shows 0.0 where NaN used to be).
df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,-0.34348,0.0,0.0
1,2.055216,0.0,0.0
2,-1.032309,0.0,1.094308
3,-0.289346,0.0,2.358526
4,-0.702229,1.932726,-0.38021
5,-0.287307,-0.651143,1.279905
6,-0.19477,-0.367476,-0.553625


In [20]:
# The same interpolation methods available for reindexing can be used with fillna 
df = pd.DataFrame(np.random.randn(6, 3))
df

Unnamed: 0,0,1,2
0,0.168049,-0.557199,0.330249
1,0.302037,1.290678,1.157875
2,0.250214,-1.020142,1.568916
3,-1.070397,-1.128614,-0.079492
4,-1.528381,0.333021,-0.584249
5,-0.840627,-0.271966,-1.275743


In [21]:
# Blank out column 1 from row 2 onward and column 2 from row 4 onward,
# leaving runs of consecutive NaN values for the fill examples that follow.
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df

Unnamed: 0,0,1,2
0,0.168049,-0.557199,0.330249
1,0.302037,1.290678,1.157875
2,0.250214,,1.568916
3,-1.070397,,-0.079492
4,-1.528381,,
5,-0.840627,,


In [22]:
# Forward-fill: propagate the last valid value in each column down into the NaNs.
# DataFrame.ffill() is used instead of fillna(method="ffill") because the
# `method` argument of fillna is deprecated and emits a FutureWarning.
df.ffill()

  df.fillna(method="ffill")


Unnamed: 0,0,1,2
0,0.168049,-0.557199,0.330249
1,0.302037,1.290678,1.157875
2,0.250214,1.290678,1.568916
3,-1.070397,1.290678,-0.079492
4,-1.528381,1.290678,-0.079492
5,-0.840627,1.290678,-0.079492


In [23]:
df

Unnamed: 0,0,1,2
0,0.168049,-0.557199,0.330249
1,0.302037,1.290678,1.157875
2,0.250214,,1.568916
3,-1.070397,,-0.079492
4,-1.528381,,
5,-0.840627,,


In [24]:
# Forward-fill, but fill at most 2 consecutive NaNs per gap (limit=2);
# longer runs keep their remaining NaN values.
# DataFrame.ffill() is used instead of fillna(method="ffill", ...) because the
# `method` argument of fillna is deprecated and emits a FutureWarning.
df.ffill(limit=2)

  df.fillna(method="ffill", limit=2)


Unnamed: 0,0,1,2
0,0.168049,-0.557199,0.330249
1,0.302037,1.290678,1.157875
2,0.250214,1.290678,1.568916
3,-1.070397,1.290678,-0.079492
4,-1.528381,,-0.079492
5,-0.840627,,-0.079492


In [25]:
# you might pass the mean or median value of a Series 
data = pd.Series([1., NA, 3.5, NA, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [26]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

### Removing Duplicates

In [27]:
# Example frame for duplicate detection: the last two rows are identical ('two', 4).
k1_values = ['one', 'two'] * 3 + ['two']
k2_values = [1, 1, 2, 3, 3, 4, 4]
data = pd.DataFrame({'k1': k1_values, 'k2': k2_values})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [28]:
# duplicated returns a boolean Series indicating whether each row is a duplicate or not
# True means the row repeats an earlier row; False means it is the first occurrence
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [29]:
# drop_duplicates returns a DataFrame where the duplicated array is False 
data.drop_duplicates()    # this will drop row number 6

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [30]:
# Suppose we had an additional column of values and wanted to filter duplicates only based on the 'k1' column 
data["v1"] = range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [31]:
data.drop_duplicates(["k1"])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [32]:
# keep='last' retains the last occurrence of each ('k1', 'k2') pair,
# so the row with index label 5 is dropped and the row with index label 6 is kept
data.drop_duplicates(['k1', 'k2'], keep='last')


Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


### Replacing Values

In [33]:
data=pd.Series([1.,3.,-999.,1000.,-999.,54.])
data

0       1.0
1       3.0
2    -999.0
3    1000.0
4    -999.0
5      54.0
dtype: float64

In [34]:
# The -999 values might be sentinel values for missing data. 
# To replace these with NA values that pandas understands, 
# we can use replace, producing a new Series (unless you pass inplace=True)
data.replace(-999.,np.nan)      # replacing -999 with NaN

0       1.0
1       3.0
2       NaN
3    1000.0
4       NaN
5      54.0
dtype: float64

In [35]:
# If you want to replace multiple values at once, you instead pass a list and then the substitute value
data.replace([1000., -999.], np.nan)   # this line will replace the multiple values with NaN


0     1.0
1     3.0
2     NaN
3     NaN
4     NaN
5    54.0
dtype: float64

In [36]:
# To use a different replacement for each value, pass a list of substitutes:
data.replace([-999,1000], [np.nan, 0])

0     1.0
1     3.0
2     NaN
3     0.0
4     NaN
5    54.0
dtype: float64

In [37]:
# The argument passed can also be a dict:
data.replace({-999: np.nan, 1000: 0})

0     1.0
1     3.0
2     NaN
3     0.0
4     NaN
5    54.0
dtype: float64

### Detecting and Filtering Outliers


In [38]:
# 1000 rows x 4 columns of random normal draws, used as the outlier test bed.
normal_draws = np.random.randn(1000, 4)
data = pd.DataFrame(normal_draws)
data

Unnamed: 0,0,1,2,3
0,-2.607242,-1.215338,1.356988,-0.451157
1,0.327538,1.251832,1.330372,0.804183
2,1.209200,-0.733369,0.501998,0.632617
3,-1.217274,-0.339831,-2.419640,0.589677
4,0.328899,1.048651,-0.527503,-0.023827
...,...,...,...,...
995,-0.768041,-2.199615,-1.222373,1.280040
996,0.080966,1.971499,-0.039694,1.594289
997,-1.170059,-0.443974,1.380156,-0.224095
998,-0.074136,-1.829489,0.100132,0.622480


In [39]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.027903,0.003723,0.00557,-0.02325
std,0.957798,1.028893,1.031771,1.008258
min,-3.669972,-3.374789,-3.929567,-4.04157
25%,-0.677463,-0.663487,-0.639,-0.706454
50%,-0.006811,-0.001743,-0.021758,-0.024035
75%,0.719049,0.648365,0.663011,0.624554
max,3.189025,4.512684,3.115549,3.241653


In [40]:
data 

Unnamed: 0,0,1,2,3
0,-2.607242,-1.215338,1.356988,-0.451157
1,0.327538,1.251832,1.330372,0.804183
2,1.209200,-0.733369,0.501998,0.632617
3,-1.217274,-0.339831,-2.419640,0.589677
4,0.328899,1.048651,-0.527503,-0.023827
...,...,...,...,...
995,-0.768041,-2.199615,-1.222373,1.280040
996,0.080966,1.971499,-0.039694,1.594289
997,-1.170059,-0.443974,1.380156,-0.224095
998,-0.074136,-1.829489,0.100132,0.622480


In [41]:
data.shape

(1000, 4)

In [42]:
# IQR rule: a value is an outlier when it lies more than 1.5 * IQR below the
# first quartile or above the third quartile of its own column.
Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Row-level mask, computed once and reused below:
# True when any column of the row falls outside that column's bounds.
is_outlier_row = ((data < lower_bound) | (data > upper_bound)).any(axis=1)

# Identify outliers
outliers_iqr = data[is_outlier_row]

# Filter out outliers
filtered_data_iqr = data[~is_outlier_row]

print("Outliers using IQR:\n", outliers_iqr)



Outliers using IQR:
             0         1         2         3
13  -1.225516 -3.330298 -0.951519  0.089020
59  -0.031298 -3.374789 -0.308987 -0.032663
88   1.309795 -0.218754 -1.122428  3.241653
105 -0.973187 -2.919367  0.401643 -1.017772
108 -0.243975 -0.030809  2.860575 -0.157842
129 -0.673008 -0.155357  2.704569  1.238585
203  0.851486 -0.342885 -3.929567  0.849818
289 -0.570791  0.456939  2.798298  0.876936
296 -1.508316 -3.211147 -0.274318  0.571483
306 -1.062210  0.249381 -0.714377 -3.161370
309 -1.414144  0.153421 -3.115097  0.259151
330 -0.061456  2.628963 -0.383667 -0.550013
367  1.462831  1.415850 -0.420006  2.889854
394  1.497303  4.512684  3.049596 -1.590716
497  0.957707  3.855608 -0.375855  0.093715
522  0.040592  1.317513  2.707816  1.178794
530  0.078794  0.227282  0.118770  2.832549
540  3.189025 -1.624555  0.878531 -0.267063
544 -1.412478  0.166153  0.237691 -4.041570
545 -3.669972 -1.100703  1.064804 -0.154161
565  3.093189  0.801595  0.516619 -0.074643
591 -0.1326

In [43]:
print("\nfiltering outliers\n")
print("\nFiltered Data using IQR:\n", filtered_data_iqr.head())


filtering outliers


Filtered Data using IQR:
           0         1         2         3
0 -2.607242 -1.215338  1.356988 -0.451157
1  0.327538  1.251832  1.330372  0.804183
2  1.209200 -0.733369  0.501998  0.632617
3 -1.217274 -0.339831 -2.419640  0.589677
4  0.328899  1.048651 -0.527503 -0.023827


In [44]:
filtered_data_iqr.shape

(961, 4)

### Decision Tree

In [52]:
# Import the iris loader and the decision-tree classifier from scikit-learn
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris=load_iris()
# Bare expression: the notebook displays the whole loaded object, arrays included
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [53]:
# Keep only feature columns 2 onward as model inputs (the printed X has two columns);
# presumably petal length and petal width — confirm against iris.feature_names.
X = iris.data[:,2:]
# Target values passed to fit() as the labels to predict
y = iris.target

print(X)
print(y)

[[1.4 0.2]
 [1.4 0.2]
 [1.3 0.2]
 [1.5 0.2]
 [1.4 0.2]
 [1.7 0.4]
 [1.4 0.3]
 [1.5 0.2]
 [1.4 0.2]
 [1.5 0.1]
 [1.5 0.2]
 [1.6 0.2]
 [1.4 0.1]
 [1.1 0.1]
 [1.2 0.2]
 [1.5 0.4]
 [1.3 0.4]
 [1.4 0.3]
 [1.7 0.3]
 [1.5 0.3]
 [1.7 0.2]
 [1.5 0.4]
 [1.  0.2]
 [1.7 0.5]
 [1.9 0.2]
 [1.6 0.2]
 [1.6 0.4]
 [1.5 0.2]
 [1.4 0.2]
 [1.6 0.2]
 [1.6 0.2]
 [1.5 0.4]
 [1.5 0.1]
 [1.4 0.2]
 [1.5 0.2]
 [1.2 0.2]
 [1.3 0.2]
 [1.4 0.1]
 [1.3 0.2]
 [1.5 0.2]
 [1.3 0.3]
 [1.3 0.3]
 [1.3 0.2]
 [1.6 0.6]
 [1.9 0.4]
 [1.4 0.3]
 [1.6 0.2]
 [1.4 0.2]
 [1.5 0.2]
 [1.4 0.2]
 [4.7 1.4]
 [4.5 1.5]
 [4.9 1.5]
 [4.  1.3]
 [4.6 1.5]
 [4.5 1.3]
 [4.7 1.6]
 [3.3 1. ]
 [4.6 1.3]
 [3.9 1.4]
 [3.5 1. ]
 [4.2 1.5]
 [4.  1. ]
 [4.7 1.4]
 [3.6 1.3]
 [4.4 1.4]
 [4.5 1.5]
 [4.1 1. ]
 [4.5 1.5]
 [3.9 1.1]
 [4.8 1.8]
 [4.  1.3]
 [4.9 1.5]
 [4.7 1.2]
 [4.3 1.3]
 [4.4 1.4]
 [4.8 1.4]
 [5.  1.7]
 [4.5 1.5]
 [3.5 1. ]
 [3.8 1.1]
 [3.7 1. ]
 [3.9 1.2]
 [5.1 1.6]
 [4.5 1.5]
 [4.5 1.6]
 [4.7 1.5]
 [4.4 1.3]
 [4.1 1.3]
 [4.  1.3]
 [4.4 1.2]

In [54]:
# Shallow decision tree (max_depth=2) fitted on the two selected feature columns.
# random_state is pinned because scikit-learn randomly permutes the features when
# searching for a split, so equally good candidate splits could otherwise be
# resolved differently from one run to the next.
D_tree = DecisionTreeClassifier(max_depth=2, random_state=42)
D_tree.fit(X,y)

In [60]:
D_tree.predict_proba([[1.4, 0.2]])

array([[1., 0., 0.]])

In [61]:
D_tree.predict([[1.4, 0.2]])

array([0])