# Handling Duplicates in a Breast Cancer Dataset

In [1]:
# import packages
import pandas as pd

In [2]:
# url path
url_path = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter11/dataset/breast-cancer-wisconsin.data'

In [3]:
# load the data
df = pd.read_csv(url_path, header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [4]:
# Create a variable called col_names that contains the names of the columns
col_names = ['Sample code number','Clump Thickness','Uniformity of Cell Size',\
             'Uniformity of Cell Shape','Marginal Adhesion','Single Epithelial Cell Size',\
             'Bare Nuclei','Bland Chromatin','Normal Nucleoli','Mitoses','Class'] 

In [5]:
# Assign the column names of the DataFrame
df.columns = col_names

In [6]:
# Display the shape of the DataFrame
df.shape

(699, 11)

In [7]:
df.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [8]:
# Find the number of duplicate rows
df.duplicated().sum()

8

In [9]:
# Display the duplicate rows 
df.loc[df.duplicated()]

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
208,1218860,1,1,1,1,1,1,3,1,1,2
253,1100524,6,10,10,2,8,10,7,3,3,4
254,1116116,9,10,10,1,10,8,3,3,1,4
258,1198641,3,1,1,1,2,1,3,1,1,2
272,320675,3,3,5,2,3,10,7,1,1,4
338,704097,1,1,1,1,1,1,2,1,1,2
561,1321942,5,1,1,1,2,1,3,1,1,2
684,466906,1,1,1,1,2,1,1,1,1,2


In [10]:
# Display the duplicate rows, but with the keep='last' parameter
df.loc[df.duplicated(keep='last')]

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
42,1100524,6,10,10,2,8,10,7,3,3,4
62,1116116,9,10,10,1,10,8,3,3,1,4
168,1198641,3,1,1,1,2,1,3,1,1,2
207,1218860,1,1,1,1,1,1,3,1,1,2
267,320675,3,3,5,2,3,10,7,1,1,4
314,704097,1,1,1,1,1,1,2,1,1,2
560,1321942,5,1,1,1,2,1,3,1,1,2
683,466906,1,1,1,1,2,1,1,1,1,2


In [11]:
# Remove the duplicate rows 
df_unique = df.drop_duplicates(keep='first')

In [12]:
# Display the shape of df_unique 
df_unique.shape

(691, 11)

# Converting Data Types for the Ames Housing Dataset

In [13]:
# url path
url_path = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter10/dataset/ames_iowa_housing.csv'

In [14]:
df = pd.read_csv(url_path)
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [15]:
# Print the data type of each column
df.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object

In [16]:
# convert the 'Id' column into a categorical variable
df['Id'] = df['Id'].astype('category')

In [17]:
# Convert the 'MSSubClass', 'OverallQual', and 'OverallCond' columns into categorical variables
df['MSSubClass'] = df['MSSubClass'].astype('category')
df['OverallQual'] = df['OverallQual'].astype('category')
df['OverallCond'] = df['OverallCond'].astype('category')

In [18]:
# Create a for loop that will iterate through the four categorical columns and print their names and categories
for col_name in ['Id', 'MSSubClass', 'OverallQual', 'OverallCond']:
    print(col_name)
    print(f'{df[col_name].cat.categories}\n')

Id
Int64Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
            ...
            1451, 1452, 1453, 1454, 1455, 1456, 1457, 1458, 1459, 1460],
           dtype='int64', length=1460)

MSSubClass
Int64Index([20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 160, 180, 190], dtype='int64')

OverallQual
Int64Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='int64')

OverallCond
Int64Index([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')



In [19]:
# Create a new DataFrame that will only contain variables of the object type
obj_df = df.select_dtypes(include='object')
obj_df.head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [20]:
# Create a new variable that contains a list of column names from the obj_df DataFrame
obj_cols = obj_df.columns
obj_cols

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')

In [21]:
# create a for loop that will iterate through the column names in obj_cols and print their names and unique values
for col_name in obj_cols:
    print(col_name)
    print(f'{obj_df[col_name].unique()}\n')

MSZoning
['RL' 'RM' 'C (all)' 'FV' 'RH']

Street
['Pave' 'Grvl']

Alley
[nan 'Grvl' 'Pave']

LotShape
['Reg' 'IR1' 'IR2' 'IR3']

LandContour
['Lvl' 'Bnk' 'Low' 'HLS']

Utilities
['AllPub' 'NoSeWa']

LotConfig
['Inside' 'FR2' 'Corner' 'CulDSac' 'FR3']

LandSlope
['Gtl' 'Mod' 'Sev']

Neighborhood
['CollgCr' 'Veenker' 'Crawfor' 'NoRidge' 'Mitchel' 'Somerst' 'NWAmes'
 'OldTown' 'BrkSide' 'Sawyer' 'NridgHt' 'NAmes' 'SawyerW' 'IDOTRR'
 'MeadowV' 'Edwards' 'Timber' 'Gilbert' 'StoneBr' 'ClearCr' 'NPkVill'
 'Blmngtn' 'BrDale' 'SWISU' 'Blueste']

Condition1
['Norm' 'Feedr' 'PosN' 'Artery' 'RRAe' 'RRNn' 'RRAn' 'PosA' 'RRNe']

Condition2
['Norm' 'Artery' 'RRNn' 'Feedr' 'PosN' 'PosA' 'RRAn' 'RRAe']

BldgType
['1Fam' '2fmCon' 'Duplex' 'TwnhsE' 'Twnhs']

HouseStyle
['2Story' '1Story' '1.5Fin' '1.5Unf' 'SFoyer' 'SLvl' '2.5Unf' '2.5Fin']

RoofStyle
['Gable' 'Hip' 'Gambrel' 'Mansard' 'Flat' 'Shed']

RoofMatl
['CompShg' 'WdShngl' 'Metal' 'WdShake' 'Membran' 'Tar&Grv' 'Roll'
 'ClyTile']

Exterior1st
['Vin

All these columns have a finite number of unique values that are composed of text, which shows us that they are categorical variables.

In [22]:
# create a for loop that will iterate through the column names in obj_cols and convert each of them into categorical
for col_name in obj_cols:
    df[col_name] = df[col_name].astype('category')

In [23]:
# Print the data type
df.dtypes

Id               category
MSSubClass       category
MSZoning         category
LotFrontage       float64
LotArea             int64
                   ...   
MoSold              int64
YrSold              int64
SaleType         category
SaleCondition    category
SalePrice           int64
Length: 81, dtype: object

# Fixing Incorrect Values in the State Column

In [24]:
# url_path
url_path = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter11/dataset/officers.csv'

In [25]:
# load the data
df = pd.read_csv(url_path)
df.head()

Unnamed: 0,ID,City,State,Zip,Title,RedactionRequested
0,804,Glenview,IL,60025,Treasurer,False
1,9177,Harrisburg,IL,62946,Treasurer,False
2,53011,Chicago,IL,60606,Treasurer,False
3,9176,Harrisburg,IL,62946,Chairman,False
4,33020,Mechanicsburg,IL,62545,Chairman,False


In [26]:
# Print out all the unique values of the State variable
df['State'].unique()

array(['IL', 'PA', 'DC', 'Il', nan, 'WI', 'CA', 'MO', 'NC', 'IA', 'MA',
       'IN', 'MI', 'TN', 'NY', 'ng', 'TX', 'CO', 'NV', 'il', 'WA', '8I',
       'In', 'iL', 'OH', 'SC', 'VA', 'NM', 'FL', 'LA', 'GA', 'II', 'NJ',
       'MD', 'I', 'AR', 'KS', 'DE', '60', 'SD', 'MN', 'VT', 'OK', 'KY',
       'CT', 'NH', 'AZ', 'OR', 'PR', 'RI'], dtype=object)

All the states have been encoded into a two-capitalized character format. As you can see, there are some incorrect values with non-capitalized characters, such as il and iL (they look like spelling errors for Illinois), and unexpected values such as 8I, I, and 60

In [27]:
# Print out the rows that have the il value in the State column
df.loc[df['State'].str.contains('il', na=False)]

Unnamed: 0,ID,City,State,Zip,Title,RedactionRequested
4245,47448,Chicago,il,60619,Treasurer,False
4651,47447,Chicago,il,60623-1614,Chairman,False
4652,54025,Chicago,il,60623-1614,Chairman,False
18939,39418,Kingston,il,60145,Chairman,False
29699,27124,Hampshire,il,60140,Chairman,False
43761,29179,McHenry,il,60050,Admin Asst,False


In [28]:
# create a for loop that will iterate through the following values in the State column: Il, iL, Il
for state in ['Il', 'iL', 'Il']:
    print(df.loc[df['State'] == state, ['City', 'State']])

            City State
43        Ottawa    Il
44        Ottawa    Il
493    Galesburg    Il
613      Chicago    Il
614      Chicago    Il
...          ...   ...
54915    Chicago    Il
54916    Chicago    Il
54918    Chicago    Il
54919    Chicago    Il
54921    Chicago    Il

[665 rows x 2 columns]
         City State
7052  Wheaton    iL
            City State
43        Ottawa    Il
44        Ottawa    Il
493    Galesburg    Il
613      Chicago    Il
614      Chicago    Il
...          ...   ...
54915    Chicago    Il
54916    Chicago    Il
54918    Chicago    Il
54919    Chicago    Il
54921    Chicago    Il

[665 rows x 2 columns]


In [29]:
# Create a condition mask to subset all the rows that contain the four incorrect values (il, Il, iL, and Il)
il_mask = df['State'].isin(['il', 'Il', 'iL', 'Il'])

In [30]:
# Print the number of rows that match the condition we set in il_mask
il_mask.sum()

672

In [31]:
# subset the rows with the il_mask condition mask and replace the value of the State column with IL
df.loc[il_mask, 'State'] = 'IL'

In [32]:
# Print out all the unique values of the State variable
df['State'].unique()

array(['IL', 'PA', 'DC', nan, 'WI', 'CA', 'MO', 'NC', 'IA', 'MA', 'IN',
       'MI', 'TN', 'NY', 'ng', 'TX', 'CO', 'NV', 'WA', '8I', 'In', 'OH',
       'SC', 'VA', 'NM', 'FL', 'LA', 'GA', 'II', 'NJ', 'MD', 'I', 'AR',
       'KS', 'DE', '60', 'SD', 'MN', 'VT', 'OK', 'KY', 'CT', 'NH', 'AZ',
       'OR', 'PR', 'RI'], dtype=object)

In [33]:
# create a for loop that iterates through the three incorrect values (I, 8I, and 60) and print out the subsetted rows
for val in ['I', '8I', '60']:
    print(df.loc[df['State'] == val, ['City', 'State']])

              City State
17596  Bloomington     I
             City State
5513  Springfield    8I
          City State
28060  Chicago    60


In [34]:
# Create a for loop that iterates through the four incorrect values (II, I, 8I, and 60) and replace the value in State with IL
for val in ['II', 'I', '8I', '60']:
    df.loc[df['State'] == val, 'State'] = 'IL'

In [35]:
df['State'].unique()

array(['IL', 'PA', 'DC', nan, 'WI', 'CA', 'MO', 'NC', 'IA', 'MA', 'IN',
       'MI', 'TN', 'NY', 'ng', 'TX', 'CO', 'NV', 'WA', 'In', 'OH', 'SC',
       'VA', 'NM', 'FL', 'LA', 'GA', 'NJ', 'MD', 'AR', 'KS', 'DE', 'SD',
       'MN', 'VT', 'OK', 'KY', 'CT', 'NH', 'AZ', 'OR', 'PR', 'RI'],
      dtype=object)

In [36]:
# Repeat but iterate through the In and ng values instead
for val in ['In', 'ng']:
    print(df.loc[df['State'] == val, ['City', 'State']])

           City State
5733  Sherville    In
            City State
2428  none given    ng
2961  none given    ng


In [37]:
# Subset the rows containing the In value in State and replace the state value with IN
df.loc[df['State'].str.contains('In', na=False), 'State'] = 'IN'

In [38]:
df['State'].unique()

array(['IL', 'PA', 'DC', nan, 'WI', 'CA', 'MO', 'NC', 'IA', 'MA', 'IN',
       'MI', 'TN', 'NY', 'ng', 'TX', 'CO', 'NV', 'WA', 'OH', 'SC', 'VA',
       'NM', 'FL', 'LA', 'GA', 'NJ', 'MD', 'AR', 'KS', 'DE', 'SD', 'MN',
       'VT', 'OK', 'KY', 'CT', 'NH', 'AZ', 'OR', 'PR', 'RI'], dtype=object)

# Fixing Missing Values for the Horse Colic Dataset

In [39]:
# url path
url_path = 'http://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter11/dataset/horse-colic.data'

In [40]:
# load the data
df = pd.read_csv(url_path, header=None, sep='\s+', prefix='X')
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X18,X19,X20,X21,X22,X23,X24,X25,X26,X27
0,2,1,530101,38.5,66,28,3,3,?,2,...,45.0,8.4,?,?,2,2,11300,0,0,2
1,1,1,534817,39.2,88,20,?,?,4,1,...,50.0,85.0,2,2,3,2,2208,0,0,2
2,2,1,530334,38.3,40,24,1,1,3,1,...,33.0,6.7,?,?,1,2,0,0,0,1
3,1,9,5290409,39.1,164,84,4,1,6,2,...,48.0,7.2,3,5.30,2,1,2208,0,0,1
4,2,1,530255,37.3,104,35,?,?,6,2,...,74.0,7.4,?,?,2,2,4300,0,0,2


The authors have used the ? character for missing values, but the pandas package thinks that this is a normal value.

In [41]:
# Reload the dataset, but this time, add the na_values='?' parameter
df = pd.read_csv(url_path, header=None, sep='\s+', prefix='X', na_values='?')
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X18,X19,X20,X21,X22,X23,X24,X25,X26,X27
0,2.0,1,530101,38.5,66.0,28.0,3.0,3.0,,2.0,...,45.0,8.4,,,2.0,2,11300,0,0,2
1,1.0,1,534817,39.2,88.0,20.0,,,4.0,1.0,...,50.0,85.0,2.0,2.0,3.0,2,2208,0,0,2
2,2.0,1,530334,38.3,40.0,24.0,1.0,1.0,3.0,1.0,...,33.0,6.7,,,1.0,2,0,0,0,1
3,1.0,9,5290409,39.1,164.0,84.0,4.0,1.0,6.0,2.0,...,48.0,7.2,3.0,5.3,2.0,1,2208,0,0,1
4,2.0,1,530255,37.3,104.0,35.0,,,6.0,2.0,...,74.0,7.4,,,2.0,2,4300,0,0,2


In [42]:
# Print the data type of each column
df.dtypes

X0     float64
X1       int64
X2       int64
X3     float64
X4     float64
X5     float64
X6     float64
X7     float64
X8     float64
X9     float64
X10    float64
X11    float64
X12    float64
X13    float64
X14    float64
X15    float64
X16    float64
X17    float64
X18    float64
X19    float64
X20    float64
X21    float64
X22    float64
X23      int64
X24      int64
X25      int64
X26      int64
X27      int64
dtype: object

In [43]:
# Print the number of missing values for each column
df.isna().sum()

X0       1
X1       0
X2       0
X3      60
X4      24
X5      58
X6      56
X7      69
X8      47
X9      32
X10     55
X11     44
X12     56
X13    104
X14    106
X15    247
X16    102
X17    118
X18     29
X19     33
X20    165
X21    198
X22      1
X23      0
X24      0
X25      0
X26      0
X27      0
dtype: int64

In [44]:
# Create a condition mask called x0_mask so that you can find the missing values in the X0 column
x0_mask = df['X0'].isna()

In [45]:
# Display the number of missing values for this column
x0_mask.sum()

1

In [46]:
# Extract the median of X0 
x0_median = df['X0'].median()
x0_median

1.0

We will replace all the missing values with this value in the X0 column.

In [47]:
df['X0'].fillna(x0_median, inplace=True)

In [48]:
# Create a for loop that will iterate through all the columns of the DataFrame
# In the for loop, calculate the median for each and save them into a variable called col_median.
# Then, impute missing values with this median value

for col_name in df.columns:
    median = df[col_name].median()
    df[col_name].fillna(median, inplace=True)

In [49]:
# Print the number of missing values for each column
df.isna().sum()

X0     0
X1     0
X2     0
X3     0
X4     0
X5     0
X6     0
X7     0
X8     0
X9     0
X10    0
X11    0
X12    0
X13    0
X14    0
X15    0
X16    0
X17    0
X18    0
X19    0
X20    0
X21    0
X22    0
X23    0
X24    0
X25    0
X26    0
X27    0
dtype: int64