## Check and Resolve Missing Data - Univariate Imputation

In [1]:
import os

import pandas as pd
from sklearn.impute import SimpleImputer

In [2]:
# Read the raw train/test metadata tables in one pass.
csv_train_file, csv_test_file = (
    pd.read_csv(csv_path)
    for csv_path in (
        './siim-isic-melanoma-classification/train.csv',
        './siim-isic-melanoma-classification/test.csv',
    )
)

# Human-readable labels stored as plain attributes on the DataFrame objects.
# pandas does not carry ad-hoc attributes over to new frames returned by
# operations such as drop/concat, so they only live on these exact objects.
csv_train_file.name, csv_test_file.name = 'Training Set', 'Test Set'

In [3]:
# Summarise the raw training frame: per-column dtype and non-null count.
# DataFrame.info() writes the report itself and returns None, which is why
# wrapping it in print() leaves a trailing "None" line in the output.
print(csv_train_file.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33126 entries, 0 to 33125
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   image_name                     33126 non-null  object 
 1   patient_id                     33126 non-null  object 
 2   sex                            33061 non-null  object 
 3   age_approx                     33058 non-null  float64
 4   anatom_site_general_challenge  32599 non-null  object 
 5   diagnosis                      33126 non-null  object 
 6   benign_malignant               33126 non-null  object 
 7   target                         33126 non-null  int64  
dtypes: float64(1), int64(1), object(6)
memory usage: 2.0+ MB
None


In [4]:
# Summarise the raw test frame: per-column dtype and non-null count.
# DataFrame.info() writes the report itself and returns None, which is why
# wrapping it in print() leaves a trailing "None" line in the output.
print(csv_test_file.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10982 entries, 0 to 10981
Data columns (total 5 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   image_name                     10982 non-null  object 
 1   patient_id                     10982 non-null  object 
 2   sex                            10982 non-null  object 
 3   age_approx                     10982 non-null  float64
 4   anatom_site_general_challenge  10631 non-null  object 
dtypes: float64(1), object(4)
memory usage: 429.1+ KB
None


### Training Data

In [5]:
# Columns of the training set that contain missing values, grouped by the
# kind of imputation they will receive.
numerical_cols = ['age_approx']
categorical_cols = ['sex', 'anatom_site_general_challenge']
all_cols = [*categorical_cols, *numerical_cols]

# Independent working copies so the imputers never touch the loaded frame.
cat_cols = csv_train_file.loc[:, categorical_cols].copy()
num_cols = csv_train_file.loc[:, numerical_cols].copy()

In [6]:
# Imputation of categorical data
# Every missing entry in the categorical columns is replaced with the literal
# string 'none'.
imputer_cat_train = SimpleImputer(strategy='constant', fill_value='none')
# By default fit_transform hands back an array without the pandas labels.
# Rebuild the frame with the source index and column names so the later
# column-wise concat aligns row-for-row even if the training frame is not
# indexed 0..n-1.
imputed_cat_train = pd.DataFrame(
    imputer_cat_train.fit_transform(cat_cols),
    columns=cat_cols.columns,
    index=cat_cols.index,
)

In [7]:
# Imputation of numerical data
# Every missing entry in the numerical columns is replaced with the constant 0.
# NOTE(review): 0 acts as a sentinel age here - confirm downstream consumers
# treat it as "missing" rather than as a genuine age value.
imputer_num_train = SimpleImputer(strategy='constant', fill_value=0)
# By default fit_transform hands back an array without the pandas labels.
# Rebuild the frame with the source index and column names so the later
# column-wise concat aligns row-for-row even if the training frame is not
# indexed 0..n-1.
imputed_num_train = pd.DataFrame(
    imputer_num_train.fit_transform(num_cols),
    columns=num_cols.columns,
    index=num_cols.index,
)

In [8]:
# Swap the original (incomplete) columns for their imputed counterparts.
# The imputed columns are appended on the right, so column order changes.
csv_train_file = pd.concat(
    [
        csv_train_file.drop(columns=all_cols),
        imputed_cat_train,
        imputed_num_train,
    ],
    axis='columns',
)

In [9]:
# Re-inspect the training frame after imputation; every column should now
# report the full row count as non-null.
# DataFrame.info() writes the report itself and returns None, which is why
# wrapping it in print() leaves a trailing "None" line in the output.
print(csv_train_file.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33126 entries, 0 to 33125
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   image_name                     33126 non-null  object 
 1   patient_id                     33126 non-null  object 
 2   diagnosis                      33126 non-null  object 
 3   benign_malignant               33126 non-null  object 
 4   target                         33126 non-null  int64  
 5   sex                            33126 non-null  object 
 6   anatom_site_general_challenge  33126 non-null  object 
 7   age_approx                     33126 non-null  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 2.0+ MB
None


### Test Data

In [10]:
# In the test set only the anatomical-site column has missing values
# (sex and age_approx are fully populated according to the info() report).
categorical_cols = ['anatom_site_general_challenge']

cat_cols = csv_test_file[categorical_cols].copy()

# Imputation of categorical data
imputer_cat_test = SimpleImputer(strategy='constant', fill_value='none')
# Keep the source index and column names on the imputed frame so the
# column-wise concat below aligns row-for-row.
imputed_cat_test = pd.DataFrame(
    imputer_cat_test.fit_transform(cat_cols),
    columns=cat_cols.columns,
    index=cat_cols.index,
)

# Drop by column labels. Passing the `cat_cols` DataFrame itself as the labels
# argument only works because iterating a DataFrame happens to yield its
# column names.
csv_test_file = csv_test_file.drop(columns=categorical_cols)
csv_test_file = pd.concat([csv_test_file, imputed_cat_test], axis=1)

print(csv_test_file.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10982 entries, 0 to 10981
Data columns (total 5 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   image_name                     10982 non-null  object 
 1   patient_id                     10982 non-null  object 
 2   sex                            10982 non-null  object 
 3   age_approx                     10982 non-null  float64
 4   anatom_site_general_challenge  10982 non-null  object 
dtypes: float64(1), object(4)
memory usage: 429.1+ KB
None


### Check Again for Missing Data

In [11]:
# Both frames were rebuilt by drop/concat above, so attach the display labels
# to the current objects before reporting on them.
csv_train_file.name, csv_test_file.name = 'Training Set', 'Test Set'

# Frames to report on, in display order.
dfs = [csv_train_file, csv_test_file]

def display_missing(df):
    """Print the number of null entries in each column of ``df``.

    One line is printed per column, in column order, followed by a blank
    separator. Nothing is returned.
    """
    for column_name in df.columns.tolist():
        null_count = df[column_name].isnull().sum()
        print(f'{column_name} column missing values: {null_count}')
    print('\n')
    
# Report the per-column null counts for each labelled frame in turn.
for df in dfs:
    print(f'{df.name}')
    display_missing(df)

Training Set
image_name column missing values: 0
patient_id column missing values: 0
diagnosis column missing values: 0
benign_malignant column missing values: 0
target column missing values: 0
sex column missing values: 0
anatom_site_general_challenge column missing values: 0
age_approx column missing values: 0


Test Set
image_name column missing values: 0
patient_id column missing values: 0
sex column missing values: 0
age_approx column missing values: 0
anatom_site_general_challenge column missing values: 0




In [12]:
# Save the files
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (a fresh checkout will not have it).
os.makedirs('./cleaned_csvs', exist_ok=True)
csv_train_file.to_csv('./cleaned_csvs/train_clean_nul.csv', index=False)
csv_test_file.to_csv('./cleaned_csvs/test_clean_nul.csv', index=False)