# Import libraries

In [17]:
# Import the libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Read and preprocess the dataset

In [27]:
# Load the train/test case-description tables for the calc and mass sets.
train_calc, test_calc, train_mass, test_mass = map(
    pd.read_csv,
    (
        'calc_case_description_train_set.csv',  # (1546, 14)
        'calc_case_description_test_set.csv',   # (326, 14)
        'mass_case_description_train_set.csv',  # (1318, 14)
        'mass_case_description_test_set.csv',   # (378, 14)
    ),
)

In [28]:
# Keep a handle on each raw table, keyed by the name of the variable it was
# loaded into.
original = dict(
    train_calc=train_calc,
    test_calc=test_calc,
    train_mass=train_mass,
    test_mass=test_mass,
)

In [29]:
# Stack each train split on top of its test split so both are preprocessed
# together. The test columns are relabelled positionally with the train
# column names (the combined frame always carries the train labels), and
# ignore_index gives the result a fresh 0..n-1 row index.
# pd.concat keeps the column dtypes instead of rebuilding every cell from
# Python lists.
calc = pd.concat(
    [train_calc, test_calc.set_axis(train_calc.columns, axis=1)],
    ignore_index=True,
)
mass = pd.concat(
    [train_mass, test_mass.set_axis(train_mass.columns, axis=1)],
    ignore_index=True,
)

In [6]:
def preprocess(data):
    """Clean and numerically encode a calc or mass case-description table.

    The input frame is left untouched; all work happens on a copy.

    Steps, in order:

    1. Clip the numeric columns: breast density to [1, 4], abnormality id
       to >= 0, assessment to [0, 5], subtlety to [1, 5].
    2. Replace the row index with
       ``<patient_id>_<image view>_<left or right breast>_<abnormality id>``.
    3. Drop every column whose name contains ``'file path'`` plus
       ``'image view'``, ``'patient_id'``, ``'left or right breast'`` and
       ``'abnormality type'``.
    4. Map ``pathology`` to 0 (BENIGN_WITHOUT_CALLBACK), 0.5 (BENIGN) or
       1 (MALIGNANT). Any other value becomes NaN.
    5. Fill missing descriptor values with the string ``'None'`` and replace
       each descriptor column with integer codes numbered in sorted order of
       its distinct values. The descriptor columns are
       ``'calc type'``/``'calc distribution'`` when both are present,
       otherwise ``'mass shape'``/``'mass margins'``.
    6. Rename ``'abnormality id'`` to ``'number of abnormalities'``,
       ``'assessment'`` to ``'overall BI-RADS assessment'`` and, if present,
       ``'breast_density'`` to ``'breast density'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw case-description table.

    Returns
    -------
    pandas.DataFrame
        The processed copy.

    Raises
    ------
    KeyError
        If a required column is missing, or if neither descriptor column
        pair is complete.
    """
    # Work on a copy so the caller's frame is never modified.
    data = data.copy()

    # The density column is accepted under either spelling.
    if 'breast density' in data.columns:
        density_col = 'breast density'
    else:
        density_col = 'breast_density'

    # Force the numerical columns into their allowed ranges.
    data[density_col] = data[density_col].clip(1, 4)
    data['abnormality id'] = data['abnormality id'].clip(0)
    data['assessment'] = data['assessment'].clip(0, 5)
    data['subtlety'] = data['subtlety'].clip(1, 5)

    # Build the row labels before the identifying columns are dropped.
    data.index = (
        data['patient_id'] + '_' + data['image view'] + '_'
        + data['left or right breast'] + '_'
        + data['abnormality id'].astype(str)
    )

    # Remove the columns that are not used as features.
    file_path_cols = list(data.filter(regex='file path').columns)
    data = data.drop(
        columns=file_path_cols
        + ['image view', 'patient_id', 'left or right breast',
           'abnormality type']
    )

    # Decide explicitly which descriptor pair this table carries, so an
    # incomplete table fails with a message naming what was expected.
    calc_cols = ('calc type', 'calc distribution')
    mass_cols = ('mass shape', 'mass margins')
    if all(col in data.columns for col in calc_cols):
        descriptor_cols = calc_cols
    elif all(col in data.columns for col in mass_cols):
        descriptor_cols = mass_cols
    else:
        raise KeyError(
            f'expected columns {calc_cols} or {mass_cols}, '
            f'got {list(data.columns)}'
        )

    # pathology: BENIGN_WITHOUT_CALLBACK = 0, BENIGN = 0.5, MALIGNANT = 1
    data['pathology'] = data['pathology'].map(
        {'BENIGN_WITHOUT_CALLBACK': 0, 'BENIGN': 0.5, 'MALIGNANT': 1}
    )

    # Encode the categorical descriptors. Missing values become their own
    # 'None' category; factorize(sort=True) numbers the distinct values in
    # sorted order.
    for col in descriptor_cols:
        filled = data[col].fillna('None')
        data[col] = pd.factorize(filled, sort=True)[0]

    # rename() ignores labels that are absent, so the 'breast_density' entry
    # is a no-op for tables that already use 'breast density'.
    return data.rename(columns={
        'abnormality id': 'number of abnormalities',
        'assessment': 'overall BI-RADS assessment',
        'breast_density': 'breast density',
    })

In [30]:
# Record how many rows of each combined table came from the training split
# before the split names are rebound below.
n_train_calc = len(train_calc)
n_train_mass = len(train_mass)

# Preprocess the combined tables.
calc = preprocess(calc)
mass = preprocess(mass)

# Cut each processed table back into its train rows (first) and test rows.
train_calc = calc.iloc[:n_train_calc]
test_calc = calc.iloc[n_train_calc:]
train_mass = mass.iloc[:n_train_mass]
test_mass = mass.iloc[n_train_mass:]

# Display the processed splits. The shapes noted here are the ones reported
# after preprocessing (7 columns), not the raw 14-column shapes.
train_calc # (1546, 7)

test_calc # (326, 7)

train_mass # (1318, 7)

test_mass # (378, 7)

In [31]:
# Shape of the processed calc training split (rows, columns).
train_calc.shape

(1546, 7)

In [32]:
# Shape of the processed calc test split (rows, columns).
test_calc.shape

(326, 7)

In [33]:
# Shape of the processed mass training split (rows, columns).
train_mass.shape

(1318, 7)

In [34]:
# Shape of the processed mass test split (rows, columns).
test_mass.shape

(378, 7)