# Import libraries

In [8]:
# Import the libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
# Dataset and DataLoader are defined in torch.utils.data; the top-level torch
# package does not export them, so `from torch import ...` fails on import.
from torch.utils.data import Dataset, DataLoader

In [None]:
# Use the GPU when a CUDA runtime is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Read and preprocess the dataset

In [2]:
# Load the calc and mass case-description CSVs (train split first, then test)
train_calc, test_calc = map(pd.read_csv, (
    'calc_case_description_train_set.csv',
    'calc_case_description_test_set.csv',
))
train_mass, test_mass = map(pd.read_csv, (
    'mass_case_description_train_set.csv',
    'mass_case_description_test_set.csv',
))

In [3]:
# Stack each train/test pair row by row (positionally, not by column name)
# and label the combined table with the train split's column names.
calc = pd.DataFrame(
    [*train_calc.values.tolist(), *test_calc.values.tolist()],
    columns=train_calc.columns,
)
mass = pd.DataFrame(
    [*train_mass.values.tolist(), *test_mass.values.tolist()],
    columns=train_mass.columns,
)

In [4]:
# Function to create dictionaries for categorical features
def create_embedding_dict(series):
    """Map each distinct value of *series* to an integer index.

    Indices start at 0 and follow the order in which ``series.unique()``
    yields the values (order of first appearance).
    """
    index_by_value = {}
    for position, category in enumerate(series.unique()):
        index_by_value[category] = position
    return index_by_value

In [10]:
def preprocess(data, n_train=None):
    """Clean a combined (train + test) case-description table and split it again.

    The table is either a "calc" table (has 'calc type' / 'calc distribution'
    columns) or a "mass" table (has 'mass shape' / 'mass margins' columns).

    Steps:
      * clip the numeric columns to their allowed ranges;
      * build a row index from patient id, image view, breast side and
        abnormality id;
      * drop the file-path columns and the columns folded into the index;
      * fill missing categorical values with 'None' and replace every
        category with an integer index (via ``create_embedding_dict``);
      * encode 'pathology' as 0 / 0.5 / 1;
      * rename a few columns to more descriptive names.

    Args:
        data: combined DataFrame whose first rows are the train split and
            whose remaining rows are the test split. It is not modified.
        n_train: number of leading rows that belong to the train split.
            When omitted, the row count of the module-level ``train_calc``
            (calc tables) or ``train_mass`` (mass tables) frame is used.

    Returns:
        A ``(train, test)`` tuple of DataFrames.

    Raises:
        KeyError: if an expected column is missing from *data*.
    """
    # Work on a copy so the caller's frame is left untouched.
    data = data.copy()

    # Decide up front which kind of table this is instead of relying on
    # KeyError fall-through at every step.
    is_calc = 'calc type' in data.columns
    if is_calc:
        categorical_columns = ['calc type', 'calc distribution']
    else:
        categorical_columns = ['mass shape', 'mass margins']

    # Some tables spell the density column with an underscore; normalise the
    # name once so the rest of the function handles a single spelling.
    if 'breast_density' in data.columns:
        data = data.rename(columns={'breast_density': 'breast density'})

    # Clamp the numeric columns to their allowed ranges.
    data['breast density'] = data['breast density'].clip(1, 4)
    data['abnormality id'] = data['abnormality id'].clip(0)
    data['assessment'] = data['assessment'].clip(0, 5)
    data['subtlety'] = data['subtlety'].clip(1, 5)

    # Index rows as <patient>_<view>_<side>_<abnormality id>.
    data.index = (
        data['patient_id'] + '_' + data['image view'] + '_'
        + data['left or right breast'] + '_'
        + data['abnormality id'].astype(str)
    )

    # Drop the file-path columns and the columns now encoded in the index.
    # DataFrame.drop returns a new frame, so the assignments below do not
    # trigger SettingWithCopyWarning.
    redundant_columns = list(data.filter(regex='file path')) + [
        'image view', 'patient_id', 'left or right breast', 'abnormality type',
    ]
    data = data.drop(columns=redundant_columns)

    # Encode BOTH categorical columns: missing values become the category
    # 'None', then each category is replaced by its integer index.
    for column in categorical_columns:
        filled = data[column].fillna('None')
        data[column] = filled.map(create_embedding_dict(filled))

    # pathology: BENIGN_WITHOUT_CALLBACK = 0, BENIGN = 0.5, MALIGNANT = 1
    data['pathology'] = data['pathology'].map(
        {'BENIGN_WITHOUT_CALLBACK': 0, 'BENIGN': 0.5, 'MALIGNANT': 1}
    )

    data = data.rename(columns={
        'abnormality id': 'number of abnormalities',
        'assessment': 'overall BI-RADS assessment',
    })

    # Split at the size of the train split that matches this table's type.
    # (DataFrame.rename silently ignores missing labels, so it cannot be used
    # to tell calc and mass tables apart.)
    if n_train is None:
        n_train = (train_calc if is_calc else train_mass).shape[0]
    return data.iloc[:n_train], data.iloc[n_train:]

In [13]:
# Clean each combined table and split it back into its train/test parts.
# NOTE: these assignments rebind train_calc / train_mass, the same
# module-level names whose .shape[0] preprocess reads to find the split
# point, so keep the statement order as is.
train_calc, test_calc = preprocess(calc)
train_mass, test_mass = preprocess(mass)

In [14]:
# NOTE(review): presumably the size of each categorical embedding vector —
# confirm against the model definition that consumes it.
embedding_dim = 10