# Melanoma Tumor Size Prediction

# 3. Modeling

## 3.1 Imports

In [1]:
# Import the libraries necessary for the current task
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (train_test_split, RandomizedSearchCV, GridSearchCV, learning_curve, cross_validate,
                                     cross_val_score, cross_val_predict)
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from keras.models import Sequential
from keras.layers import Dense

# pandas Configuration
pd.set_option("max_rows", 120)
pd.set_option("max_columns", 120)
pd.set_option("display.max_colwidth", None)

## 3.2 Data

### 3.2.1 Data Loading

In [2]:
# Load the CSV data
melanoma_data = pd.read_csv(r"C:\Users\lastr\Desktop\GitHub\Melanoma_Capstone\data\melanoma.csv")

### 3.2.2 Numerical Data Verification

In [3]:
# Ensure that all values are numerical
melanoma_data[~melanoma_data.applymap(np.isreal).all(1)]

Unnamed: 0,mass_npea,size_npear,malign_ratio,damage_size,exposed_area,std_dev_malign,err_malign,malign_penalty,damage_ratio,tumor_size


### 3.2.3 Data Overview

In [4]:
# Summary of the data
melanoma_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9146 entries, 0 to 9145
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   mass_npea       9146 non-null   float64
 1   size_npear      9146 non-null   float64
 2   malign_ratio    9146 non-null   float64
 3   damage_size     9146 non-null   float64
 4   exposed_area    9146 non-null   float64
 5   std_dev_malign  9146 non-null   float64
 6   err_malign      9146 non-null   float64
 7   malign_penalty  9146 non-null   int64  
 8   damage_ratio    9146 non-null   float64
 9   tumor_size      9146 non-null   float64
dtypes: float64(9), int64(1)
memory usage: 714.7 KB


In [5]:
# First 5 entries of the data
melanoma_data.head()

Unnamed: 0,mass_npea,size_npear,malign_ratio,damage_size,exposed_area,std_dev_malign,err_malign,malign_penalty,damage_ratio,tumor_size
0,6930.9,2919.02,0.42116,51.8298,988829.4,109.487,2758.76,72,39.362,14.103
1,15635.7,4879.36,0.31206,223.55,2058426.0,248.881,5952.53,240,22.0253,2.648
2,10376.2,2613.88,0.25191,127.337,1434676.0,160.093,4635.26,73,29.9963,1.688
3,13093.8,4510.06,0.34444,155.44,1812195.0,173.015,5273.87,32,28.1354,3.796
4,7545.21,2882.36,0.38201,85.1237,1043918.0,124.414,3263.35,57,35.02,18.023


## 3.3 Preparation for Modeling

### 3.3.1 Training Test Split

In [6]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    melanoma_data.drop("tumor_size", axis=1), melanoma_data["tumor_size"], test_size=0.3, random_state=100)

### 3.3.2 Feature Standardization

In [7]:
# Standardize features
sc = StandardScaler()

X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

## 3.4 Models

### 3.4.1 Multiple Linear Regression

#### 3.4.1.1 Model Training

In [8]:
# Train the model on the train split
mlr = LinearRegression()
mlr.fit(X_train_sc, y_train)

LinearRegression()

#### 3.4.1.2 Model Predictions

In [9]:
# Make predictions on the train and test sets
y_train_pred = mlr.predict(X_train_sc)
y_test_pred = mlr.predict(X_test_sc)

#### 3.4.1.2 Model Performance Assessment on Training and Test Set

In [10]:
# R2
r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)

(0.287127219305388, 0.29305640839378844)

In [11]:
# MAE
mean_absolute_error(y_train, y_train_pred), mean_absolute_error(y_test, y_test_pred)

(4.281348842094409, 4.301117595190667)

### 3.4.2 Random Forest

#### 3.4.2.1 Model Training

In [12]:
# Train the model on the train split
rf = RandomForestRegressor(random_state=100)
rf.fit(X_train_sc, y_train)

RandomForestRegressor(random_state=100)

#### 3.4.2.2 Hyperparameter Tuning Using Randomized Search

In [13]:
# Parameter Grid for Randomized Search
rf_param_grid = {
    "n_estimators": [20, 40, 60, 80, 100],
    "criterion": ["mse", "mae"],
    "max_depth": [5, 10, 15, 20, 30],
    "min_samples_split": [2, 5, 10, 15, 25],
    "min_samples_leaf": [1, 2, 5, 8, 10],
    "max_features": ["auto", "sqrt", "log2"]
}

# Perform Randomized Search
rf_rs = RandomizedSearchCV(estimator=rf, param_distributions=rf_param_grid, n_iter=50, scoring="neg_mean_absolute_error",
                           n_jobs=-1, verbose=3, cv=3)
rf_rs.fit(X_train_sc, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(random_state=100),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'criterion': ['mse', 'mae'],
                                        'max_depth': [5, 10, 15, 20, 30],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 5, 8, 10],
                                        'min_samples_split': [2, 5, 10, 15, 25],
                                        'n_estimators': [20, 40, 60, 80, 100]},
                   scoring='neg_mean_absolute_error', verbose=3)

In [14]:
# `best_params_` attribute of `rf_rs`
rf_rs.best_params_

{'n_estimators': 40,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': 30,
 'criterion': 'mse'}

#### 3.4.2.3 Model Predictions

In [16]:
# Make predictions on the train and test sets
y_train_pred = rf_rs.best_estimator_.predict(X_train_sc)
y_test_pred = rf_rs.best_estimator_.predict(X_test_sc)

#### 3.4.2.4 Model Performance Assessment on Training and Test Set

In [17]:
# R2
r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)

(0.8873364016328348, 0.5601004761891937)

In [18]:
# MAE
mean_absolute_error(y_train, y_train_pred), mean_absolute_error(y_test, y_test_pred)

(1.439122610998442, 2.9224120248669125)

### 3.4.3 Support Vector Machine

#### 3.4.3.1 Model Training

In [19]:
# Train the model on the train split
svm = SVR()
svm.fit(X_train_sc, y_train)

SVR()

#### 3.4.3.2 Hyperparameter Tuning Using Randomized Search

In [20]:
# Parameter Grid for Randomized Search
svm_param_grid = {
    "kernel": ["linear", "rbf", "poly", "sigmoid"],
    "gamma": ["scale", "auto"],
    "C": [0.0001, 0.001, 0.01, 1, 10, 100],
    "epsilon": [0.0001, 0.001, 0.01, 0.1]
}

# Perform Randomized Search
svm_rs = RandomizedSearchCV(estimator=svm, param_distributions=svm_param_grid, n_iter=50, scoring="neg_mean_absolute_error",
                            n_jobs=-1, verbose=3, cv=3)
svm_rs.fit(X_train_sc, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


RandomizedSearchCV(cv=3, estimator=SVR(), n_iter=50, n_jobs=-1,
                   param_distributions={'C': [0.0001, 0.001, 0.01, 1, 10, 100],
                                        'epsilon': [0.0001, 0.001, 0.01, 0.1],
                                        'gamma': ['scale', 'auto'],
                                        'kernel': ['linear', 'rbf', 'poly',
                                                   'sigmoid']},
                   scoring='neg_mean_absolute_error', verbose=3)

In [21]:
# `best_params_` attribute of `svm_rs`
svm_rs.best_params_

{'kernel': 'rbf', 'gamma': 'scale', 'epsilon': 0.01, 'C': 100}

#### 3.4.3.3 Model Predictions

In [22]:
# Make predictions on the train and test sets
y_train_pred = svm_rs.best_estimator_.predict(X_train_sc)
y_test_pred = svm_rs.best_estimator_.predict(X_test_sc)

#### 3.4.3.4 Model Performance Assessment on Training and Test Set

In [23]:
# R2
r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)

(0.4688674083173706, 0.424230967684771)

In [24]:
# MAE
mean_absolute_error(y_train, y_train_pred), mean_absolute_error(y_test, y_test_pred)

(3.1230166781154827, 3.3532750573987786)

### 3.4.4 Neural Network