In [131]:
# Import the libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report, confusion_matrix

In [132]:
# Import the dataset
train_calc = pd.read_csv(r'C:\Users\Harsh\Downloads\calc_case_description_train_set.csv')
test_calc = pd.read_csv(r'C:\Users\Harsh\Downloads\calc_case_description_test_set.csv')
train_mass = pd.read_csv(r'C:\Users\Harsh\Downloads\mass_case_description_train_set.csv')
test_mass = pd.read_csv(r'C:\Users\Harsh\Downloads\mass_case_description_test_set.csv')

original = {'train_calc': train_calc, 'test_calc': test_calc, 
        'train_mass': train_mass, 'test_mass': test_mass}

In [133]:
calc = train_calc.values.tolist() + test_calc.values.tolist()
calc = pd.DataFrame(calc, columns = train_calc.columns)
mass = train_mass.values.tolist() + test_mass.values.tolist()
mass = pd.DataFrame(mass, columns = train_mass.columns)

In [134]:
def preprocess(data):
        # make a copy of the data to avoid SettingWithCopyWarning
        data = data.copy()
        
        # set the limitations on the numerical columns
        try:
                data['breast density'] = data['breast density'].clip(1, 4)
        except KeyError:
                data['breast_density'] = data['breast_density'].clip(1, 4)
        data['abnormality id'] = data['abnormality id'].clip(0)
        data['assessment'] = data['assessment'].clip(0, 5)
        data['subtlety'] = data['subtlety'].clip(1, 5)
        
        # change the name of index
        data.index = data['patient_id'] + '_' + data['image view'] + '_' \
        + data['left or right breast'] + '_' + data['abnormality id'].astype(str)

        # Remove useless columns
        data = data[data.columns.drop(list(data.filter(regex='file path')) 
                + ['image view', 'patient_id', 'left or right breast', 'abnormality type'])]

        # Fill NaN values with appropriate placeholders
        try:
                data['calc type'] = data['calc type'].fillna('None')
                data['calc distribution'] = data['calc distribution'].fillna('None')
        except KeyError:
                data['mass shape'] = data['mass shape'].fillna('None')
                data['mass margins'] = data['mass margins'].fillna('None')

        '''
        pathology :
        BENIGN_WITHOUT_CALLBACK = 0
        BENIGN = 0.5
        MALIGNANT = 1
        '''
        data['pathology'] = data['pathology'].map({'BENIGN_WITHOUT_CALLBACK': 0, 'BENIGN': 1, 'MALIGNANT': 2})
        

        # Encode categorical features
        le_pathology = LabelEncoder()
        try:
                le_type = LabelEncoder()
                le_distribution = LabelEncoder()
                
                data['calc type'] = le_type.fit_transform(data['calc type'])
                data['calc distribution'] = le_distribution.fit_transform(data['calc distribution'])
                
        except KeyError:
                le_shape = LabelEncoder()
                le_distribution = LabelEncoder()
                
                data['mass shape'] = le_shape.fit_transform(data['mass shape'])
                data['mass margins'] = le_distribution.fit_transform(data['mass margins'])
        
        # rename columns
        data.rename(columns={'abnormality id': 'number of abnormalities', 
                             'assessment' : 'overall BI-RADS assessment'}, inplace=True)
        try:
                data.rename(columns={'breast_density' : 'breast density'}, inplace=True)
                # split
                return data[:train_mass.shape[0]], data[train_mass.shape[0]:]
        except KeyError:
                return data[:train_calc.shape[0]], data[train_calc.shape[0]:]

In [135]:
train_calc, test_calc = preprocess(calc)
train_mass, test_mass = preprocess(mass)

In [149]:
# GRID SEARCH WITH RANDOM FOREST CALC                            BEST SCORE

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, confusion_matrix
import pandas as pd

# Define the features for the model
features = ['breast density', 'number of abnormalities', 'calc type', 'calc distribution', 'subtlety']

# Define the parameter grid to search over
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'max_features': ['auto', 'sqrt', 'log2'],
    'random_state': [42]
}

# Create a random forest model object
rf = RandomForestRegressor()

# Create a GridSearchCV object to perform the search
grid_search = GridSearchCV(rf, param_grid=param_grid, cv=5, scoring='r2')

# Train the model on the calcification dataset
test_X = test_calc[features]
test_Y = test_calc['overall BI-RADS assessment']
train_X = train_calc[features]
train_Y = train_calc['overall BI-RADS assessment']

grid_search.fit(train_X, train_Y)

# Print the best hyperparameters and score
print('Best parameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)

# Make predictions on the test set using the best model
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(test_X)

# Evaluate the performance of the model using a confusion matrix
conf_m = confusion_matrix(test_Y, y_pred.round())
print("Confusion Matrix:\n", conf_m)

# Calculate the accuracy of the predictions
accuracy = accuracy_score(test_Y, y_pred.round())
print('Accuracy:', accuracy)

# Calculate the Mean Squared Error of the predictions
mse = mean_squared_error(test_Y, y_pred)
print('Mean Squared Error:', mse)

importances = best_rf.feature_importances_
sorted_indices = importances.argsort()[::-1]
print('Feature importances:')
for index in sorted_indices:
    print(f'{features[index]}: {importances[index]}')


Best parameters: {'max_depth': 5, 'max_features': 'auto', 'n_estimators': 200, 'random_state': 42}
Best score: 0.6274375987208354
Confusion Matrix:
 [[  0   0   6  56   0]
 [  0 135  10   4   0]
 [  0   0   2  46   0]
 [  0   0   6 229   0]
 [  0   0   2  45  13]]
Accuracy: 0.6841155234657039
Mean Squared Error: 1.894682787904892
Feature importances:
calc type: 0.5762787554983896
calc distribution: 0.35859058692488627
subtlety: 0.048586003936691914
breast density: 0.010093944124681353
number of abnormalities: 0.006450709515351024


In [146]:
# GRID SEARCH WITH RANDOM FOREST MASS                            BEST SCORE

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, confusion_matrix, accuracy_score
import pandas as pd

# Define the features for the model
features = ['breast density', 'number of abnormalities', 'mass shape', 'mass margins', 'subtlety']

# Define the parameter grid to search over
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'max_features': ['auto', 'sqrt', 'log2'],
    'random_state': [42]
}

# Create a random forest model object
rf = RandomForestRegressor()

# Create a GridSearchCV object to perform the search
grid_search = GridSearchCV(rf, param_grid=param_grid, cv=5, scoring='r2')

# Train the model on the mass dataset
train_X = train_mass[features]
train_Y = train_mass['overall BI-RADS assessment']
test_X = test_mass[features]
test_Y = test_mass['overall BI-RADS assessment']

grid_search.fit(train_X, train_Y)

# Print the best hyperparameters and score
print('Best parameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)

# Make predictions on the test set using the best model
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(test_X)

# Evaluate the performance of the model using a confusion matrix
conf_m = confusion_matrix(test_Y, y_pred.round())
print("Confusion Matrix:\n", conf_m)

# Calculate the accuracy of the predictions
accuracy = accuracy_score(test_Y, y_pred.round())
print('Accuracy:', accuracy)

# Calculate the Mean Squared Error of the predictions
mse = mean_squared_error(test_Y, y_pred)
print('Mean Squared Error:', mse)

importances = best_rf.feature_importances_
sorted_indices = importances.argsort()[::-1]
print('Feature importances:')
for index in sorted_indices:
    print(f'{features[index]}: {importances[index]}')


Best parameters: {'max_depth': 5, 'max_features': 'auto', 'n_estimators': 100, 'random_state': 42}
Best score: 0.3930291587159023
Confusion Matrix:
 [[ 0  6 17  4  6  0]
 [ 0  0  1  0  1  0]
 [ 0  0  7  5  2  0]
 [ 0  0 11 56 18  0]
 [ 0  3  2 56 99  9]
 [ 0  0  1  2 32 40]]
Accuracy: 0.5343915343915344
Mean Squared Error: 1.0140120402077346
Feature importances:
mass margins: 0.7678348954349647
breast density: 0.09594913594746818
subtlety: 0.0648055608614647
mass shape: 0.051243278271819424
number of abnormalities: 0.02016712948428291


In [153]:
# GRID SEARCH WITH RANDOM FOREST CALC                            BEST SCORE

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error
import pandas as pd

# Define the features for the model
features = ['breast density', 'number of abnormalities', 'calc type', 'calc distribution', 'subtlety', 'overall BI-RADS assessment']

# Define the parameter grid to search over
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'max_features': ['auto', 'sqrt', 'log2'],
    'random_state': [42]
}

# Create a random forest model object
rf = RandomForestRegressor()

# Create a GridSearchCV object to perform the search
grid_search = GridSearchCV(rf, param_grid=param_grid, cv=5, scoring='r2')

# Train the model on the calcification dataset
train_X = train_calc[features]
train_Y = train_calc['pathology']

grid_search.fit(train_X, train_Y)

# Print the best hyperparameters and score
print('Best parameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)

best_rf = grid_search.best_estimator_

# Make predictions on the test set
test_X = test_calc[features]
test_Y = test_calc['pathology']
test_pred = best_rf.predict(test_X)

# Create a confusion matrix
conf_m = confusion_matrix(test_Y, test_pred.round())
print('Confusion matrix:')
print(conf_m)

# Calculate accuracy
accuracy = accuracy_score(test_Y, test_pred.round())
print('Accuracy:', accuracy)

# Calculate MSE
mse = mean_squared_error(test_Y, test_pred)
print('Mean Squared Error:', mse)

# Print feature importances
importances = best_rf.feature_importances_
sorted_indices = importances.argsort()[::-1]
print('Feature importances:')
for index in sorted_indices:
    print(f'{features[index]}: {importances[index]}')



Best parameters: {'max_depth': 5, 'max_features': 'auto', 'n_estimators': 200, 'random_state': 42}
Best score: 0.7412307430177936
Confusion matrix:
[[145   0   0]
 [  2 161  19]
 [  0 124 103]]
Accuracy: 0.7382671480144405
Mean Squared Error: 0.16154086222619435
Feature importances:
overall BI-RADS assessment: 0.9359343747361912
calc type: 0.021166143003026803
subtlety: 0.020907248766062123
calc distribution: 0.0147948805072604
breast density: 0.0041131955481095665
number of abnormalities: 0.00308415743934989


In [154]:
# GRID SEARCH WITH RANDOM FOREST MASS                            BEST SCORE

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error
import pandas as pd

# Define the features for the model
features = ['breast density', 'number of abnormalities', 'mass shape', 'mass margins', 'subtlety', 'overall BI-RADS assessment']

# Define the parameter grid to search over
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'max_features': ['auto', 'sqrt', 'log2'],
    'random_state': [42]
}

# Create a random forest model object
rf = RandomForestRegressor()

# Create a GridSearchCV object to perform the search
grid_search = GridSearchCV(rf, param_grid=param_grid, cv=5, scoring='r2')

# Train the model on the mass dataset
train_X = train_mass[features]
train_Y = train_mass['pathology']

grid_search.fit(train_X, train_Y)

# Print the best hyperparameters and score
print('Best parameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)

best_rf = grid_search.best_estimator_

# Make predictions on the test set
test_X = test_mass[features]
test_Y = test_mass['pathology']
test_pred = best_rf.predict(test_X)

# Create a confusion matrix
conf_m = confusion_matrix(test_Y, test_pred.round())
print('Confusion matrix:')
print(conf_m)

# Calculate accuracy
accuracy = accuracy_score(test_Y, test_pred.round())
print('Accuracy:', accuracy)

# Calculate MSE
mse = mean_squared_error(test_Y, test_pred)
print('Mean Squared Error:', mse)

# Print feature importances
importances = best_rf.feature_importances_
sorted_indices = importances.argsort()[::-1]
print('Feature importances:')
for index in sorted_indices:
    print(f'{features[index]}: {importances[index]}')


Best parameters: {'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 200, 'random_state': 42}
Best score: 0.5130567581564397
Confusion matrix:
[[ 15  15   7]
 [  1 153  40]
 [  0  28 119]]
Accuracy: 0.7592592592592593
Mean Squared Error: 0.23168943629489586
Feature importances:
overall BI-RADS assessment: 0.410302942037519
mass margins: 0.24783724935083856
mass shape: 0.1698787033060306
subtlety: 0.08145129946883534
breast density: 0.06968080623978576
number of abnormalities: 0.02084899959699087
