# Triple Breast Classification Model

## 1. Import Necessary Libraries

In [159]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC


## 2. Load Data

In [141]:
train_data = pd.read_csv('training.csv')
test_data = pd.read_csv('test.csv')

In [142]:
# Checking a sample of the dataset
train_data.sample(5)

Unnamed: 0,patient_id,patient_race,payer_type,patient_state,patient_zip3,patient_age,patient_gender,bmi,breast_cancer_diagnosis_code,breast_cancer_diagnosis_desc,...,disabled,poverty,limited_english,commute_time,health_uninsured,veteran,Ozone,PM25,N02,DiagPeriodL90D
5422,960645,,COMMERCIAL,MI,488,60,F,,C50911,Malignant neoplasm of unsp site of right femal...,...,14.288608,12.218182,0.350649,26.850649,7.65443,7.139241,38.907588,7.65125,12.554817,0
4259,873538,,COMMERCIAL,IN,465,61,F,,C50412,Malig neoplasm of upper-outer quadrant of left...,...,13.856522,11.469565,2.230435,22.390909,14.030435,6.602174,38.487245,8.143639,12.049427,1
4012,464822,White,MEDICAID,FL,342,40,F,,C50912,Malignant neoplasm of unspecified site of left...,...,14.25625,7.908333,2.275,25.497826,10.685417,9.783333,35.851859,7.44954,8.008309,1
1695,888971,,MEDICARE ADVANTAGE,PA,190,71,F,21.73,1749,"Malignant neoplasm of breast (female), unspeci...",...,11.276471,7.638235,2.245588,28.422059,4.257353,5.410294,39.324499,8.123492,22.987074,0
2005,362673,Hispanic,MEDICARE ADVANTAGE,CA,939,82,F,,1748,Malignant neoplasm of other specified sites of...,...,10.895652,11.508696,8.863636,23.686957,8.321739,10.45,39.139001,5.202427,7.812162,0


In [143]:
test_data.sample(5)

Unnamed: 0,patient_id,patient_race,payer_type,patient_state,patient_zip3,patient_age,patient_gender,bmi,breast_cancer_diagnosis_code,breast_cancer_diagnosis_desc,...,hispanic,disabled,poverty,limited_english,commute_time,health_uninsured,veteran,Ozone,PM25,N02
2621,527559,,COMMERCIAL,OH,452,56,F,,C50912,Malignant neoplasm of unspecified site of left...,...,3.854348,12.246667,18.073333,1.384444,23.182222,5.931111,5.904348,39.618075,8.563063,21.980447
3525,798071,Hispanic,,CA,939,48,F,,C50412,Malig neoplasm of upper-outer quadrant of left...,...,44.2875,10.895652,11.508696,8.863636,23.686957,8.321739,10.45,39.139001,5.202427,7.812162
5589,931123,,COMMERCIAL,MN,551,61,F,27.81,C50412,Malig neoplasm of upper-outer quadrant of left...,...,6.596774,10.867742,10.883871,3.948387,23.346667,4.312903,6.019355,34.204407,6.651215,15.277832
2206,698312,Hispanic,COMMERCIAL,CA,902,63,F,,C50919,Malignant neoplasm of unsp site of unspecified...,...,40.0,8.434286,10.288235,8.267647,29.942857,7.465714,3.14,41.206093,9.665553,19.714589
4848,410759,White,MEDICARE ADVANTAGE,NY,121,83,F,,C50112,Malignant neoplasm of central portion of left ...,...,3.672059,13.920588,7.898529,0.670588,27.130159,3.166176,7.560294,38.841811,5.212827,13.582484


In [144]:
# Print the shape of the datasets
print(f'The shape of the train dataset is {(len(train_data), len(train_data.columns))}')
print(f'The shape of the test dataset is {(len(test_data), len(test_data.columns))}')

The shape of the train dataset is (12906, 83)
The shape of the test dataset is (5792, 82)


## 3. Feature Engineering

### 3.1 Handling Missing Values

In [145]:
# Check Missing values
print(f'Total number of missing values is {train_data.isnull().sum().sum()}')
print(f'Total number of missing values is {test_data.isnull().sum().sum()}')

Total number of missing values is 43292
Total number of missing values is 19366


In [146]:
# Replace missing values

cleaned_train_data = train_data.fillna(method='bfill', axis=0)
cleaned_test_data = test_data.fillna(method='bfill', axis=0)

In [147]:
# Print the shape of the cleaned datasets
print(f'The sum of null values in the cleaned train dataset is {cleaned_train_data.isnull().sum().sum()}')
print(f'The sum of null values in the cleaned test dataset is {cleaned_test_data.isnull().sum().sum()}')

The sum of null values in the cleaned train dataset is 111
The sum of null values in the cleaned test dataset is 114


In [148]:
# Drop the remaining missing values
cleaned_train_data.dropna(how='any', axis=0, inplace=True)
cleaned_test_data.dropna(how='any', axis=0, inplace=True)

### 3.2 One Hot Encoding

In [156]:
Encoding = OneHotEncoder()

# Identify columns that have strings
# string_columns = cleaned_train_data.select_dtypes(include=['object']).columns.tolist()

# Encoding
"""
fit() is used to analyze the unique/ categorical features that need to be encoded while
fit_transform() combines both fit and transform methods. 
"""
encoded_train_dataset = Encoding.fit_transform(cleaned_train_data)
encoded_test_dataset = Encoding.fit_transform(cleaned_test_data)


### 3.3 Correlation Analysis

In [157]:
# Checking the correlation between the independent and dependent features
encoded_train_dataset.corr()

AttributeError: 'csr_matrix' object has no attribute 'corr'