# Triple Breast Classification Model

## 1. Import Necessary Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC


## 2. Load Data

In [2]:
train_data = pd.read_csv('training.csv')
test_data = pd.read_csv('test.csv')

In [3]:
# Checking a sample of the dataset
train_data.sample(5)

Unnamed: 0,patient_id,patient_race,payer_type,patient_state,patient_zip3,patient_age,patient_gender,bmi,breast_cancer_diagnosis_code,breast_cancer_diagnosis_desc,...,disabled,poverty,limited_english,commute_time,health_uninsured,veteran,Ozone,PM25,N02,DiagPeriodL90D
5771,586916,Black,,PA,191,53,F,,C50511,Malig neoplm of lower-outer quadrant of right ...,...,15.802128,20.178723,5.86383,32.8,6.293617,4.229787,39.06182,8.85161,24.544244,1
2170,153379,,COMMERCIAL,OR,971,57,F,25.33,1744,Malignant neoplasm of upper-outer quadrant of ...,...,15.95122,9.202439,1.217073,25.175,5.595122,9.239024,33.375,4.524903,10.22189,1
9104,720274,,COMMERCIAL,WA,980,54,F,,C50919,Malignant neoplasm of unsp site of unspecified...,...,9.75283,6.432075,5.09434,31.275472,5.309434,5.807547,36.618644,4.939852,23.39365,1
9867,269063,White,MEDICAID,IA,516,79,F,,C50919,Malignant neoplasm of unsp site of unspecified...,...,14.74,9.595,0.32,21.805263,5.39,8.655,37.042268,6.752691,9.686922,0
8127,908196,Black,MEDICAID,GA,300,52,F,,C50811,Malignant neoplasm of ovrlp sites of right fem...,...,10.270492,10.77541,4.304918,32.501667,13.040984,6.503279,40.968089,9.488443,19.108904,1


In [4]:
test_data.sample(5)

Unnamed: 0,patient_id,patient_race,payer_type,patient_state,patient_zip3,patient_age,patient_gender,bmi,breast_cancer_diagnosis_code,breast_cancer_diagnosis_desc,...,hispanic,disabled,poverty,limited_english,commute_time,health_uninsured,veteran,Ozone,PM25,N02
2932,100266,,MEDICAID,CA,900,59,F,,C50912,Malignant neoplasm of unspecified site of left...,...,45.526154,11.901538,20.76,14.7375,30.709375,10.341538,3.030769,41.186992,11.166898,21.644261
4434,495140,,COMMERCIAL,IL,620,63,F,,C50911,Malignant neoplasm of unsp site of right femal...,...,1.74,18.653333,13.498649,0.243243,30.008333,6.508,9.977333,39.128294,8.164013,11.707282
2644,620217,White,MEDICAID,OH,440,58,F,,C50411,Malig neoplm of upper-outer quadrant of right ...,...,4.252632,12.303509,10.991228,1.331579,26.750909,7.594737,7.914035,39.89356,7.673232,15.246437
1319,413697,Asian,MEDICARE ADVANTAGE,CA,917,62,F,,C50412,Malig neoplasm of upper-outer quadrant of left...,...,47.726087,9.895652,10.515217,12.745652,32.530435,7.263043,3.81087,47.310325,9.595719,20.084231
1880,953242,,COMMERCIAL,NY,125,24,F,,1749,"Malignant neoplasm of breast (female), unspeci...",...,9.939683,12.719048,8.312698,1.463492,30.645,3.871429,5.630159,39.263883,5.968842,13.904849


In [5]:
# Print the shape of the datasets
print(f'The shape of the train dataset is {(len(train_data), len(train_data.columns))}')
print(f'The shape of the test dataset is {(len(test_data), len(test_data.columns))}')

The shape of the train dataset is (12906, 83)
The shape of the test dataset is (5792, 82)


## 3. Feature Engineering

### 3.1 Handling Missing Values

In [6]:
# Check Missing values
print(f'Total number of missing values is {train_data.isnull().sum().sum()}')
print(f'Total number of missing values is {test_data.isnull().sum().sum()}')

Total number of missing values is 43292
Total number of missing values is 19366


In [7]:
# Replace missing values

cleaned_train_data = train_data.fillna(method='bfill', axis=0)
cleaned_test_data = test_data.fillna(method='bfill', axis=0)

In [8]:
# Print the shape of the cleaned datasets
print(f'The sum of null values in the cleaned train dataset is {cleaned_train_data.isnull().sum().sum()}')
print(f'The sum of null values in the cleaned test dataset is {cleaned_test_data.isnull().sum().sum()}')

The sum of null values in the cleaned train dataset is 111
The sum of null values in the cleaned test dataset is 114


In [9]:
# Drop the remaining missing values
cleaned_train_data.dropna(how='any', axis=0, inplace=True)
cleaned_test_data.dropna(how='any', axis=0, inplace=True)

In [10]:
# Splitting the data
X_train = cleaned_train_data.drop('DiagPeriodL90D', axis=1)
y_train = cleaned_train_data['DiagPeriodL90D']

### 3.2 One Hot Encoding

In [11]:
# One-hot encoding for train data
string_columns_train = X_train.select_dtypes(include=['object']).columns.tolist()
encoded_train_dataset = pd.get_dummies(X_train, columns=string_columns_train)

### 3.3 Correlation Analysis

In [12]:

dependent_variable_train = y_train

# Concatenate the encoded datasets with the dependent variable for both train and test datasets
merged_train_df = pd.concat([X_train.drop(columns=string_columns), encoded_train_dataset], axis=1)

# Calculate the correlation between features and the dependent variable for train and test datasets
correlation_train = merged_train_df.corrwith(dependent_variable_train)

# Print or use the correlation matrices as needed
print("Correlation with dependent variable for training dataset:")
print(correlation_train)



NameError: name 'string_columns' is not defined

In [None]:
# Dropping values with a low correlation

# Set the correlation threshold
correlation_threshold = 0  # Adjust this threshold as needed

# Identify variables with correlation below the threshold
low_correlation_vars_train = correlation_train[abs(correlation_train) < correlation_threshold].index

# Drop the low correlation variables from both the train and test datasets
selected_train_df = merged_train_df.drop(columns=low_correlation_vars_train)

# Print or use the selected datasets as needed
print("Selected train dataset:")
print(len(selected_train_df.columns))


Selected train dataset:
360


## 4. Model Creation

In [None]:
# Initialize SVM
svm = SVC()

classification_model = svm.fit(selected_train_df, y_train)

## 5. Model Prediction and Accuracy

In [None]:
# Data preprocessing for test data
cleaned_test_data = test_data.fillna(method='bfill', axis=0)
X_test = cleaned_test_data  # No need to drop 'DiagPeriodL90D' as it's not present in test data
string_columns_test = X_test.select_dtypes(include=['object']).columns.tolist()
encoded_test_dataset = pd.get_dummies(X_test, columns=string_columns_test)

# Drop low correlation variables from test data
merged_test_df = pd.concat([X_test.drop(columns=string_columns_test), encoded_test_dataset], axis=1)
selected_test_df = merged_test_df.drop(columns=low_correlation_vars_train)

In [None]:
print("Columns in selected_train_df:", selected_train_df.columns)
print("Columns in selected_test_df:", selected_test_df.columns)

Columns in selected_train_df: Index(['patient_id', 'patient_zip3', 'patient_age', 'bmi', 'population',
       'density', 'age_median', 'age_under_10', 'age_10_to_19', 'age_20s',
       ...
       'Region_West', 'Division_East North Central',
       'Division_East South Central', 'Division_Middle Atlantic',
       'Division_Mountain', 'Division_New England', 'Division_Pacific',
       'Division_South Atlantic', 'Division_West North Central',
       'Division_West South Central'],
      dtype='object', length=360)
Columns in selected_test_df: Index(['patient_id', 'patient_zip3', 'patient_age', 'bmi', 'population',
       'density', 'age_median', 'age_under_10', 'age_10_to_19', 'age_20s',
       ...
       'Region_West', 'Division_East North Central',
       'Division_East South Central', 'Division_Middle Atlantic',
       'Division_Mountain', 'Division_New England', 'Division_Pacific',
       'Division_South Atlantic', 'Division_West North Central',
       'Division_West South Central'],

In [None]:
# Get the columns present in selected_test_df but not in selected_train_df
extra_columns_in_test = set(selected_test_df.columns) - set(selected_train_df.columns)

# Drop the extra columns from selected_test_df
selected_test_df.drop(columns=extra_columns_in_test, inplace=True)

In [None]:
# Check for duplicate column names in selected_train_df
duplicate_columns_train = selected_train_df.columns[selected_train_df.columns.duplicated()]
print("Duplicate columns in selected_train_df:", duplicate_columns_train)

# Check for duplicate column names in selected_test_df
duplicate_columns_test = selected_test_df.columns[selected_test_df.columns.duplicated()]
print("Duplicate columns in selected_test_df:", duplicate_columns_test)

# Resolve duplicate column names if any
if len(duplicate_columns_train) > 0 or len(duplicate_columns_test) > 0:
    # Rename or drop duplicate columns as needed
    # For example, you can rename duplicate columns in selected_test_df
    selected_test_df = selected_test_df.add_suffix('_test')


In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
import numpy as np

# Make predictions on the test data
predictions = classification_model.predict(selected_test_df)

# Obtain decision scores for test data
decision_scores_test = classification_model.decision_function(selected_test_df)

# Convert decision scores to probabilities using a sigmoid function
predicted_probabilities_test = 1 / (1 + np.exp(-decision_scores_test))

# Calculate the ROC curve and AUC for the test data
fpr, tpr, thresholds = roc_curve(predictions, predicted_probabilities_test)
auc_score = roc_auc_score(predictions, predicted_probabilities_test)

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC Curve (AUC = {auc_score:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()



ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- breast_cancer_diagnosis_code_1759
- breast_cancer_diagnosis_code_C50
- breast_cancer_diagnosis_code_C5001
- breast_cancer_diagnosis_code_C50021
- breast_cancer_diagnosis_code_C5011
- ...
