# Triple Breast Classification Model

## 1. Import Necessary Libraries

In [270]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC


## 2. Load Data

In [271]:
train_data = pd.read_csv('training.csv')
test_data = pd.read_csv('test.csv')

In [272]:
# Checking a sample of the dataset
train_data.sample(5)

Unnamed: 0,patient_id,patient_race,payer_type,patient_state,patient_zip3,patient_age,patient_gender,bmi,breast_cancer_diagnosis_code,breast_cancer_diagnosis_desc,...,disabled,poverty,limited_english,commute_time,health_uninsured,veteran,Ozone,PM25,N02,DiagPeriodL90D
6616,815186,,COMMERCIAL,SC,298,67,F,,C50912,Malignant neoplasm of unspecified site of left...,...,19.532353,17.276471,0.835294,28.71875,11.9,8.031429,39.644829,8.380638,7.643347,0
10991,288131,Other,COMMERCIAL,IN,465,63,F,,1749,"Malignant neoplasm of breast (female), unspeci...",...,13.856522,11.469565,2.230435,22.390909,14.030435,6.602174,38.487245,8.143639,12.049427,0
12245,847785,,COMMERCIAL,AL,354,25,F,29.78,C50911,Malignant neoplasm of unsp site of right femal...,...,18.584848,22.11875,0.390625,29.433333,11.212121,9.318182,38.472478,8.230404,9.672811,1
6422,478772,Other,COMMERCIAL,VA,229,54,F,,C50211,Malig neoplm of upper-inner quadrant of right ...,...,11.152632,9.526316,0.471053,28.04,7.652632,7.484211,41.096892,6.959535,9.503919,0
9764,173989,,COMMERCIAL,NC,287,64,F,,C50411,Malig neoplm of upper-outer quadrant of right ...,...,15.908451,11.235211,1.319718,23.328358,14.076056,8.747887,42.744659,5.939524,6.948642,1


In [273]:
test_data.sample(5)

Unnamed: 0,patient_id,patient_race,payer_type,patient_state,patient_zip3,patient_age,patient_gender,bmi,breast_cancer_diagnosis_code,breast_cancer_diagnosis_desc,...,hispanic,disabled,poverty,limited_english,commute_time,health_uninsured,veteran,Ozone,PM25,N02
5484,689933,White,COMMERCIAL,MI,490,65,F,,C50912,Malignant neoplasm of unspecified site of left...,...,5.783544,16.339241,12.713924,0.786076,24.071053,7.04557,7.626582,37.961042,7.972556,13.942055
3503,283199,White,,CO,806,47,F,,C50912,Malignant neoplasm of unspecified site of left...,...,25.884615,12.273077,10.265385,3.015385,31.429167,8.946154,10.434615,46.530398,6.590523,22.109347
5022,412125,White,MEDICARE ADVANTAGE,WI,543,69,F,,C50812,Malignant neoplasm of ovrlp sites of left fema...,...,11.942857,12.316667,12.316667,2.866667,18.116667,6.483333,6.071429,36.128271,7.109433,16.961529
3008,604451,,COMMERCIAL,OK,731,79,F,,C50911,Malignant neoplasm of unsp site of right femal...,...,17.986047,13.727907,16.876744,3.383721,20.632558,14.034884,10.248837,40.227465,7.647378,19.744309
5284,601142,,COMMERCIAL,GA,300,65,F,,C50912,Malignant neoplasm of unspecified site of left...,...,11.842623,10.270492,10.77541,4.304918,32.501667,13.040984,6.503279,40.968089,9.488443,19.108904


In [274]:
# Print the shape of the datasets
print(f'The shape of the train dataset is {(len(train_data), len(train_data.columns))}')
print(f'The shape of the test dataset is {(len(test_data), len(test_data.columns))}')

The shape of the train dataset is (12906, 83)
The shape of the test dataset is (5792, 82)


## 3. Feature Engineering

### 3.1 Handling Missing Values

In [275]:
# Check Missing values
print(f'Total number of missing values is {train_data.isnull().sum().sum()}')
print(f'Total number of missing values is {test_data.isnull().sum().sum()}')

Total number of missing values is 43292
Total number of missing values is 19366


In [276]:
# Replace missing values

cleaned_train_data = train_data.fillna(method='bfill', axis=0)
cleaned_test_data = test_data.fillna(method='bfill', axis=0)

In [277]:
# Print the shape of the cleaned datasets
print(f'The sum of null values in the cleaned train dataset is {cleaned_train_data.isnull().sum().sum()}')
print(f'The sum of null values in the cleaned test dataset is {cleaned_test_data.isnull().sum().sum()}')

The sum of null values in the cleaned train dataset is 111
The sum of null values in the cleaned test dataset is 114


In [278]:
# Drop the remaining missing values
cleaned_train_data.dropna(how='any', axis=0, inplace=True)
cleaned_test_data.dropna(how='any', axis=0, inplace=True)

In [279]:
# Splitting the data
X_train = cleaned_train_data.drop(columns='DiagPeriodL90D', axis=1)
y_train = cleaned_train_data['DiagPeriodL90D']

### 3.2 One-Hot Encoding

In [280]:
string_columns_train = X_train.select_dtypes(include=['object']).columns.tolist()

from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Apply label encoding to each categorical column
for column in string_columns_train:
    X_train[column] = label_encoder.fit_transform(X_train[column])


## 4. Model Creation

In [281]:
# Initialize SVM
svm = SVC()

classification_model = svm.fit(X_train, y_train)

## 5. Model Prediction and Accuracy

In [282]:
X_test = cleaned_test_data  # No need to drop 'DiagPeriodL90D' as it's not present in test data
string_columns_test = X_test.select_dtypes(include=['object']).columns.tolist()

# Apply label encoding to each categorical column
for column in string_columns_test:
    X_test[column] = label_encoder.fit_transform(X_test[column])


In [283]:
X_test.fillna(method='bfill', inplace=True)

# Make predictions on the test data
predictions_test = classification_model.predict(X_test)

# Round predictions to one decimal place
predictions_rounded = np.round(predictions_test, 1)

# Create a DataFrame to store patient IDs and rounded predictions
results_df = pd.DataFrame({'patient_id': X_test['patient_id'], 'prediction': predictions_rounded})

print("patient_id", "DiagPeriodL90D")
# Print predictions with patient IDs
for index, row in results_df.iterrows():
    print(f"{row['patient_id']}, {row['prediction']:.1f}")

# Save results to a CSV file
results_df.to_csv('predictions.csv', index=False)



patient_id DiagPeriodL90D
573710, 1.0
593679, 1.0
184532, 1.0
447383, 1.0
687972, 1.0
281312, 1.0
492714, 1.0
378266, 1.0
291550, 1.0
612272, 1.0
240105, 1.0
277939, 1.0
504153, 1.0
287269, 1.0
108727, 1.0
598629, 1.0
805201, 1.0
565624, 1.0
689369, 1.0
252028, 1.0
830503, 1.0
777454, 1.0
931410, 1.0
894910, 1.0
257477, 1.0
373935, 1.0
929645, 1.0
164064, 1.0
558677, 1.0
707003, 1.0
289528, 1.0
340932, 1.0
182933, 1.0
887761, 1.0
249345, 1.0
750357, 1.0
162816, 1.0
346740, 1.0
496264, 1.0
900330, 1.0
630418, 1.0
582166, 1.0
397291, 1.0
617035, 1.0
903270, 1.0
628977, 1.0
664844, 1.0
571761, 1.0
284488, 1.0
573502, 1.0
914179, 1.0
533295, 1.0
673470, 1.0
394209, 1.0
165379, 1.0
335343, 1.0
521842, 1.0
513091, 1.0
144195, 1.0
114345, 1.0
611289, 1.0
430995, 1.0
755430, 1.0
170191, 1.0
739107, 1.0
760626, 1.0
999890, 1.0
432271, 1.0
908248, 1.0
843808, 1.0
830973, 1.0
821535, 1.0
129401, 1.0
945227, 1.0
907148, 1.0
333105, 1.0
597610, 1.0
656576, 1.0
572936, 1.0
791246, 1.0
539117, 1.0
84

In [284]:
from sklearn.metrics import roc_curve, roc_auc_score
import numpy as np

# Make predictions on the test data
predictions = classification_model.predict(X_test)

# Obtain decision scores for test data
decision_scores_test = classification_model.decision_function(X_test)

# Convert decision scores to probabilities using a sigmoid function
predicted_probabilities_test = 1 / (1 + np.exp(-decision_scores_test))

# Calculate the ROC curve and AUC for the test data
fpr, tpr, thresholds = roc_curve(predictions, predicted_probabilities_test)
auc_score = roc_auc_score(predictions, predicted_probabilities_test)

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC Curve (AUC = {auc_score:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()





ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.