# Triple Breast Classification Model

## 1. Import Necessary Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import RFE, RFECV
from sklearn import set_config
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier

import numpy as np


## 2. Load Data

In [2]:
train_data = pd.read_csv('training.csv')
test_data = pd.read_csv('test.csv')

In [3]:
# Checking a sample of the dataset
train_data.sample(5)

Unnamed: 0,patient_id,patient_race,payer_type,patient_state,patient_zip3,patient_age,patient_gender,bmi,breast_cancer_diagnosis_code,breast_cancer_diagnosis_desc,...,disabled,poverty,limited_english,commute_time,health_uninsured,veteran,Ozone,PM25,N02,DiagPeriodL90D
7741,642807,White,,MT,591,70,F,,1749,"Malignant neoplasm of breast (female), unspeci...",...,12.9,10.125,0.55,18.075,5.575,10.05,41.111066,5.94565,8.393097,0
3462,947214,White,MEDICARE ADVANTAGE,KY,402,69,F,,C50112,Malignant neoplasm of central portion of left ...,...,14.540625,18.628125,2.4375,21.634375,5.9875,6.6125,38.654161,8.346408,20.491883,1
7653,146339,White,MEDICAID,PA,166,40,F,27.29,1749,"Malignant neoplasm of breast (female), unspeci...",...,18.271212,10.484848,0.187879,31.132258,5.663636,8.848485,40.358717,6.89074,10.237538,1
6005,578660,,COMMERCIAL,CA,926,66,F,,C50012,"Malignant neoplasm of nipple and areola, left ...",...,8.845238,8.688095,5.280952,27.561905,4.404762,4.809524,42.070075,7.229393,15.894123,1
1511,188150,White,COMMERCIAL,CA,926,64,F,32.92,C50511,Malig neoplm of lower-outer quadrant of right ...,...,8.845238,8.688095,5.280952,27.561905,4.404762,4.809524,42.070075,7.229393,15.894123,1


In [4]:
test_data.sample(5)

Unnamed: 0,patient_id,patient_race,payer_type,patient_state,patient_zip3,patient_age,patient_gender,bmi,breast_cancer_diagnosis_code,breast_cancer_diagnosis_desc,...,hispanic,disabled,poverty,limited_english,commute_time,health_uninsured,veteran,Ozone,PM25,N02
3516,694516,,COMMERCIAL,IL,600,50,F,,C50411,Malig neoplm of upper-outer quadrant of right ...,...,15.716923,9.207692,7.010769,4.478462,30.3875,5.933846,6.790769,37.831775,7.686577,22.917447
294,502155,Other,COMMERCIAL,MO,655,58,F,,C50112,Malignant neoplasm of central portion of left ...,...,2.16129,21.629032,16.503226,0.13871,26.132,14.558065,12.422581,38.199809,6.62873,5.23743
4216,951296,Hispanic,,CA,917,41,F,30.58,1749,"Malignant neoplasm of breast (female), unspeci...",...,47.726087,9.895652,10.515217,12.745652,32.530435,7.263043,3.81087,47.310325,9.595719,20.084231
1876,989293,Hispanic,MEDICAID,CA,913,57,F,,C50412,Malig neoplasm of upper-outer quadrant of left...,...,35.332432,9.945946,9.222222,7.658333,31.824324,6.562162,3.82973,44.976559,8.276922,15.799885
2164,475383,,COMMERCIAL,TX,780,46,F,40.82,C50911,Malignant neoplasm of unsp site of right femal...,...,57.173585,15.671698,15.677358,9.660377,28.72,17.84717,9.024528,36.45954,7.519831,9.140613


In [5]:
# Print the shape of the datasets
print(f'The shape of the train dataset is {(len(train_data), len(train_data.columns))}')
print(f'The shape of the test dataset is {(len(test_data), len(test_data.columns))}')

The shape of the train dataset is (12906, 83)
The shape of the test dataset is (5792, 82)


## 3. Feature Engineering

### 3.1 Handling Missing Values

In [6]:
# Check Missing values
print(f'Total number of missing values is {train_data.isnull().sum().sum()}')
print(f'Total number of missing values is {test_data.isnull().sum().sum()}')

Total number of missing values is 43292
Total number of missing values is 19366


In [7]:
# Replace missing values

cleaned_train_data = train_data.fillna(method='bfill', axis=0)
cleaned_test_data = test_data.fillna(method='bfill', axis=0)

In [8]:
# Print the shape of the cleaned datasets
print(f'The sum of null values in the cleaned train dataset is {cleaned_train_data.isnull().sum().sum()}')
print(f'The sum of null values in the cleaned test dataset is {cleaned_test_data.isnull().sum().sum()}')

The sum of null values in the cleaned train dataset is 111
The sum of null values in the cleaned test dataset is 114


In [9]:
# Drop the remaining missing values
cleaned_train_data.fillna(method='ffill', axis=0, inplace=True)
cleaned_test_data.fillna(method='ffill', axis=0, inplace=True)

In [10]:
# Splitting the data
X = cleaned_train_data.drop(columns='DiagPeriodL90D', axis=1)
y = cleaned_train_data['DiagPeriodL90D']

### 3.2 One-Hot Encoding

In [11]:
string_columns_train = X.select_dtypes(include=['object']).columns.tolist()

from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Apply label encoding to each categorical column
for column in string_columns_train:
    X[column] = label_encoder.fit_transform(X[column])


In [12]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

### 3.3 Feature Scaling

* Standardization

In [13]:

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val) # transform is used because the transformation has been learnt from the training data

## 4. Model Creation Pipeline

In [14]:
# Create the pipeline
svc = SVC(kernel="linear", C=1)

stacked_model = Pipeline([
    ("feature_selection", RFE(estimator=svc, step=1, n_features_to_select=1)),
    ("classifier", svc)
])

In [15]:
set_config(display='diagram')
stacked_model

In [16]:
# Fitting the model
classification_model = stacked_model.fit(X_train, y_train)

### 4.1 Cross-Validation

In [None]:

'''
from sklearn.model_selection import cross_val_score

#Perform 5-fold cross-validation
cv_scores = cross_val_score(classification_model, X_train, y_train, cv=2)

print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())
'''


KeyboardInterrupt: 

## 5. Model Prediction and Accuracy

In [None]:
# Make predictions on the test set
y_pred = stacked_model.predict(X_val)
y_pred = np.round(y_pred)

# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
print(f"RFC Accuracy: {accuracy * 100:.2f}%")

# Calculate ROC AUC score
roc_auc = roc_auc_score(y_val, y_pred)
print(f"ROC AUC Score: {roc_auc:.4f}")

NotFittedError: This RFE instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
X_test = cleaned_test_data  # No need to drop 'DiagPeriodL90D' as it's not present in test data
string_columns_test = X_test.select_dtypes(include=['object']).columns.tolist()

# Apply label encoding to each categorical column
for column in string_columns_test:
    X_test[column] = label_encoder.fit_transform(X_test[column])


In [None]:
# Make predictions on the test data
predictions_test = stacked_model.predict(X_test)

# Create a DataFrame to store patient IDs and rounded predictions
results_df = pd.DataFrame({'patient_id': X_test['patient_id'], 'prediction': predictions_test})

print("patient_id", "DiagPeriodL90D")
# Print predictions with patient IDs
for index, row in results_df.iterrows():
    print(f"{row['patient_id']}, {int(row['prediction'])}")

# Save results to a CSV file
results_df.to_csv('predictions.csv', index=False)



patient_id DiagPeriodL90D
573710, 1.0
593679, 0.0
184532, 0.0
447383, 1.0
687972, 1.0
281312, 1.0
492714, 1.0
378266, 1.0
291550, 1.0
612272, 1.0
240105, 1.0
277939, 1.0
504153, 1.0
287269, 1.0
108727, 0.0
598629, 1.0
805201, 1.0
565624, 1.0
689369, 0.0
252028, 1.0
830503, 1.0
777454, 1.0
931410, 1.0
894910, 0.0
257477, 0.0
373935, 1.0
929645, 1.0
164064, 1.0
558677, 1.0
707003, 1.0
289528, 0.0
340932, 0.0
182933, 1.0
887761, 1.0
249345, 1.0
750357, 0.0
162816, 1.0
346740, 1.0
496264, 0.0
900330, 1.0
630418, 1.0
582166, 0.0
397291, 0.0
617035, 0.0
903270, 1.0
628977, 0.0
664844, 1.0
571761, 1.0
284488, 1.0
573502, 1.0
914179, 0.0
533295, 1.0
673470, 0.0
394209, 1.0
165379, 0.0
335343, 1.0
521842, 1.0
513091, 1.0
144195, 0.0
114345, 0.0
611289, 1.0
430995, 1.0
755430, 0.0
170191, 1.0
739107, 0.0
760626, 1.0
999890, 1.0
432271, 1.0
908248, 1.0
843808, 1.0
830973, 0.0
821535, 1.0
129401, 0.0
945227, 1.0
907148, 1.0
333105, 0.0
597610, 1.0
656576, 1.0
572936, 1.0
791246, 1.0
539117, 1.0
84