In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA

In [2]:
# Loading the dataset
data = pd.read_csv('Breast_Cancer_dataset.csv')

In [3]:
data.head()

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4,Positive,Positive,24,1,60,Alive
1,50,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,35,Positive,Positive,14,5,62,Alive
2,58,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63,Positive,Positive,14,7,75,Alive
3,58,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,18,Positive,Positive,2,1,84,Alive
4,47,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,41,Positive,Positive,3,1,50,Alive


In [4]:
# Check for missing values
print("Missing values:", data.isnull().sum())

Missing values: Age                       0
Race                      0
Marital Status            0
T Stage                   0
N Stage                   0
6th Stage                 0
differentiate             0
Grade                     0
A Stage                   0
Tumor Size                0
Estrogen Status           0
Progesterone Status       0
Regional Node Examined    0
Reginol Node Positive     0
Survival Months           0
Status                    0
dtype: int64


In [5]:
# Perform label encoding for categorical variables
le = LabelEncoder()
categorical_columns = ['Race', 'Marital Status', 'T Stage ', 'N Stage', '6th Stage', 'differentiate','Grade', 'A Stage', 'Estrogen Status', 'Progesterone Status']
for column in categorical_columns:
    data[column] = le.fit_transform(data[column])

In [6]:
data.head()

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,2,1,0,0,0,1,3,1,4,1,1,24,1,60,Alive
1,50,2,1,1,1,2,0,2,1,35,1,1,14,5,62,Alive
2,58,2,0,2,2,4,0,2,1,63,1,1,14,7,75,Alive
3,58,2,1,0,0,0,1,3,1,18,1,1,2,1,84,Alive
4,47,2,1,1,0,1,1,3,1,41,1,1,3,1,50,Alive


In [7]:
# Identify numerical columns
numerical_columns = data.select_dtypes(include=[np.number]).columns.tolist()
numerical_columns 

['Age',
 'Race',
 'Marital Status',
 'T Stage ',
 'N Stage',
 '6th Stage',
 'differentiate',
 'Grade',
 'A Stage',
 'Tumor Size',
 'Estrogen Status',
 'Progesterone Status',
 'Regional Node Examined',
 'Reginol Node Positive',
 'Survival Months']

In [8]:
# Perform standardization on numerical features
scaler = StandardScaler()
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

In [9]:
data.head()

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,1.565253,0.39716,-0.349103,-1.025287,-0.632209,-1.043706,0.304678,1.351806,0.152963,-1.253661,0.267652,0.458107,1.190676,-0.618172,-0.492961,Alive
1,-0.443222,0.39716,-0.349103,0.281158,0.809974,0.535491,-0.679291,-0.205054,0.152963,0.214345,0.267652,0.458107,-0.044095,0.164807,-0.405695,Alive
2,0.449434,0.39716,-1.289389,1.587604,2.252157,2.114687,-0.679291,-0.205054,0.152963,1.540287,0.267652,0.458107,-0.044095,0.556296,0.16153,Alive
3,0.449434,0.39716,-0.349103,-1.025287,-0.632209,-1.043706,0.304678,1.351806,0.152963,-0.590691,0.267652,0.458107,-1.52582,-0.618172,0.554224,Alive
4,-0.777968,0.39716,-0.349103,0.281158,-0.632209,-0.254108,0.304678,1.351806,0.152963,0.498475,0.267652,0.458107,-1.402343,-0.618172,-0.929288,Alive


In [10]:
# Calculating statistics before outlier handling
stats_before = data[:-1].describe()
stats_before 

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months
count,4023.0,4023.0,4023.0,4023.0,4023.0,4023.0,4023.0,4023.0,4023.0,4023.0,4023.0,4023.0,4023.0,4023.0,4023.0
mean,0.000221,-9.9e-05,8.7e-05,-7e-05,0.000157,6.3e-05,0.000169,5.1e-05,-3.8e-05,6e-06,-6.7e-05,-0.000114,0.000226,0.000105,-0.000311
std,1.00015,1.000229,1.000233,1.000239,1.000199,1.000241,1.000191,1.000243,1.000246,1.000249,1.00024,1.000223,1.000146,1.000226,1.000054
min,-2.674862,-3.146457,-1.289389,-1.025287,-0.632209,-1.043706,-0.679291,-3.318774,-6.537517,-1.395727,-3.736189,-2.182898,-1.649297,-0.618172,-3.067291
25%,-0.777968,0.39716,-0.349103,-1.025287,-0.632209,-1.043706,-0.679291,-0.205054,0.152963,-0.685401,0.267652,0.458107,-0.66148,-0.618172,-0.667492
50%,0.003106,0.39716,-0.349103,0.281158,-0.632209,-0.254108,-0.679291,-0.205054,0.152963,-0.259205,0.267652,0.458107,-0.044095,-0.422427,0.074265
75%,0.78418,0.39716,-0.349103,0.281158,0.809974,0.535491,0.304678,1.351806,0.152963,0.35641,0.267652,0.458107,0.573291,0.164807,0.816021
max,1.676835,0.39716,2.471758,2.894049,2.252157,2.114687,2.272617,1.351806,0.152963,5.186625,0.267652,0.458107,5.759329,8.190338,1.557777


In [11]:
# Detect and remove outliers using z-score
z_scores = np.abs((data[numerical_columns] - data[numerical_columns].mean()) / data[numerical_columns].std())
threshold = 3
outliers = (z_scores > threshold).any(axis=1)
data = data[~outliers]

In [12]:
stats_after = data[:-1].describe()
stats_after 

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months
count,3253.0,3253.0,3253.0,3253.0,3253.0,3253.0,3253.0,3253.0,3253.0,3253.0,3253.0,3253.0,3253.0,3253.0,3253.0
mean,0.024201,0.245742,-0.037504,-0.086719,-0.140103,-0.146579,-0.015951,-0.068655,0.1529633,-0.107532,0.2676524,0.143102,-0.098488,-0.169758,0.066391
std,0.994389,0.495411,0.967754,0.928349,0.868454,0.88623,1.031908,0.951238,9.910264e-15,0.827568,1.132602e-14,0.856111,0.90192,0.667774,0.940558
min,-2.674862,-1.374649,-1.289389,-1.025287,-0.632209,-1.043706,-0.679291,-1.761914,0.1529633,-1.395727,0.2676524,-2.182898,-1.649297,-0.618172,-2.980026
25%,-0.777968,0.39716,-0.349103,-1.025287,-0.632209,-1.043706,-0.679291,-0.205054,0.1529633,-0.685401,0.2676524,0.458107,-0.784957,-0.618172,-0.580226
50%,0.114688,0.39716,-0.349103,0.281158,-0.632209,-0.254108,-0.679291,-0.205054,0.1529633,-0.353915,0.2676524,0.458107,-0.167572,-0.422427,0.117897
75%,0.895761,0.39716,-0.349103,0.281158,0.809974,0.535491,0.304678,-0.205054,0.1529633,0.214345,0.2676524,0.458107,0.449814,-0.030938,0.816021
max,1.676835,0.39716,2.471758,2.894049,2.252157,2.114687,2.272617,1.351806,0.1529633,2.818873,0.2676524,0.458107,2.919356,2.905232,1.557777


In [13]:
print(data.dtypes)

Age                       float64
Race                      float64
Marital Status            float64
T Stage                   float64
N Stage                   float64
6th Stage                 float64
differentiate             float64
Grade                     float64
A Stage                   float64
Tumor Size                float64
Estrogen Status           float64
Progesterone Status       float64
Regional Node Examined    float64
Reginol Node Positive     float64
Survival Months           float64
Status                     object
dtype: object


In [14]:
# Check for missing values
print("Missing values:", data.isnull().sum())

Missing values: Age                       0
Race                      0
Marital Status            0
T Stage                   0
N Stage                   0
6th Stage                 0
differentiate             0
Grade                     0
A Stage                   0
Tumor Size                0
Estrogen Status           0
Progesterone Status       0
Regional Node Examined    0
Reginol Node Positive     0
Survival Months           0
Status                    0
dtype: int64


In [15]:
# Separate the feature columns and the target column
X = data.drop('Status', axis=1)  # Feature columns
y = data['Status']  # Target column

# Perform dimensionality reduction using PCA on the feature columns
n_components = min(X.shape[0], X.shape[1])
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X)

# Create a new DataFrame with the transformed data
data_pca = pd.DataFrame(X_pca, columns=[f'PC{i+1}' for i in range(X_pca.shape[1])])

# Add the target column to the transformed data
data_pca['Status'] = y.reset_index(drop=True)
data=data_pca

In [16]:
explained_variance_ratio = pca.explained_variance_ratio_

# Create a DataFrame to store the feature importance
feature_importance = pd.DataFrame({'Principal Component': [f'PC{i+1}' for i in range(len(explained_variance_ratio))],
                                   'Importance': explained_variance_ratio})

# Sort the feature importance in descending order
feature_importance = feature_importance.sort_values('Importance', ascending=False)

print("Feature Importance:")
print(feature_importance)

Feature Importance:
  Principal Component  Importance
0                 PC1    0.244434
1                 PC2    0.134738
2                 PC3    0.105435
3                 PC4    0.097993
4                 PC5    0.091158
5                 PC6    0.085601
6                 PC7    0.072527
7                 PC8    0.067278
8                 PC9    0.053734


In [17]:
data.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,Status
0,-1.258819,-0.879866,1.575909,-0.144755,0.505622,0.014222,-0.693731,1.083315,1.813397,Alive
1,1.275074,-0.214337,-0.065022,-0.4267,0.242161,0.341089,-0.408078,-0.452554,-0.565554,Alive
2,3.835247,0.498049,0.016711,-0.159031,1.001781,-1.057101,-0.563487,-0.878287,-0.55283,Alive
3,-1.550394,-0.888127,-0.015958,0.356871,0.328323,-0.408989,0.558797,-1.15823,1.345096,Alive
4,0.047103,-0.625382,-1.40256,0.651266,0.782143,1.052871,0.28323,-0.845968,1.118803,Alive


In [18]:
# Check for missing values
print("Missing values:", data.isnull().sum())

Missing values: PC1       0
PC2       0
PC3       0
PC4       0
PC5       0
PC6       0
PC7       0
PC8       0
PC9       0
Status    0
dtype: int64


In [19]:
# Print the value counts of the target variable
print("Target variable value counts:")
print(data['Status'].value_counts())

Target variable value counts:
Alive    2878
Dead      376
Name: Status, dtype: int64


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler 

# Split the dataset into features (X) and target variable (y)
X = data.drop('Status', axis=1)
y = data['Status']

# Encode the target variable
le = LabelEncoder()
y = le.fit_transform(y)

# Oversample the minority class using RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

class KNN:
    def __init__(self, k):
        self.k = k
    
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
    
    def predict(self, X):
        predictions = []
        for _, x in X.iterrows():
            distances = np.sqrt(np.sum((self.X_train - x)**2, axis=1))
            indices = np.argsort(distances)[:self.k]
            k_nearest_labels = self.y_train[indices]
            prediction = np.bincount(k_nearest_labels).argmax()
            predictions.append(prediction)
        return predictions


models = {
    'KNN': KNN(k=5),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()
}

for name, model in models.items():
    model.fit(X_train, y_train)

# Make predictions and evaluate the models
results = []
for name, model in models.items():
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results.append([name, accuracy, precision, recall, f1])

# Display the results
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
print("Original Models:")
print(results_df)

Original Models:
               Model  Accuracy  Precision    Recall  F1 Score
0                KNN  0.878472   0.819505  0.972366  0.889415
1        Naive Bayes  0.752604   0.756098  0.749568  0.752819
2      Decision Tree  0.950521   0.910377  1.000000  0.953086
3      Random Forest  0.985243   0.971477  1.000000  0.985532
4  Gradient Boosting  0.848090   0.840067  0.861831  0.850810


In [21]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier


scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score)
}

# Hyperparameter tuning for KNN
knn_params = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

knn_grid = GridSearchCV(KNeighborsClassifier(), knn_params, cv=StratifiedKFold(n_splits=5), scoring=scoring, refit='f1', n_jobs=-1)
knn_grid.fit(X_train, y_train)

print("Best parameters for KNN:", knn_grid.best_params_)
print("Best score for KNN:", knn_grid.best_score_)

Best parameters for KNN: {'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
Best score for KNN: 0.9154826377843225


In [22]:
# Hyperparameter tuning for Gradient Boosting
gb_params = {
    'n_estimators': [50, 100, 200, 500],
    'learning_rate': [0.01, 0.1, 0.15, 0.2],
    'max_depth': [1,2, 3, 4, 5],
}

gb_grid = GridSearchCV(GradientBoostingClassifier(), gb_params, cv=StratifiedKFold(n_splits=5), scoring=scoring, refit='f1', n_jobs=-1)
gb_grid.fit(X_train, y_train)

print("Best parameters for Gradient Boosting:", gb_grid.best_params_)
print("Best score for Gradient Boosting:", gb_grid.best_score_)

Best parameters for Gradient Boosting: {'learning_rate': 0.15, 'max_depth': 5, 'n_estimators': 500}
Best score for Gradient Boosting: 0.9689691568521868


In [23]:
# Evaluate the models with best hyperparameters on the same test set
best_models = {
    'KNN': knn_grid.best_estimator_,
    'Gradient Boosting': gb_grid.best_estimator_
}

final_results = []
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    final_results.append([name, accuracy, precision, recall, f1])

# Display the final results
final_results_df = pd.DataFrame(final_results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
print("Tuned Models:")
print(final_results_df)


if hasattr(gb_grid.best_estimator_, 'feature_importances_'):
    importances = gb_grid.best_estimator_.feature_importances_
    feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
    print("Feature Importances (Gradient Boosting):")
    print(feature_importances.sort_values('Importance', ascending=False))

Tuned Models:
               Model  Accuracy  Precision  Recall  F1 Score
0                KNN  0.922743   0.866766     1.0  0.928629
1  Gradient Boosting  0.977431   0.957025     1.0  0.978041
Feature Importances (Gradient Boosting):
  Feature  Importance
5     PC6    0.269867
4     PC5    0.170146
0     PC1    0.164686
2     PC3    0.090111
7     PC8    0.077364
6     PC7    0.064476
8     PC9    0.057666
3     PC4    0.056816
1     PC2    0.048869
