In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.ensemble import VotingClassifier
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

In [37]:
df = pd.read_csv('data.csv')
df_Original = df.copy()
df.drop(['id', 'Unnamed: 32'], axis=1, inplace=True)

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                569 non-null    object 
 1   radius_mean              569 non-null    float64
 2   texture_mean             569 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                569 non-null    float64
 5   smoothness_mean          569 non-null    float64
 6   compactness_mean         569 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave points_mean      569 non-null    float64
 9   symmetry_mean            569 non-null    float64
 10  fractal_dimension_mean   569 non-null    float64
 11  radius_se                569 non-null    float64
 12  texture_se               569 non-null    float64
 13  perimeter_se             569 non-null    float64
 14  area_se                  5

In [39]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['diagnosis'] = le.fit_transform(df['diagnosis'])

In [40]:
df.drop(['texture_se', 'symmetry_se', 'fractal_dimension_se'], axis=1, inplace=True)
X = df.drop('diagnosis', axis=1)
y = df['diagnosis']

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [41]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier


# Function to print all relevant metrics
def print_metrics(y_true, y_pred, model_name):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)
    print(f"{model_name} Metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("Confusion Matrix:")
    print(cm)
    print("-" * 30)

# Initialize and fit the XGBoost model
xgb_model = XGBClassifier(n_estimators=3, max_depth=4, learning_rate=0.8, objective='binary:logistic')
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)
print_metrics(y_test, xgb_preds, "XGBoost")

# Initialize and fit the LightGBM model
lgbm_model = LGBMClassifier(boosting_type='gbdt', num_leaves=31, max_depth=-1, learning_rate=0.1, n_estimators=100)
lgbm_model.fit(X_train, y_train)
lgbm_preds = lgbm_model.predict(X_test)
lgbm_preds
print_metrics(y_test, lgbm_preds, "LightGBM")

# Initialize and fit the Bagging Classifier model
base_classifier = LogisticRegression(solver='liblinear')
bagging_classifier = BaggingClassifier(base_estimator=base_classifier, n_estimators=10, random_state=123)
cv_scores = cross_val_score(bagging_classifier, X_train, y_train, cv=5, scoring='accuracy')
mean_cv_score = cv_scores.mean()
print(f'Bagging Classifier Mean Accuracy (Cross-Validation): {mean_cv_score:.2f}')




XGBoost Metrics:
Accuracy: 0.9591
Precision: 0.9516
Recall: 0.9365
F1 Score: 0.9440
Confusion Matrix:
[[105   3]
 [  4  59]]
------------------------------
[LightGBM] [Info] Number of positive: 149, number of negative: 249
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000443 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3579
[LightGBM] [Info] Number of data points in the train set: 398, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.374372 -> initscore=-0.513507
[LightGBM] [Info] Start training from score -0.513507
LightGBM Metrics:
Accuracy: 0.9591
Precision: 0.9375
Recall: 0.9524
F1 Score: 0.9449
Confusion Matrix:
[[104   4]
 [  3  60]]
------------------------------




Bagging Classifier Mean Accuracy (Cross-Validation): 0.94




#**- Blending**

---
**Stacked Model**: An ensemble of models in which the meta-model is trained on out-of-fold predictions made by the base models during k-fold cross validation.

**Blended Model**: An ensemble of models in which the meta-model is trained on predictions made by the base models on a holdout dataset (e.g., the validation dataset).


In [42]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=12345)

In [43]:
models = [('dtc', DecisionTreeClassifier()),
        ('gnb', ExtraTreesClassifier()),
        ('ada', AdaBoostClassifier()),
        ('rf', RandomForestClassifier())]

In [44]:
def fit_models(models, X_train, X_valid, y_train, y_valid):

    #Create variable in which to store predictions for meta-model.
    preds_for_meta = []

    #Loop through models in model list.
    for name, model in tqdm(models):

        #Fit model and obtain predictions.
        model.fit(X_train, y_train)
        pred = model.predict_proba(X_valid)[:, 1]

        #Obtain base moedl roc score.
        roc_base = roc_auc_score(y_valid, pred)

        print(f'{model} score: {roc_base}')

        #Reshape prediction into single-column matrix.
        pred = pred.reshape(len(pred), 1)

        #Append prediction to varible for meta-model.
        preds_for_meta.append(pred)

    #Create 2D array from predictions.
    meta_features = np.hstack(preds_for_meta)

    #Define blender for model.
    meta_model = xgb.XGBClassifier(n_estimators=7000,
                                 #tree_method='gpu_hist',
                                 #gpu_id = 0,
                                 random_state = 5,
                                 learning_rate=.03)

    #Fit meta model on predictions from base models.
    meta_model.fit(meta_features, y_valid.values.ravel(),
                 verbose=False,
                 eval_set=[(meta_features, y_valid.values.ravel())],
                 eval_metric='auc',
                 early_stopping_rounds=300)

    print(f'Meta AUC: {roc_auc_score(y_valid, meta_model.predict_proba(meta_features)[:, 1])}')

    return meta_model

def meta_predict(models, meta_model, X_test, threshold=0.5):
    preds_for_meta = []

    for name, model in tqdm(models):
        pred = model.predict(X_test)

        pred = pred.reshape(len(pred), 1)

        preds_for_meta.append(pred)

    meta_features = np.hstack(preds_for_meta)

    meta_preds = meta_model.predict_proba(meta_features)[:, 1]

    binary_preds = (meta_preds >= threshold).astype(int)

    return binary_preds


In [45]:
meta_model = fit_models(models, X_train, X_valid, y_train, y_valid)

  0%|          | 0/4 [00:00<?, ?it/s]

DecisionTreeClassifier() score: 0.8574168797953964


 50%|█████     | 2/4 [00:00<00:00, 10.95it/s]

ExtraTreesClassifier() score: 0.9879582267689685
AdaBoostClassifier() score: 0.9731457800511509


100%|██████████| 4/4 [00:00<00:00,  6.35it/s]

RandomForestClassifier() score: 0.9757033248081841
Meta AUC: 0.9995737425404945





In [46]:

test_pred = meta_predict(models, meta_model, X_valid)


100%|██████████| 4/4 [00:00<00:00, 108.01it/s]


In [47]:
test_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1])

# Voting Classifier

---
- Using RF, LR and DT


In [48]:
# # with the following function we can select highly correlated features
# # it will remove the first feature that is correlated with anything other feature

# def correlation(dataset, threshold):
#     col_corr = set()  # Set of all the names of correlated columns
#     corr_matrix = dataset.corr()
#     for i in range(len(corr_matrix.columns)):
#         for j in range(i):
#             if corr_matrix.iloc[i, j] > threshold: # we are interested in absolute coeff value
#                 colname = corr_matrix.columns[i]  # getting the name of column
#                 col_corr.add(colname)
#     return col_corr

# corr_features = correlation(X_train, 0.7)

# print(len(set(corr_features)))

# print(corr_features)

# #Effect
# X_train.drop(corr_features,axis=1, inplace=True)
# X_test.drop(corr_features,axis=1, inplace=True)
# X_train.columns

from sklearn.feature_selection import VarianceThreshold

var_thres=VarianceThreshold(threshold=0.7)
var_thres.fit(X_train)
print(var_thres.get_support())
sum(var_thres.get_support())
selected_features = X_train.columns[var_thres.get_support()]
print(selected_features)

constant_columns = [column for column in X_train.columns
                    if column not in X_train.columns[var_thres.get_support()]]

print("Number of columns to be eliminated: ", len(constant_columns))
for column in constant_columns:
    print(column)

#Effect
X_train.drop(constant_columns,axis=1, inplace=True)
X_test.drop(constant_columns,axis=1, inplace=True)
X_train.columns

from sklearn.feature_selection import VarianceThreshold

var_thres=VarianceThreshold(threshold=0.7)
var_thres.fit(X_train)
print(var_thres.get_support())
sum(var_thres.get_support())
selected_features = X_train.columns[var_thres.get_support()]
print(selected_features)

constant_columns = [column for column in X_train.columns
                    if column not in X_train.columns[var_thres.get_support()]]

print("Number of columns to be eliminated: ", len(constant_columns))
for column in constant_columns:
    print(column)

#Effect
X_train.drop(constant_columns,axis=1, inplace=True)
X_test.drop(constant_columns,axis=1, inplace=True)
X_train.columns

[ True  True  True  True False False False False False False False  True
  True False False False False  True  True  True  True False False False
 False False False]
Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'perimeter_se', 'area_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst'],
      dtype='object')
Number of columns to be eliminated:  17
smoothness_mean
compactness_mean
concavity_mean
concave points_mean
symmetry_mean
fractal_dimension_mean
radius_se
smoothness_se
compactness_se
concavity_se
concave points_se
smoothness_worst
compactness_worst
concavity_worst
concave points_worst
symmetry_worst
fractal_dimension_worst
[ True  True  True  True  True  True  True  True  True  True]
Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'perimeter_se', 'area_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst'],
      dtype='object')
Number of columns to be eliminated:  0


Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'perimeter_se', 'area_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst'],
      dtype='object')

In [53]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)
# print_metrics(y_test, rf_preds, "Random Forest")
print(classification_report(y_test, rf_preds))
cross_val_score(rf, X_train, y_train, cv=5, scoring='accuracy').mean()


              precision    recall  f1-score   support

           0       0.98      1.00      0.99       108
           1       1.00      0.97      0.98        63

    accuracy                           0.99       171
   macro avg       0.99      0.98      0.99       171
weighted avg       0.99      0.99      0.99       171



0.9437209302325582

In [54]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_preds = dt.predict(X_test)
# print_metrics(y_test, dt_preds, "Decision Tree")
print(classification_report(y_test, dt_preds))
cross_val_score(dt, X_train, y_train, cv=5, scoring='accuracy').mean()


              precision    recall  f1-score   support

           0       0.99      0.97      0.98       108
           1       0.95      0.98      0.97        63

    accuracy                           0.98       171
   macro avg       0.97      0.98      0.98       171
weighted avg       0.98      0.98      0.98       171



0.9132147742818058

In [55]:
# prompt: Employ Logistic regression classification

lr = lr = LogisticRegression(solver='liblinear')
lr.fit(X_train, y_train)
lr_preds = lr.predict(X_test)
# print_metrics(y_test, lr_preds, "Logistic Regression")
print(classification_report(y_test, lr_preds))
cross_val_score(lr, X_train, y_train, cv=5, scoring='accuracy').mean()

              precision    recall  f1-score   support

           0       0.97      0.98      0.98       108
           1       0.97      0.95      0.96        63

    accuracy                           0.97       171
   macro avg       0.97      0.97      0.97       171
weighted avg       0.97      0.97      0.97       171



0.9320109439124487

In [59]:
vc = VotingClassifier(estimators=[('rf', rf), ('dt', dt), ('lr', lr)], voting='soft')
cv_scores = cross_val_score(vc, X_train, y_train, cv=5, scoring='accuracy')
print(classification_report(y_test, vc.fit(X_train, y_train).predict(X_test)))
cross_val_score(vc, X_train, y_train, cv=5, scoring='accuracy').mean()

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       108
           1       1.00      1.00      1.00        63

    accuracy                           1.00       171
   macro avg       1.00      1.00      1.00       171
weighted avg       1.00      1.00      1.00       171



0.94374829001368

# - **Support Vector Machine (SVM):**

In [61]:
from sklearn.svm import SVC

model = SVC(kernel='linear')
model.fit(X_train, y_train)
pred = model.predict(X_test)
accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred)
recall = recall_score(y_test, pred)
f1 = f1_score(y_test, pred)
# accuracy
print(classification_report(y_test, pred))
cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy').mean()

              precision    recall  f1-score   support

           0       0.96      0.99      0.98       108
           1       0.98      0.94      0.96        63

    accuracy                           0.97       171
   macro avg       0.97      0.96      0.97       171
weighted avg       0.97      0.97      0.97       171



0.9319562243502052