In [None]:
import numpy as np
import pandas as pd

In [None]:
from sklearn.model_selection import train_test_split, KFold

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [None]:
df = pd.read_csv('/content/mammographic_mass.csv')
df = df.drop('id', axis = 1)
df

Unnamed: 0,BI-RADS>4,Age>60,Shape>2.5,Margin>1.5,Severity
0,True,True,True,True,True
1,True,False,True,True,True
2,False,False,False,False,False
3,True,False,False,True,True
4,True,True,False,True,True
...,...,...,...,...,...
825,False,False,False,False,False
826,False,False,True,True,True
827,False,True,True,True,False
828,True,True,True,True,True


In [None]:
X=df.drop(['Severity'], axis=1).values
y=df['Severity'].values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=45)

# KNN Classifier

In [None]:
param_grid={'n_neighbors':np.arange(1,50)}

In [None]:
knn = KNeighborsClassifier()

In [None]:
grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy', return_train_score=False)
grid.fit(X, y)

In [None]:
sklearn.metrics.get_scorer_names()

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_negative_likelihood_ratio',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'positive_likelihood_ratio',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',

In [None]:
print(grid.best_score_)
print(grid.best_params_)

0.844578313253012
{'n_neighbors': 9}


In [None]:
from sklearn.metrics import classification_report

In [None]:
knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.86      0.87      0.87       137
        True       0.87      0.86      0.86       137

    accuracy                           0.86       274
   macro avg       0.86      0.86      0.86       274
weighted avg       0.86      0.86      0.86       274



# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

param_grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]} # l1 lasso l2 ridge

logreg=LogisticRegression()

grid = GridSearchCV(logreg, param_grid, cv=10, scoring='accuracy', return_train_score=False)
grid.fit(X, y)

print(grid.best_score_)
print(grid.best_params_)

0.844578313253012
{'C': 0.1, 'penalty': 'l2'}


70 fits failed out of a total of 140.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
70 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.84457831        nan 0.8

In [None]:
logreg = LogisticRegression(C= 0.1, penalty = 'l2')
logreg.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.86      0.87      0.87       137
        True       0.87      0.86      0.86       137

    accuracy                           0.86       274
   macro avg       0.86      0.86      0.86       274
weighted avg       0.86      0.86      0.86       274



# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rfc=RandomForestClassifier()

In [None]:
param_grid = {
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
grid = GridSearchCV(estimator=rfc, param_grid = param_grid, cv= 10)
grid.fit(X, y)
print(grid.best_score_)
print(grid.best_params_)

0.844578313253012
{'criterion': 'gini', 'max_depth': 4, 'max_features': 'auto', 'n_estimators': 200}


In [None]:
rfc = RandomForestClassifier(criterion = 'gini', max_depth = 4, max_features = 'auto', n_estimators = 200)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.87      0.87      0.87       137
        True       0.87      0.87      0.87       137

    accuracy                           0.87       274
   macro avg       0.87      0.87      0.87       274
weighted avg       0.87      0.87      0.87       274



# SVM

In [None]:
from sklearn.svm import SVC

In [None]:
param_grid = {'C': [1, 10], 'gamma': [0.001, 0.01, 1]}

In [None]:
svm = SVC()

In [None]:
grid = GridSearchCV(estimator=svm, param_grid = param_grid, cv= 10)
grid.fit(X, y)
print(grid.best_score_)
print(grid.best_params_)

0.8409638554216867
{'C': 1, 'gamma': 1}


In [None]:
svm = SVC(C = 1, gamma = 1)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.87      0.87      0.87       137
        True       0.87      0.87      0.87       137

    accuracy                           0.87       274
   macro avg       0.87      0.87      0.87       274
weighted avg       0.87      0.87      0.87       274



# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

In [None]:
gaussion_nb = GaussianNB()
gaussion_nb.fit(X_train, y_train)
y_pred = gaussion_nb.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.86      0.87      0.87       137
        True       0.87      0.86      0.86       137

    accuracy                           0.86       274
   macro avg       0.86      0.86      0.86       274
weighted avg       0.86      0.86      0.86       274



In [None]:
multinomial_nb = MultinomialNB()
multinomial_nb.fit(X_train, y_train)
y_pred = multinomial_nb.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.77      0.92      0.84       137
        True       0.90      0.72      0.80       137

    accuracy                           0.82       274
   macro avg       0.83      0.82      0.82       274
weighted avg       0.83      0.82      0.82       274



In [None]:
bernoulli_nb = BernoulliNB()
bernoulli_nb.fit(X_train, y_train)
y_pred = bernoulli_nb.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.89      0.80      0.84       137
        True       0.81      0.90      0.85       137

    accuracy                           0.85       274
   macro avg       0.85      0.85      0.85       274
weighted avg       0.85      0.85      0.85       274



# Boosting

In [None]:
from xgboost import XGBClassifier

In [None]:
xgb = XGBClassifier()

In [None]:
param_grid = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

In [None]:
grid = GridSearchCV(estimator=xgb, param_grid = param_grid, cv= 10)
grid.fit(X, y)
print(grid.best_score_)
print(grid.best_params_)

0.8493975903614459
{'colsample_bytree': 0.6, 'gamma': 0.5, 'max_depth': 3, 'min_child_weight': 1, 'subsample': 0.6}


In [None]:
xgb = XGBClassifier(colsample_bytree = 0.6, gamma = 0.5, max_depth =  3, min_child_weight = 1, subsample =  0.6)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.87      0.87      0.87       137
        True       0.87      0.87      0.87       137

    accuracy                           0.87       274
   macro avg       0.87      0.87      0.87       274
weighted avg       0.87      0.87      0.87       274

