<a href="https://www.kaggle.com/code/mennatallah77/breast-cancer-with-5-models?scriptVersionId=165329861" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

![il_fullxfull.3860244894_p9az.avif](attachment:12f70bbc-4d08-49b6-b7c9-28369203cd73.avif)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest,f_classif
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix,classification_report,ConfusionMatrixDisplay

In [None]:
# Location of the CSV inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/breast-cancer-wisconsin-data/data.csv"
data = pd.read_csv(DATA_PATH)

# *Taking a peek at the data*

In [None]:
# Preview the first few rows to see which columns the CSV provides.
data.head()

In [None]:
# Summary statistics of the loaded data.
data.describe()

In [None]:
# Missing-value count per column (isna is the canonical spelling of isnull).
data.isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
data.duplicated().sum()

# *Dropping unnecessary columns*

In [None]:
# Remove the two columns the notebook treats as unnecessary.
# Rebinding instead of mutating in place, together with errors='ignore',
# keeps this cell re-runnable: a second execution no longer raises KeyError
# because the columns are already gone.
data = data.drop(columns=['Unnamed: 32','id'],errors='ignore')

In [None]:
# Separate the feature matrix from the target column.
X = data.drop(columns=['diagnosis'])
y = data['diagnosis']

# *Quick check on the target*

In [None]:
# Distinct labels present in the target column.
y.unique()

In [None]:
# Class balance: number of samples per diagnosis label.
y.value_counts()

In [None]:
# Visual check of how many samples fall under each diagnosis label.
sns.histplot(data=y)

In [None]:
# Map the diagnosis labels to integers: 'M' -> 1, 'B' -> 0.
# The encoding is always derived from the untouched `data['diagnosis']` column
# rather than from `y` itself, so re-running this cell gives the same result
# (mapping an already-encoded `y` a second time would turn every value into NaN).
y = data['diagnosis'].map({'M':1,'B':0})
y.value_counts()


In [None]:
# Preview the feature matrix (the frame with the 'diagnosis' column removed).
X.head()

# *Visualizing some features*

In [None]:
# Pairwise scatter plots of a handful of mean-value features, coloured by diagnosis.
pairplot_features = [
    'radius_mean',
    'texture_mean',
    'smoothness_mean',
    'compactness_mean',
    'concavity_mean',
]
sns.pairplot(data, hue='diagnosis', vars=pairplot_features)

# *Dropping some features*

In [None]:
# Heatmap of the pairwise feature correlations.
feature_corr = X.corr()
sns.heatmap(feature_corr)

In [None]:
def high_corr(data,threshold):
    """Report every pair of columns whose absolute correlation exceeds ``threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``corr()`` matrix is inspected.
    threshold : float
        Strict lower bound on ``abs(correlation)`` for a pair to be reported.

    Returns
    -------
    list of tuple
        ``(column_i, column_j)`` pairs taken from the lower triangle of the
        correlation matrix, i.e. ``column_i`` appears later in the matrix than
        ``column_j``. Each pair is also printed with its correlation value.
    """
    # Compute the correlation matrix once; recomputing it for every cell of the
    # nested loop repeats the same work many times over.
    corr = data.corr()
    columns = corr.columns

    highly_corr = []
    for i in range(len(columns)):
        # Only j < i: the matrix is symmetric and the diagonal is always 1.
        for j in range(i):
            value = corr.iloc[i,j]
            if abs(value) > threshold:
                print(f'({columns[i]},{columns[j]}) : {value}')
                highly_corr.append((columns[i],columns[j]))
    return highly_corr

# Collect the feature pairs whose absolute correlation is above 0.95.
high_corr_list = high_corr(data=X, threshold=0.95)


In [None]:
# Show the (feature, feature) pairs returned by high_corr for the 0.95 threshold.
high_corr_list

In [None]:
# Drop the perimeter and area features (presumably chosen from the
# highly-correlated pairs listed above - verify against that output).
# Rebinding with errors='ignore' keeps the cell re-runnable; the original
# in-place drop raised KeyError on a second execution.
X = X.drop(
    columns=['perimeter_mean','area_mean','perimeter_se','area_se','perimeter_worst','area_worst'],
    errors='ignore',
)

In [None]:
# Check the feature-matrix dimensions after dropping the perimeter/area columns.
X.shape

In [None]:
# Score every remaining feature against the target with the ANOVA F-test.
# k equals the number of columns, so nothing is filtered out here; the
# selector is only used for its per-feature scores.
n_features = X.shape[1]
selector = SelectKBest(score_func=f_classif, k=n_features).fit(X, y)
print(selector.scores_)

# One bar per feature, drawn individually, labelled with the column names.
for position in range(n_features):
    plt.bar(position, selector.scores_[position])
plt.xticks(ticks=range(n_features), labels=X.columns, rotation=90)
plt.show()

In [None]:
# Drop a further set of features (presumably the ones with the lowest F-scores
# in the bar chart above - verify against that plot).
# Rebinding with errors='ignore' keeps the cell re-runnable; the original
# in-place drop raised KeyError on a second execution.
X = X.drop(
    columns=['fractal_dimension_mean','texture_se','smoothness_se','symmetry_se','fractal_dimension_se'],
    errors='ignore',
)

In [None]:
# Check the feature-matrix dimensions after the second round of column drops.
X.shape

In [None]:
# Summary statistics of the features that remain after the drops.
X.describe()

In [None]:
# Standardise the features; the scaler is fitted on the full feature matrix,
# i.e. before any train/validation/test split is made.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# *Splitting the data*

In [None]:
# Hold out 20% as the final test set, then take 30% of the remainder as a
# validation set. The fixed random_state keeps the partition reproducible.
X_,X_test,Y_,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train,X_val,y_train,y_val = train_test_split(X_,Y_,test_size=0.3,random_state=42)

# X was standardised before this split using statistics of the whole dataset,
# which lets validation/test information leak into the training features.
# Re-fitting a scaler on the training rows only and applying it to every split
# removes that dependence: standardisation is an affine per-feature map, so
# (for features that vary within the training split, up to floating-point
# rounding) the result equals scaling the raw features with training statistics.
split_scaler = StandardScaler().fit(X_train)
X_train,X_val,X_test = (split_scaler.transform(part) for part in (X_train,X_val,X_test))
print(X_train.shape,X_val.shape,X_test.shape)

# *🌠Logistic Regression🌠*

In [None]:
# Baseline: logistic regression with default settings.
model = LogisticRegression()
model.fit(X_train, y_train)

# Predictions for the validation split and the held-out test split.
y_pred = model.predict(X_val)
predection = model.predict(X_test)

val_accuracy = accuracy_score(y_val, y_pred)
test_accuracy = accuracy_score(y_test, predection)
print(f'Val Accuracy: {val_accuracy}')
print(f'Test Accuracy: {test_accuracy}')

In [None]:
# Per-class precision / recall / F1 on the held-out test split.
report = classification_report(y_test, predection)
print(report)

# Confusion matrix; display labels follow the 0 -> 'B', 1 -> 'M' encoding of y.
cm = confusion_matrix(y_test, predection)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['B', 'M']).plot()

# *Easy hyperparameter tuning using GridSearch*

In [None]:
def grid_search(model,params,X_train,y_train,scoring='f1',cv=None,n_jobs=None):
    """Run an exhaustive cross-validated search and return the winning parameters.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible estimator to tune.
    params : dict
        Parameter grid handed to ``GridSearchCV``.
    X_train, y_train
        Training features and labels used for the cross-validated search.
    scoring : str, default 'f1'
        Metric used to rank parameter combinations.
    cv : optional
        Cross-validation strategy forwarded to ``GridSearchCV``; ``None``
        keeps scikit-learn's default.
    n_jobs : int, optional
        Number of parallel jobs forwarded to ``GridSearchCV``; ``None``
        keeps scikit-learn's default.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    # The defaults reproduce the original call GridSearchCV(model, params, scoring='f1').
    grid = GridSearchCV(model,params,scoring=scoring,cv=cv,n_jobs=n_jobs)
    grid.fit(X_train,y_train)
    return grid.best_params_

# *🌠KNN🌠*

In [None]:
# Search over the neighbourhood size (1..9) and the vote weighting scheme.
params = {
    'n_neighbors': range(1, 10),
    'weights': ['uniform', 'distance'],
}
model = KNeighborsClassifier()
best = grid_search(model, params, X_train, y_train)
best

In [None]:
# Refit KNN with the parameters selected by the grid search.
model = KNeighborsClassifier(
    n_neighbors=best['n_neighbors'],
    weights=best['weights'],
)
model.fit(X_train, y_train)

# Predictions for the validation split and the held-out test split.
y_pred = model.predict(X_val)
predection = model.predict(X_test)

val_accuracy = accuracy_score(y_val, y_pred)
test_accuracy = accuracy_score(y_test, predection)
print(f'Val Accuracy: {val_accuracy}')
print(f'Test Accuracy: {test_accuracy}')

In [None]:
# Per-class precision / recall / F1 on the held-out test split.
report = classification_report(y_test, predection)
print(report)

# Confusion matrix; display labels follow the 0 -> 'B', 1 -> 'M' encoding of y.
cm = confusion_matrix(y_test, predection)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['B', 'M']).plot()

# *🌠Random Forest🌠*

In [None]:
# A fixed seed makes the forest - and therefore the grid-search ranking -
# reproducible across runs; an unseeded forest can pick different "best"
# parameters every time the cell is executed.
model = RandomForestClassifier(random_state=42)
params = {'max_depth':[2,4,10,30,100], 'min_samples_split':[2,5,10,20,50], 'n_estimators':[10,50,100,200]}
best = grid_search(model,params,X_train,y_train)
best

In [None]:
# Refit the forest with the parameters selected by the grid search.
# random_state pins the bootstrap/feature sampling so the reported accuracies
# do not change from one run of the notebook to the next.
model = RandomForestClassifier(
    max_depth=best['max_depth'],
    min_samples_split=best['min_samples_split'],
    n_estimators=best['n_estimators'],
    random_state=42,
).fit(X_train,y_train)

# Predictions for the validation split and the held-out test split.
y_pred = model.predict(X_val)
predection = model.predict(X_test)
print(f'Val Accuracy: {accuracy_score(y_val, y_pred)}')
print(f'Test Accuracy: {accuracy_score(y_test, predection)}')

In [None]:
# Per-class precision / recall / F1 on the held-out test split.
report = classification_report(y_test, predection)
print(report)

# Confusion matrix; display labels follow the 0 -> 'B', 1 -> 'M' encoding of y.
cm = confusion_matrix(y_test, predection)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['B', 'M']).plot()

# *🌠XGBoost🌠*

In [None]:
# Search over ensemble size, learning rate and tree depth.
params = {
    'n_estimators': [10, 50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.3, 0.5],
    'max_depth': [2, 4, 10, 30, 100],
}
model = XGBClassifier()
best = grid_search(model, params, X_train, y_train)
best

In [None]:
# Refit XGBoost with the parameters selected by the grid search.
model = XGBClassifier(
    n_estimators=best['n_estimators'],
    learning_rate=best['learning_rate'],
    max_depth=best['max_depth'],
)
model.fit(X_train, y_train)

# Predictions for the validation split and the held-out test split.
y_pred = model.predict(X_val)
predection = model.predict(X_test)

val_accuracy = accuracy_score(y_val, y_pred)
test_accuracy = accuracy_score(y_test, predection)
print(f'Val Accuracy: {val_accuracy}')
print(f'Test Accuracy: {test_accuracy}')

In [None]:
# Per-class precision / recall / F1 on the held-out test split.
report = classification_report(y_test, predection)
print(report)

# Confusion matrix; display labels follow the 0 -> 'B', 1 -> 'M' encoding of y.
cm = confusion_matrix(y_test, predection)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['B', 'M']).plot()

# *🌠LGBM🌠*

In [None]:
# Search over ensemble size, learning rate and tree depth.
params = {
    'n_estimators': [10, 50, 100],
    'learning_rate': [0.01, 0.1, 0.001],
    'max_depth': [2, 4, 10],
}
model = LGBMClassifier()
best = grid_search(model, params, X_train, y_train)
best

In [None]:
# Refit LightGBM with the parameters selected by the grid search.
model = LGBMClassifier(
    n_estimators=best['n_estimators'],
    learning_rate=best['learning_rate'],
    max_depth=best['max_depth'],
)
model.fit(X_train, y_train)

# Predictions for the validation split and the held-out test split.
y_pred = model.predict(X_val)
predection = model.predict(X_test)

val_accuracy = accuracy_score(y_val, y_pred)
test_accuracy = accuracy_score(y_test, predection)
print(f'Val Accuracy: {val_accuracy}')
print(f'Test Accuracy: {test_accuracy}')

In [None]:
# Per-class precision / recall / F1 on the held-out test split.
report = classification_report(y_test, predection)
print(report)

# Confusion matrix; display labels follow the 0 -> 'B', 1 -> 'M' encoding of y.
cm = confusion_matrix(y_test, predection)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['B', 'M']).plot()

# ***✨As a wrap up✨***

# *Models Used:*
* *Logistic regression*
* *KNN*
* *Random Forest*
* *XGBoost* 
* *LGBM*

# *Best Accuracy: 96.49%*