<a href="https://colab.research.google.com/github/NK19Coder/Breast_cancer/blob/main/Breast_cancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import joblib

In [2]:
data = pd.read_csv('/content/breast-cancer.csv')

In [9]:
print(data.columns)
print(data.head())

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')
         id  diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302          1        17.99         10.38          122.80     1001.0   
1    842517          1        20.57         17.77          132.90     1326.0   
2  84300903          1        19.69         21.25          130.00     1203.0   
3  84348301          1        

In [10]:
data = data.dropna()

In [62]:
label_encoder = LabelEncoder()
data['diagnosis'] = label_encoder.fit_transform(data['diagnosis'])
categorical_columns = data.select_dtypes(include=['object']).columns
if 'id' in categorical_columns:
    categorical_columns = categorical_columns.drop('id')

for column in categorical_columns:
    data[column] = label_encoder.fit_transform(data[column])

In [12]:
X = data.drop(columns=['diagnosis'])
y = data['diagnosis']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(n_estimators=100),
    'Gradient Boosting': GradientBoostingClassifier(),
    'SVM': SVC(),
    'k-NN': KNeighborsClassifier()
}

In [15]:
results = {}

for name, model in models.items():
    cv_results = cross_val_score(model, X_train, y_train, cv=5, scoring='f1')
    results[name] = cv_results.mean()
    print(f'{name}: {cv_results.mean()}')

Logistic Regression: 0.21760000000000002
Decision Tree: 0.8841877442676676
Random Forest: 0.9394083452779105
Gradient Boosting: 0.9372016813968184
SVM: 0.010526315789473684
k-NN: 0.5225938153854515


In [16]:
best_model_name = max(results, key=results.get)
best_model = models[best_model_name]


In [17]:
print(f'Best model: {best_model_name} with F1 score: {results[best_model_name]}')

Best model: Random Forest with F1 score: 0.9394083452779105


In [18]:
best_model.fit(X_train, y_train)

In [19]:
y_pred = best_model.predict(X_test)

In [20]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Accuracy: 0.9649122807017544
Precision: 0.975609756097561
Recall: 0.9302325581395349
F1 Score: 0.9523809523809524


In [70]:
new_subject = {
    'id': 0,
    'radius_mean': 14.0,
    'texture_mean': 20.0,
    'perimeter_mean': 90.0,
    'area_mean': 700.0,
    'smoothness_mean': 0.1,
    'compactness_mean': 0.15,
    'concavity_mean': 0.2,
    'concave points_mean': 0.05,
    'symmetry_mean': 0.2,
    'fractal_dimension_mean': 0.06,
    'radius_se': 0.5,
    'texture_se': 1.0,
    'perimeter_se': 3.0,
    'area_se': 40.0,
    'smoothness_se': 0.005,
    'compactness_se': 0.025,
    'concavity_se': 0.03,
    'concave points_se': 0.01,
    'symmetry_se': 0.02,
    'fractal_dimension_se': 0.003,
    'radius_worst': 15.0,
    'texture_worst': 25.0,
    'perimeter_worst': 100.0,
    'area_worst': 800.0,
    'smoothness_worst': 0.12,
    'compactness_worst': 0.2,
    'concavity_worst': 0.25,
    'concave points_worst': 0.08,
    'symmetry_worst': 0.3,
    'fractal_dimension_worst': 0.08
}


new_subject_df = pd.DataFrame([new_subject])


column_names = new_subject_df.columns

prediction = best_model.predict(new_subject_df)


print(f'The prediction for the new subject is: {"Diagnoised with cancer" if prediction[0] == 1 else "Non-cancerous"}')

The prediction for the new subject is: Non-cancerous
