In [3]:
# Step 1: Import libraries and dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the dataset
file_path = 'breast cancer.csv'
data = pd.read_csv(file_path)
print("Dataset loaded successfully.")
print(data.head())

Dataset loaded successfully.
         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave_points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  radius_worst  texture_wor

In [5]:
# Step 2: Data Preprocessing
# Handle missing values (if any)
print("Checking for missing values...")
print(data.isnull().sum())

Checking for missing values...
id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave_points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave_points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave_points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64


In [7]:
# Dropping missing values (or you could fill them if appropriate)
data = data.dropna()
print("Missing values handled.")

Missing values handled.


In [9]:
# Splitting the dataset into features and target variable
X = data.drop(columns='diagnosis')
y = data['diagnosis']

In [11]:
# Encoding the target variable if it's not numeric (assuming 'diagnosis' is categorical)
y = y.map({'M': 1, 'B': 0})

In [13]:
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Dataset split into training and testing sets.")

Dataset split into training and testing sets.


In [15]:
# Standardizing the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print("Features standardized.")

Features standardized.


In [17]:
# Step 3: K-NN Algorithm
# Choosing an appropriate value for 'k'
k = 5
knn = KNeighborsClassifier(n_neighbors=k)

In [19]:
# Applying the model to the training dataset
knn.fit(X_train, y_train)
print(f"K-NN model trained with k={k}.")

K-NN model trained with k=5.


In [21]:
# Step 4: Model Evaluation
# Using the testing set to assess the model's accuracy, precision, recall, and F1 score
y_pred = knn.predict(X_test)

In [23]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [25]:
print(f"Model Evaluation Metrics:\nAccuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}\nF1 Score: {f1}")

Model Evaluation Metrics:
Accuracy: 0.9473684210526315
Precision: 0.9302325581395349
Recall: 0.9302325581395349
F1 Score: 0.9302325581395349


In [27]:
# Adjust 'k' if needed for better results
# This part can be done by trying multiple values for 'k' and checking the metrics

In [29]:
# Step 5: Testing
# Test the model with a different set of data (Here, we use the test set itself for demonstration)
sample_data = X_test[:5]  # Replace this with any new data you want to test
sample_predictions = knn.predict(sample_data)
print("Sample predictions:", sample_predictions)

Sample predictions: [0 1 1 0 0]
