<a href="https://colab.research.google.com/github/Narayanan7669/Ml/blob/main/Project4_Breast_Cancer_Diagnosis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- 1️⃣ Imports and Setup ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,
roc_auc_score, roc_curve, classification_report)
from sklearn.decomposition import PCA


# Optional: Handle imbalance
!pip install -q imbalanced-learn
from imblearn.over_sampling import SMOTE


# --- 2️⃣ Load Dataset ---
df = pd.read_csv('data.csv')
print('Shape:', df.shape)
df.head()


# --- Identify Target Column ---
possible_targets = ['diagnosis','target','label','class']
target_col = [c for c in possible_targets if c in df.columns]
target_col = target_col[0] if target_col else df.columns[-1]


X = df.drop(columns=[target_col, 'Unnamed: 32'])
y = df[target_col]


if y.dtype == 'object':
    y = LabelEncoder().fit_transform(y)


# --- Basic Preprocessing ---
num_cols = X.select_dtypes(include=[np.number]).columns
imputer = SimpleImputer(strategy='mean')
X[num_cols] = imputer.fit_transform(X[num_cols])
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

Shape: (569, 33)


In [7]:
from google.colab import drive
drive.mount('/content/data.csv')

ValueError: Mountpoint must either be a directory or not exist


## 🔍 Q1. What features are most predictive?

**Explanation:** Features highly correlated with the diagnosis (target) are most predictive.

In [None]:
import seaborn as sns

correlation = df.drop('diagnosis', axis=1).corr()
plt.figure(figsize=(10,6))
sns.heatmap(correlation, cmap='coolwarm')
plt.title('Feature Correlation Heatmap')
plt.show()

## ⚙️ Q2. How does KNN work?
**Explanation:** KNN predicts the class of a data point by looking at its *k nearest neighbors* in feature space.


In [None]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

## 📊 Q3. What is accuracy, precision, recall?
**Explanation:**
- **Accuracy** = correct predictions / total predictions
- **Precision** = true positives / (true positives + false positives)
- **Recall** = true positives / (true positives + false negatives)

In [None]:
print('Accuracy:', accuracy_score(y_test, y_pred_knn))
print('Precision:', precision_score(y_test, y_pred_knn))
print('Recall:', recall_score(y_test, y_pred_knn))

## ⚖️ Q4. How to choose K in KNN?
**Explanation:** The K with the lowest error rate is chosen for best performance.

In [None]:
error_rate = []
for k in range(1, 21):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    pred_k = knn.predict(X_test)
    error_rate.append(np.mean(pred_k != y_test))


plt.figure(figsize=(8,5))
plt.plot(range(1,21), error_rate, marker='o', linestyle='--')
plt.xlabel('K Value')
plt.ylabel('Error Rate')
plt.title('Choosing Best K')
plt.show()

## 📈 Q5. What is ROC-AUC?
**Explanation:** ROC-AUC measures how well the model separates the two classes (1 = perfect, 0.5 = random).


In [None]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)
print('Before:', pd.Series(y).value_counts())
print('After:', pd.Series(y_res).value_counts())

## ⚖️ Q6. How to handle imbalanced data?
**Explanation:** SMOTE balances data by generating synthetic samples for minority classes.

In [None]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)
print('Before:', pd.Series(y).value_counts())
print('After:', pd.Series(y_res).value_counts())

## 🔧 Q7. What preprocessing is needed?
**Explanation:** Preprocessing includes removing nulls, scaling features, and encoding categorical data if any.

In [None]:
# Handling missing values
print(df.isnull().sum())

## 🌈 Q8. How to visualize decision boundaries?
**Explanation:** Decision boundaries show how the classifier divides regions of different predicted classes.

In [None]:
from mlxtend.plotting import plot_decision_regions


# Use only 2 features for visualization
X_vis = X_train.iloc[:, :2].to_numpy()
y_vis = y_train # Use the original integer labels for plotting
knn_vis = KNeighborsClassifier(n_neighbors=5)
knn_vis.fit(X_vis, y_vis)


plot_decision_regions(X_vis, y_vis, clf=knn_vis, legend=2)
plt.title('KNN Decision Boundary')
plt.show()

## 🔁 Q9. What is overfitting?
**Explanation:** Overfitting occurs when a model performs well on training data but poorly on unseen data.



In [None]:
train_acc = knn.score(X_train, y_train)
test_acc = knn.score(X_test, y_test)
print('Train Accuracy:', train_acc)
print('Test Accuracy:', test_acc)

## ⚔️ Q10. How to compare models?

**Explanation:** Models can be compared using accuracy, precision, recall, and AUC scores.

In [None]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)


print('KNN Accuracy:', accuracy_score(y_test, y_pred_knn))
print('Logistic Regression Accuracy:', accuracy_score(y_test, y_pred_log))

## Q11. How to check feature importance in Logistic Regression?

**Explanation:** Coefficients indicate the strength of each feature in predicting malignancy.

In [None]:
importance = pd.Series(log_reg.coef_[0], index=X.columns)
importance.nlargest(10).plot(kind='barh')
plt.title('Top 10 Important Features (Logistic Regression)')
plt.show()

## Q12. How to display the confusion matrix?

**Explanation:** Confusion matrix shows how many predictions are correct or incorrect for each class.

In [None]:
cm = confusion_matrix(y_test, y_pred_log)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

## Q13. How to print full classification report?

**Explanation:** Displays precision, recall, F1-score, and accuracy in a single table.



In [None]:
print(classification_report(y_test, y_pred_log))

## Q14. How to detect outliers in the data?

**Explanation:** Boxplots help visualize outliers that might distort model training.

In [None]:
sns.boxplot(data=X)
plt.title('Outlier Detection')
plt.show()


## Q15. How to save and load the trained model?

**Explanation:** Saving models allows reusing them for future predictions without retraining.


In [None]:
import joblib
joblib.dump(log_reg, 'breast_cancer_model.pkl')
model = joblib.load('breast_cancer_model.pkl')
print('Model loaded successfully!')