<a href="https://colab.research.google.com/github/LeibGit/-DI_Bootcamp/blob/main/day3_dc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Your Task
Exploratory Data Analysis
Use pandas to load the dataset and examine the first few rows.
Check and handle the missing values.
Drop any unnecessary columns.
Create a countplot of the diagnosis column using the "magma" palette.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the raw dataset; the path is resolved relative to the working directory.
df = pd.read_csv("data.csv")

# info() prints column dtypes and non-null counts directly to stdout.
df.info()
# Only the last expression of a cell is rendered automatically, so the summary
# statistics must be displayed explicitly or their output is silently dropped.
# `display` is provided by the IPython/Jupyter kernel without an import.
display(df.describe())
# First rows of the frame, rendered as the cell's output.
df.head()

In [None]:
# Remove columns that hold no data at all before dropping incomplete rows:
# a single all-NaN column makes a plain row-wise dropna() discard every record
# and leaves an empty DataFrame.
df = df.dropna(axis=1, how="all").dropna()

Data Preprocessing, Building Models and Evaluation :
Count the occurrences of each unique value in the ‘diagnosis’ column.
Map categorical values to numerical values.
Splitting the data into train and test
Implement logistic regression and print the accuracy.
Implement K Nearest Neighbours and print the accuracy.
Implement Random Forests and print the accuracy.
Implement Support Vector Machines (SVM) and print the accuracy.
Which is the best model?

In [None]:
# Show the DataFrame's column labels (rendered as the cell's output).
df.columns

In [None]:
from numpy.random.mtrand import rand
from sklearn.model_selection import train_test_split
import pandas as pd # Ensure pandas is imported if not already globally available

# Reload the raw CSV so this cell does not depend on whatever state earlier
# cells left `df` in.
df_reloaded = pd.read_csv("data.csv")

# 'Unnamed: 32' contains only NaN values and carries no information.
# errors='ignore' keeps the cell working when the column is not present.
df_processed = df_reloaded.drop(columns=['Unnamed: 32'], errors='ignore')

# Features (X): every column except the identifier and the label.
# Target (y): the diagnosis label.
X = df_processed.drop(columns=['id', 'diagnosis'])
y = df_processed['diagnosis']

print(X.shape)
print(y.shape)

# 80/20 train-test split. stratify=y keeps the class proportions of the
# target the same in both partitions, so the test metrics are not skewed by
# an unlucky draw; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    stratify=y,
    random_state=42
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

# K-nearest-neighbours classifier (k=5) preceded by feature standardisation,
# bundled in one pipeline so the scaler is fitted on the training data only.
knn_steps = [
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
]
knn_pipe = Pipeline(knn_steps)

# Fit on the training split, then predict the held-out split.
y_pred = knn_pipe.fit(X_train, y_train).predict(X_test)

# Headline metrics; "M" is the class treated as positive.
print("Accuracy:", accuracy_score(y_test, y_pred))
for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred, pos_label="M"))

# Per-class breakdown followed by the raw confusion matrix.
knn_report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(knn_report)

knn_confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(knn_confusion)

In [None]:
from sklearn.svm import SVC


# RBF-kernel support vector classifier preceded by feature standardisation,
# bundled in one pipeline so the scaler is fitted on the training data only.
pipe_svm = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='rbf', C=1.0, gamma='scale', probability=True))
])

pipe_svm.fit(X_train, y_train)
svm_pred = pipe_svm.predict(X_test)

# Report the hold-out accuracy; without this the predictions above are
# computed but never evaluated.
print("SVM Accuracy:", accuracy_score(y_test, svm_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

def eval_and_report(model_name, pipeline, X_test, y_test, pos_label="M"):
    """Print and return hold-out classification metrics for a fitted model.

    Parameters
    ----------
    model_name : str
        Label shown in the report header.
    pipeline : fitted estimator
        Any object exposing ``predict(X)``.
    X_test
        Hold-out features passed to ``pipeline.predict``.
    y_test
        True labels for ``X_test``.
    pos_label : optional
        Class treated as positive for precision, recall and F1.
        Defaults to ``"M"``.

    Returns
    -------
    dict
        The computed scores under the keys ``'accuracy'``, ``'precision'``,
        ``'recall'`` and ``'f1_score'``.
    """
    y_pred = pipeline.predict(X_test)

    # Compute each score exactly once and reuse it for printing and returning.
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, pos_label=pos_label),
        'recall': recall_score(y_test, y_pred, pos_label=pos_label),
        'f1_score': f1_score(y_test, y_pred, pos_label=pos_label)
    }

    print(f"--- {model_name} ---")
    print("Accuracy:", metrics['accuracy'])
    print("Precision:", metrics['precision'])
    print("Recall:", metrics['recall'])
    print("F1 Score:", metrics['f1_score'])
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    return metrics

# Local import: required to build an F1 scorer that works with string labels.
from sklearn.metrics import make_scorer

# Baseline random forest with fixed hyper-parameters. Tree models do not need
# feature scaling, so the pipeline has a single step; the step name 'rf' is
# what the 'rf__*' grid keys below refer to.
pipe_rf = Pipeline([
    ('rf', RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        random_state=42
    ))
])

pipe_rf.fit(X_train, y_train)
rf_metrics = eval_and_report("Random Forest baseline", pipe_rf, X_test, y_test)


# Hyper-parameter grid searched with 5-fold cross-validation.
rf_param_grid = {
    'rf__n_estimators': [200, 300, 500],
    'rf__max_depth': [None, 5, 10, 20],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4]
}

# The built-in 'f1' scorer assumes the positive class is labelled 1. The
# target here is scored with pos_label="M" everywhere else in this notebook,
# so the default scorer cannot evaluate any fold; build a scorer that names
# the positive class explicitly instead.
f1_scorer_m = make_scorer(f1_score, pos_label="M")

grid_rf = GridSearchCV(
    pipe_rf,
    rf_param_grid,
    cv=5,
    scoring=f1_scorer_m,
    n_jobs=-1
)

grid_rf.fit(X_train, y_train)

# best_estimator_ is the pipeline refitted on the full training split with
# the best-scoring parameter combination.
best_rf = grid_rf.best_estimator_
rf_gs_metrics = eval_and_report("Random Forest grid", best_rf, X_test, y_test)

print("Best RF params:", grid_rf.best_params_)