# Formative Assessment: Supervised Learning

## Objective:
### The objective of this assessment is to evaluate your understanding and ability to apply supervised learning techniques to a real-world dataset.


## 1. Loading and Preprocessing 

In [92]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer

# Load the dataset
data = load_breast_cancer(as_frame=True)
df = data.frame

In [93]:
# Display the first few rows of the dataframe
print(df.head())

   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst texture  worst perimeter  worst area  \
0             

In [94]:
# Display the last few rows of the dataframe
print(df.tail())

     mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
564        21.56         22.39          142.00     1479.0          0.11100   
565        20.13         28.25          131.20     1261.0          0.09780   
566        16.60         28.08          108.30      858.1          0.08455   
567        20.60         29.33          140.10     1265.0          0.11780   
568         7.76         24.54           47.92      181.0          0.05263   

     mean compactness  mean concavity  mean concave points  mean symmetry  \
564           0.11590         0.24390              0.13890         0.1726   
565           0.10340         0.14400              0.09791         0.1752   
566           0.10230         0.09251              0.05302         0.1590   
567           0.27700         0.35140              0.15200         0.2397   
568           0.04362         0.00000              0.00000         0.1587   

     mean fractal dimension  ...  worst texture  worst perimeter  wo

In [95]:
# Display summary statistics 
print("\nSummary statistics:")
print(df.describe())


Summary statistics:
       mean radius  mean texture  mean perimeter    mean area  \
count   569.000000    569.000000      569.000000   569.000000   
mean     14.127292     19.289649       91.969033   654.889104   
std       3.524049      4.301036       24.298981   351.914129   
min       6.981000      9.710000       43.790000   143.500000   
25%      11.700000     16.170000       75.170000   420.300000   
50%      13.370000     18.840000       86.240000   551.100000   
75%      15.780000     21.800000      104.100000   782.700000   
max      28.110000     39.280000      188.500000  2501.000000   

       mean smoothness  mean compactness  mean concavity  mean concave points  \
count       569.000000        569.000000      569.000000           569.000000   
mean          0.096360          0.104341        0.088799             0.048919   
std           0.014064          0.052813        0.079720             0.038803   
min           0.052630          0.019380        0.000000             

In [96]:
# Display information 
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [97]:
#the number of missing values in each column
print("\nNumber of missing values in each column:")
print(df.isnull().sum())


Number of missing values in each column:
mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64


In [98]:
# Assign target column 
df['target'] = df.target

target_value_counts = df.groupby('target').size() / len(df)

print("\nTarget Value Count (Normalized):")
print(target_value_counts)


Target Value Count (Normalized):
target
0    0.372583
1    0.627417
dtype: float64


### EXPLANATION:
###### The breast cancer dataset consists of 569 samples, each with 30 features, categorized into two classes: malignant and benign. 
###### There are no missing values in this dataset.

#### Data Standardization: 
###### I used StandardScaler to normalize the features, ensuring they have a mean of zero and a standard deviation of one. 
###### This standardization helps many machine learning algorithms perform better. For readability, the scaled values were rounded 
###### to two decimal places.

### feature scaling.

In [99]:
from sklearn.preprocessing import StandardScaler
# Create a scaler object
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)   # scaling the dataset to increase the performance
# Print the scaled data
print("DF Scaled\n")
print(df_scaled.round(2))

DF Scaled

[[ 1.1  -2.07  1.27 ...  2.75  1.94 -1.3 ]
 [ 1.83 -0.35  1.69 ... -0.24  0.28 -1.3 ]
 [ 1.58  0.46  1.57 ...  1.15  0.2  -1.3 ]
 ...
 [ 0.7   2.05  0.67 ... -1.1  -0.32 -1.3 ]
 [ 1.84  2.34  1.98 ...  1.92  2.22 -1.3 ]
 [-1.81  1.22 -1.81 ... -0.05 -0.75  0.77]]


### SPLIT-TEST

In [100]:
# split the dataset 
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(df_scaled,df.target,test_size=0.2,random_state=0)

## 2.Classification Algorithm Implementation 

In [101]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
lr_model = LogisticRegression(random_state=42)
dt_model = DecisionTreeClassifier(random_state=42)
rf_model = RandomForestClassifier(random_state=42)
svm_model = SVC(random_state=42)
knn_model = KNeighborsClassifier()

In [102]:
# Fit models to the scaled data using LogisticRegression
lr_model.fit(x_train, y_train)

In [103]:
# Fit models to the scaled data using DecisionTreeClassifier
dt_model.fit(x_train, y_train)

In [104]:
# Fit models to the scaled data using RandomForestClassifier
rf_model.fit(x_train, y_train)

In [105]:
# Fit models to the scaled data using SVC
svm_model.fit(x_train, y_train)

In [106]:
# Fit models to the scaled data using KNeighborsClassifier
dt_model.fit(x_train, y_train)

In [107]:
# Fit the models
lr_model.fit(x_train, y_train)
dt_model.fit(x_train, y_train)
rf_model.fit(x_train, y_train)
svm_model.fit(x_train, y_train)
knn_model.fit(x_train, y_train)

# Predictions on test set
y_lr = lr_model.predict(x_test)
y_dt = dt_model.predict(x_test)
y_rf = rf_model.predict(x_test)
y_svm = svm_model.predict(x_test)
y_knn = knn_model.predict(x_test)

# Print predictions
print("Logistic Regression predictions:", y_lr)
print("Decision Tree predictions:", y_dt)
print("Random Forest predictions:", y_rf)
print("SVM predictions:", y_svm)
print("KNN predictions:", y_knn)


Logistic Regression predictions: [0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 0 0 0 1 1 0 1 1 0 1 0 1 0 1 0 1 0 1
 0 1 0 0 1 0 1 1 0 1 1 1 0 0 0 0 1 1 1 1 1 1 0 0 0 1 1 0 1 0 0 0 1 1 0 1 0
 0 1 1 1 1 1 0 0 0 1 0 1 1 1 0 0 1 0 1 0 1 1 0 1 1 1 1 1 1 1 0 1 0 1 0 0 1
 0 0 1]
Decision Tree predictions: [0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 0 0 0 1 1 0 1 1 0 1 0 1 0 1 0 1 0 1
 0 1 0 0 1 0 1 1 0 1 1 1 0 0 0 0 1 1 1 1 1 1 0 0 0 1 1 0 1 0 0 0 1 1 0 1 0
 0 1 1 1 1 1 0 0 0 1 0 1 1 1 0 0 1 0 1 0 1 1 0 1 1 1 1 1 1 1 0 1 0 1 0 0 1
 0 0 1]
Random Forest predictions: [0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 0 0 0 1 1 0 1 1 0 1 0 1 0 1 0 1 0 1
 0 1 0 0 1 0 1 1 0 1 1 1 0 0 0 0 1 1 1 1 1 1 0 0 0 1 1 0 1 0 0 0 1 1 0 1 0
 0 1 1 1 1 1 0 0 0 1 0 1 1 1 0 0 1 0 1 0 1 1 0 1 1 1 1 1 1 1 0 1 0 1 0 0 1
 0 0 1]
SVM predictions: [0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 0 0 0 1 1 0 1 1 0 1 0 1 0 1 0 1 0 1
 0 1 0 0 1 0 1 1 0 1 1 1 0 0 0 0 1 1 1 1 1 1 0 0 0 1 1 0 1 0 0 0 1 1 0 1 0
 0 1 1 1 1 1 0 0 0 1 0 1 1 1 0 0 1 0 1 0 1 1 0 

### Brief description and justification for each algorithm:

###### 1. Logistic Regression: A simple linear model that predicts the probability of a binary outcome. Ideal for this dataset since it handles binary 
###### classification problems well.
###### 2. Decision Tree Classifier:A model that splits data into subsets based on feature values, forming a tree structure. Good for this dataset because 
###### it handles non-linear relationships and is easy to interpret.
###### 3. Random Forest Classifier:An ensemble of multiple decision trees that improves accuracy and reduces overfitting. Suitable for better generalization and robustness.
###### 4. Support Vector Machine (SVM):A model that finds the best boundary to separate classes. Effective for high-dimensional data and is robust to noise.
###### 5. k-Nearest Neighbors (k-NN): A simple model that predicts the target class based on the closest neighbors. Useful for capturing non-linear relationships and is easy to understand.


## 3. Model Comparison

In [108]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize lists to store the results
results = []

# List of models and their names
models = [lr_model, dt_model, rf_model, svm_model, knn_model]
model_names = ["Logistic Regression", "Decision Tree", "Random Forest", "SVM", "k-NN"]

# Evaluate performance
for model, name in zip(models, model_names):
    y_pred = model.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results.append((name, acc, prec, rec, f1))
    print(f"{name}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1-score={f1:.4f}")

# Print all results in tabular form
results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1 Score"])
print("\nOverall Performance:")
print(results_df)


Logistic Regression: Accuracy=1.0000, Precision=1.0000, Recall=1.0000, F1-score=1.0000
Decision Tree: Accuracy=1.0000, Precision=1.0000, Recall=1.0000, F1-score=1.0000
Random Forest: Accuracy=1.0000, Precision=1.0000, Recall=1.0000, F1-score=1.0000
SVM: Accuracy=1.0000, Precision=1.0000, Recall=1.0000, F1-score=1.0000
k-NN: Accuracy=0.9912, Precision=0.9853, Recall=1.0000, F1-score=0.9926

Overall Performance:
                 Model  Accuracy  Precision  Recall  F1 Score
0  Logistic Regression  1.000000   1.000000     1.0  1.000000
1        Decision Tree  1.000000   1.000000     1.0  1.000000
2        Random Forest  1.000000   1.000000     1.0  1.000000
3                  SVM  1.000000   1.000000     1.0  1.000000
4                 k-NN  0.991228   0.985294     1.0  0.992593


In [109]:
from sklearn.metrics import classification_report

# Evaluate performance for each model
for model, name in zip(models, model_names):
    y_pred = model.predict(x_test)
    report = classification_report(y_test, y_pred, target_names=["Benign", "Malignant"])
    print(f"{name}:\n{report}\n")

Logistic Regression:
              precision    recall  f1-score   support

      Benign       1.00      1.00      1.00        47
   Malignant       1.00      1.00      1.00        67

    accuracy                           1.00       114
   macro avg       1.00      1.00      1.00       114
weighted avg       1.00      1.00      1.00       114


Decision Tree:
              precision    recall  f1-score   support

      Benign       1.00      1.00      1.00        47
   Malignant       1.00      1.00      1.00        67

    accuracy                           1.00       114
   macro avg       1.00      1.00      1.00       114
weighted avg       1.00      1.00      1.00       114


Random Forest:
              precision    recall  f1-score   support

      Benign       1.00      1.00      1.00        47
   Malignant       1.00      1.00      1.00        67

    accuracy                           1.00       114
   macro avg       1.00      1.00      1.00       114
weighted avg       1.

In [110]:
results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1-score"])

# Find the best and worst models based on F1-score
best_model = results_df.loc[results_df["F1-score"].idxmax()]
worst_model = results_df.loc[results_df["F1-score"].idxmin()]

print(f"\nBest Model: {best_model['Model']} with F1-score={best_model['F1-score']:.4f}")
print(f"Worst Model: {worst_model['Model']} with F1-score={worst_model['F1-score']:.4f}")


Best Model: Logistic Regression with F1-score=1.0000
Worst Model: k-NN with F1-score=0.9926


### Conclusion:
###### All the models did a great job, but Logistic Regression, Decision Tree, Random Forest, and SVM are the best because they scored perfectly on all metrics. 
###### k-NN, while slightly less effective, still performed very well with a high F1-score. 
###### So, all these models are good choices for this dataset, but Logistic Regression, Decision Tree, Random Forest, and SVM are the top picks.
