In [1]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import os
import warnings
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score
from sklearn.preprocessing import StandardScaler,OneHotEncoder,LabelEncoder,FunctionTransformer
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Fetch the scikit-learn breast-cancer bunch and assemble a single frame that
# holds every feature column plus the class label under 'target'.
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

In [3]:
# Peel the label column off the feature matrix, then hold out 30% of the rows
# for testing; the fixed seed keeps the partition repeatable between runs.
y = df['target']
X = df.drop(columns=['target'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [4]:
# Summarise the training features: per-column dtype and non-null count.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 398 entries, 149 to 102
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              398 non-null    float64
 1   mean texture             398 non-null    float64
 2   mean perimeter           398 non-null    float64
 3   mean area                398 non-null    float64
 4   mean smoothness          398 non-null    float64
 5   mean compactness         398 non-null    float64
 6   mean concavity           398 non-null    float64
 7   mean concave points      398 non-null    float64
 8   mean symmetry            398 non-null    float64
 9   mean fractal dimension   398 non-null    float64
 10  radius error             398 non-null    float64
 11  texture error            398 non-null    float64
 12  perimeter error          398 non-null    float64
 13  area error               398 non-null    float64
 14  smoothness error         398 

In [5]:
# Three columns are routed to the mean imputer; every remaining training
# column, kept in its original order, is routed to the median imputer.
mean_imputation_col = ['worst smoothness', 'worst texture', 'mean texture']
median_imputation_col = X_train.columns[~X_train.columns.isin(mean_imputation_col)].tolist()

In [6]:
# One imputer per fill strategy: column mean and column median.
mean_imputer, median_imputer = (
    SimpleImputer(strategy=strategy) for strategy in ('mean', 'median')
)

In [7]:
# Send each column group through the imputer chosen for it.
preprocessor = ColumnTransformer([
    ('mean_impute', mean_imputer, mean_imputation_col),
    ('median_impute', median_imputer, median_imputation_col),
])

In [8]:
# Chain imputation and standardisation into a single fit/transform unit.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
])

In [9]:
# Fit the preprocessing on the training split only, then apply the already
# fitted pipeline to the held-out split. Tuple elements are evaluated left to
# right, so the fit happens before the test transform.
X_train_processed, X_test_processed = (
    pipeline.fit_transform(X_train),
    pipeline.transform(X_test),
)

In [12]:
# Candidate classifiers. The forest is seeded (same seed as the train/test
# split) so that a fresh "Restart & Run All" reproduces the same metrics;
# an unseeded forest yields different numbers on every run.
models = {
    "logistic_regression": LogisticRegression(),
    "random_forest": RandomForestClassifier(random_state=42)
}

# Result containers filled in by the loop below:
#   trained_models       -> (name, fitted estimator, test-set recall) tuples
#   report               -> name -> classification_report text
#   cross_val_score_list -> mean 5-fold CV accuracy, in `models` order
trained_models = []
report = {}
cross_val_score_list = []

for model_name, model in models.items():
    # Fit on the preprocessed training split.
    model.fit(X_train_processed, y_train)

    # Predict the held-out split.
    y_pred = model.predict(X_test_processed)

    # Hold-out metrics. precision_score / recall_score are called with their
    # defaults, so they describe the class labelled 1.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)

    # Cross-validate preprocessing + model together on the RAW training
    # features. Scoring the bare model on X_train_processed would let every
    # validation fold influence the imputation/scaling statistics, because
    # those were fitted on the whole training split. cross_val_score clones
    # the estimator for each fold, so the already fitted `pipeline` and
    # `model` objects are left untouched.
    cv_estimator = Pipeline(steps=[*pipeline.steps, ("model", model)])
    score = np.mean(cross_val_score(cv_estimator, X_train, y_train, cv=5, scoring='accuracy'))

    # Print evaluation metrics
    print(f"Evaluation for {model_name}:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print("\nConfusion Matrix:")
    print(conf_matrix)
    print("\nClassification Report:")
    print(class_report)

    # Store results for the cells that follow.
    report[model_name] = class_report
    cross_val_score_list.append(score)
    trained_models.append((model_name, model, recall))


Evaluation for logistic_regression:
Accuracy: 0.98
Precision: 0.99
Recall: 0.98

Confusion Matrix:
[[ 62   1]
 [  2 106]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98        63
           1       0.99      0.98      0.99       108

    accuracy                           0.98       171
   macro avg       0.98      0.98      0.98       171
weighted avg       0.98      0.98      0.98       171

Evaluation for random_forest:
Accuracy: 0.96
Precision: 0.96
Recall: 0.98

Confusion Matrix:
[[ 59   4]
 [  2 106]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.94      0.95        63
           1       0.96      0.98      0.97       108

    accuracy                           0.96       171
   macro avg       0.97      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171



In [13]:
# Show the stored classification-report text for each model (keyed by name).
report

{'logistic_regression': '              precision    recall  f1-score   support\n\n           0       0.97      0.98      0.98        63\n           1       0.99      0.98      0.99       108\n\n    accuracy                           0.98       171\n   macro avg       0.98      0.98      0.98       171\nweighted avg       0.98      0.98      0.98       171\n',
 'random_forest': '              precision    recall  f1-score   support\n\n           0       0.97      0.94      0.95        63\n           1       0.96      0.98      0.97       108\n\n    accuracy                           0.96       171\n   macro avg       0.97      0.96      0.96       171\nweighted avg       0.96      0.96      0.96       171\n'}

In [14]:
# Mean 5-fold cross-validation accuracy per model, in the order the models
# were evaluated.
cross_val_score_list

[0.9748101265822784, 0.9522468354430378]

In [15]:
# Pick the model that misses the fewest malignant cases.
#
# The recall stored in `trained_models` was computed with recall_score's
# default pos_label=1, and in the recorded run both models tie on it, so
# max() simply returned the first entry. Instead:
#   * recall is recomputed for the class named 'malignant' (the label is
#     looked up from the dataset's target_names rather than hard-coded), so
#     a false negative really is a missed malignant case;
#   * the mean cross-validation accuracy breaks any remaining tie.
# NOTE(review): ranking on the test split makes the winner's test metrics
# slightly optimistic; a separate validation split would avoid that.
malignant_label = list(data.target_names).index("malignant")
cv_score_by_name = dict(zip(models, cross_val_score_list))

best_model = max(
    trained_models,
    key=lambda entry: (
        recall_score(y_test, entry[1].predict(X_test_processed), pos_label=malignant_label),
        cv_score_by_name[entry[0]],
    ),
)
print(f"\nBest model based on recall (minimizing false negatives): {best_model[0]}")


Best model based on recall (minimizing false negatives): logistic_regression
