# Import Machine learning libraries 

In [77]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Dataset Loading

In [78]:
# Load the dataset
data = pd.read_csv("cancer.csv")

In [79]:
# Check the first few rows of the dataset
print(data.head())

   Class  age  menopause  tumor-size  inv-nodes  node-caps  deg-malig  breast  \
0      0    5          1           1          1          2          1       3   
1      0    5          4           4          5          7         10       3   
2      0    3          1           1          1          2          2       3   
3      0    6          8           8          1          3          4       3   
4      0    4          1           1          3          2          1       3   

   breast-quad  irradiat  
0            1         1  
1            2         1  
2            1         1  
3            7         1  
4            1         1  


In [80]:
# Select features and target variable
features = data[['age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'deg-malig', 'breast', 'breast-quad', 'irradiat']]
target = data['Class']

# EDA and pre processing

In [81]:
# Handle missing values
# drop rows with missing values
data = data.dropna()


In [82]:
data.isnull().sum()

Class          0
age            0
menopause      0
tumor-size     0
inv-nodes      0
node-caps      0
deg-malig      0
breast         0
breast-quad    0
irradiat       0
dtype: int64

# Apply appropriate ML model - (logistic regression)

In [83]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [84]:
# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [85]:
# Initialize the model (Logistic Regression)
model = LogisticRegression(random_state=42)

In [86]:
# Train the model
model.fit(X_train, y_train)


In [87]:
# Make predictions on the test set
y_pred = model.predict(X_test)


In [88]:
y_pred

array([1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 0], dtype=int64)

# Computing Evaluation matrix

In [89]:
# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)


# Accuracy of Model

In [90]:
# Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC-AUC Score:", roc_auc)


Accuracy: 0.9562043795620438
Precision: 0.9814814814814815
Recall: 0.9137931034482759
F1 Score: 0.9464285714285714
ROC-AUC Score: 0.9505674378000872


# Other Classifiers (Random forest, Desicion tree)

In [91]:
# Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
dt_y_pred = dt_model.predict(X_test)

In [92]:
# Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)

In [93]:
# Print evaluation metrics for other models if desired
print("\nDecision Tree Metrics")
print("Accuracy:", accuracy_score(y_test, dt_y_pred))
print("Precision:", precision_score(y_test, dt_y_pred))
print("Recall:", recall_score(y_test, dt_y_pred))
print("F1 Score:", f1_score(y_test, dt_y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, dt_y_pred))


Decision Tree Metrics
Accuracy: 0.9343065693430657
Precision: 0.9622641509433962
Recall: 0.8793103448275862
F1 Score: 0.9189189189189189
ROC-AUC Score: 0.9269969445656918


In [94]:
print("\nRandom Forest Metrics")
print("Accuracy:", accuracy_score(y_test, rf_y_pred))
print("Precision:", precision_score(y_test, rf_y_pred))
print("Recall:", recall_score(y_test, rf_y_pred))
print("F1 Score:", f1_score(y_test, rf_y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, rf_y_pred))


Random Forest Metrics
Accuracy: 0.9562043795620438
Precision: 0.9814814814814815
Recall: 0.9137931034482759
F1 Score: 0.9464285714285714
ROC-AUC Score: 0.9505674378000872


# Theory 

In [None]:
So in question there is asked to perform binary classification TO PREDICT WHETHER A TUMOR IS MALIGNANT OR BENIGN BASED ON 
PATIENT AND TUMER characteristics for binary classification there are different classification 
techniques are used , so I have used  Logistic Regression

    Steps involved in preparing model
    1)imported important python libraries 
    2)Load Dataset
    3)EDA and pre-processing(removing null value , standardising data,remove duplicates )
    3)Apply appropriate Ml model (Logistic regression)
       3.1)training data
       3.2)testing data
    4)Discuss computing evaluation matrix (confusion matrix)
    5)Accuracy of model
    6)Check accuracies of other classifiction techniques (random forest and decision tree)
    
Evaluation matrix score of Logistic regression is :-
Accuracy: 0.9562043795620438
Precision: 0.9814814814814815
Recall: 0.9137931034482759
F1 Score: 0.9464285714285714
ROC-AUC Score: 0.9505674378000872
    
    
Decision Tree Metrics
Accuracy: 0.9343065693430657
Precision: 0.9622641509433962
Recall: 0.8793103448275862
F1 Score: 0.9189189189189189
ROC-AUC Score: 0.9269969445656918
    
    


Random Forest Metrics
Accuracy: 0.9562043795620438
Precision: 0.9814814814814815
Recall: 0.9137931034482759
F1 Score: 0.9464285714285714
ROC-AUC Score: 0.9505674378000872
    