## Libraries Importing

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

## Data Importing

In [4]:
df = pd.read_csv('Data/final_data.csv')
df.head()

Unnamed: 0,texture_mean,perimeter_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,...,fractal_dimension_se,texture_worst,perimeter_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis_encoded
0,10.38,122.8,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,...,0.006193,17.33,184.6,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,1
1,17.77,132.9,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,...,0.003532,23.41,158.8,0.1238,0.1866,0.2416,0.186,0.275,0.08902,1
2,21.25,130.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,...,0.004571,25.53,152.5,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,1
3,20.38,77.58,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,...,0.009208,26.5,98.87,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,1
4,14.34,135.1,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,...,0.005115,16.67,152.2,0.1374,0.205,0.4,0.1625,0.2364,0.07678,1


## Split Data (Target, Features)

In [6]:
X = df.drop('diagnosis_encoded',axis=1)
print('Features shape:',X.shape)
y= df['diagnosis_encoded']
print('Target shape:',y.shape)

Features shape: (569, 24)
Target shape: (569,)


## Split Data (Train, test)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2,random_state=42)
print('X_train shape:',X_train.shape)
print('X_test shape:',X_test.shape)
print('y_train shape:',y_train.shape)
print('y_test shape:',y_test.shape)

X_train shape: (455, 24)
X_test shape: (114, 24)
y_train shape: (455,)
y_test shape: (114,)


## Data scaling

Since our dataset includes measurements with different ranges, like radius, texture, perimeter, and area. Standardizing these values will ensure that each feature contributes more equally to the model's training process.

In [11]:
# Intialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training dataset and transform it

X_training_scaled = scaler.fit_transform(X_train)

# transform the testing data

X_test_scaled = scaler.transform(X_test)

## Logistic model

In [37]:
#Initialize the model

model = LogisticRegression(max_iter=500)

#Fit the data to the model

model.fit(X_training_scaled,y_train)

# make prediction using X_test

y_pred = model.predict(X_test_scaled)

In [38]:
# check accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy:.4f}")

Model accuracy: 0.9737


In [46]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98        71
           1       0.98      0.95      0.96        43

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



## Random forest model

In [41]:
#Initialize the model

model2 = RandomForestClassifier(random_state=42)

#Fit the data to the model

model2.fit(X_training_scaled,y_train)

# make prediction using X_test

y_pred2 = model2.predict(X_test_scaled)

In [42]:
accuracy = accuracy_score(y_test, y_pred2)
print(f"Model accuracy: {accuracy:.4f}")

Model accuracy: 0.9649


In [47]:
print(classification_report(y_test,y_pred2))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97        71
           1       0.98      0.93      0.95        43

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

