In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score

# Fraction of rows held out for evaluation; passed to train_test_split as test_size.
test_size = 0.3
# Value passed to the Perceptron constructor as its eta0 argument.
eta0 = 0.1

In [2]:
# Column headers supplied to read_csv, in file order; "class" is the label column.
column_names = [
    "sample_code_number",
    "clump_thickness",
    "uniformity_of_cell_size", "uniformity_of_cell_shape",
    "marginal_adhesion",
    "single_epithelial_cell_size",
    "bare_nuclei",
    "bland_chromatin",
    "normal_nucleoli",
    "mitoses",
    "class",
]

# "?" entries are parsed as NaN, and every row containing a NaN is dropped.
raw_df = pd.read_csv(
    "breast-cancer-wisconsin.data",
    names=column_names,
    na_values=["?"],
).dropna(how='any')

# Coerce each column to a numeric dtype; a value that cannot be parsed raises.
cleaned_df = raw_df.apply(pd.to_numeric, errors='raise')

In [59]:
# Show the observed (min, max) of the clump_thickness column after cleaning.
cleaned_df["clump_thickness"].min(), cleaned_df["clump_thickness"].max()

(1, 10)

In [4]:
# feature engineering
#
# Binary target: 0 where the class code is 4, 1 for every other code
# (the same mapping the original lambda applied).
#
# y and X are derived WITHOUT mutating cleaned_df. The earlier version
# overwrote cleaned_df["class"] and then popped it, so re-running the cell
# raised KeyError and left cleaned_df in a different state than the cells
# above had displayed.
y = (cleaned_df["class"] != 4).astype("int64")
X = cleaned_df.drop(columns="class")

# NOTE(review): every remaining column is used as a feature, including
# sample_code_number; the prediction cells further down pass the same ten
# columns, so the feature set is deliberately left unchanged here.
# NOTE(review): the scaler is fitted on all rows, before the train/test
# splits performed later, so held-out rows contribute to the scaling
# statistics used during evaluation.
sc = StandardScaler()
sc.fit(X)

X_std = sc.transform(X)

In [5]:
# Repeated hold-out evaluation: one train/test split per seed, refitting the
# same Perceptron instance on each split and recording accuracy and precision
# as percentages rounded to two decimals.
random_seed_array = range(1, 21)

model = Perceptron(eta0=eta0)
accuracy_array, precision_array = [], []

for random_state in random_seed_array:
    X_train, X_test, y_train, y_test = train_test_split(
        X_std, y, test_size=test_size, random_state=random_state
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy_pct = accuracy_score(y_test, y_pred) * 100
    precision_pct = precision_score(y_test, y_pred) * 100
    accuracy_array.append(round(accuracy_pct, 2))
    precision_array.append(round(precision_pct, 2))

# Report the per-seed scores followed by their standard deviations.
report_lines = (
    ('Acurácias obtidas: {0}', accuracy_array),
    ('Precisões obtidas: {0}', precision_array),
    ('Desvio padrão da acurácia: {0}', np.std(accuracy_array)),
    ('Desvio padrão da precisão: {0}', np.std(precision_array)),
)
for template, value in report_lines:
    print(template.format(value))

Acurácias obtidas: [95.12, 92.68, 96.59, 94.63, 95.61, 97.07, 97.07, 98.05, 97.07, 95.61, 97.07, 94.63, 94.63, 95.12, 94.15, 95.12, 95.61, 94.63, 97.56, 93.17]
Precisões obtidas: [96.95, 97.41, 97.64, 97.64, 96.27, 99.21, 98.45, 99.17, 96.18, 96.21, 98.4, 94.62, 97.78, 94.07, 93.18, 96.4, 96.99, 95.31, 98.5, 94.29]
Desvio padrão da acurácia: 1.4216872898074295
Desvio padrão da precisão: 1.6928475270974634


In [6]:
# predict example
# Feature names for the single hand-written sample, in the order of its values.
columns = [
    'sample_code_number',
    'clump_thickness',
    'uniformity_of_cell_size',
    'uniformity_of_cell_shape',
    'marginal_adhesion',
    'single_epithelial_cell_size',
    'bare_nuclei',
    'bland_chromatin',
    'normal_nucleoli',
    'mitoses',
]
data = [[999999, 5, 10, 10, 3, 7, 3, 8, 10, 2]]

# Scale the sample with the fitted scaler, then classify it with the model.
example_df = pd.DataFrame(data, columns=columns)
data_std = sc.transform(example_df)
pred = model.predict(data_std)
print(f"Predicted: {pred}")

Predicted: [0]


In [36]:
# Display the example sample as a one-row DataFrame with named columns.
pd.DataFrame(data, columns=columns)

Unnamed: 0,sample_code_number,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses
0,999999,5,10,10,3,7,3,8,10,2


# Save model

In [32]:
import joblib
import os

In [33]:
# Parent directory of the current working directory; the models folder is
# located two levels above it, under src/app/resources/models/v1/.
current_file_dir = os.path.dirname(os.getcwd())
save_path = os.path.join(
    current_file_dir,
    os.pardir,
    os.pardir,
    "src/app/resources/models/v1/",
)

In [34]:
# Persist the trained model and the fitted scaler side by side under save_path.
model_file = os.path.join(save_path, "model.joblib")
scaler_file = os.path.join(save_path, "sc.joblib")

joblib.dump(model, model_file)
joblib.dump(sc, scaler_file)

['/home/maksonvinicio/Documents/GitLab-GitHub/CodingChallenge-ML-API/src/scripts/../../src/app/resources/models/v1/sc.joblib']

In [4]:
from typing import List, Optional
from pydantic import BaseModel, Field
import numpy as np
import json



class Input(BaseModel):
    """Input data class for the ML model.

    Declares the ten feature values supplied for one sample. The field names
    are the same as the feature column names used earlier in this notebook
    (every column except "class").

    NOTE(review): a later cell builds a DataFrame from this model's dict and
    passes it to the fitted scaler, so the field order presumably has to stay
    aligned with the training column order -- confirm before reordering.
    """
    
    # Sample identifier column; it is passed to the scaler/model like any other feature.
    sample_code_number: int
    clump_thickness: int
    uniformity_of_cell_size: int
    uniformity_of_cell_shape: int
    marginal_adhesion: int
    single_epithelial_cell_size: int
    # The only field annotated to accept either a float or an int.
    bare_nuclei: float | int
    bland_chromatin: int
    normal_nucleoli: int
    mitoses: int

In [51]:
# Raw payload for one sample, keyed by feature name (same values as the
# hand-written prediction example above).
input_data = dict(
    sample_code_number=999999,
    clump_thickness=5,
    uniformity_of_cell_size=10,
    uniformity_of_cell_shape=10,
    marginal_adhesion=3,
    single_epithelial_cell_size=7,
    bare_nuclei=3,
    bland_chromatin=8,
    normal_nucleoli=10,
    mitoses=2,
)

In [6]:
# Validate the raw dict against the Input schema. The name `input_data` is
# rebound from a dict to an Input instance here (the next cell calls
# input_data.dict()), so guard the conversion: re-running this cell used to
# fail because an Input instance cannot be unpacked with ** like a dict.
if not isinstance(input_data, Input):
    input_data = Input(**input_data)

In [53]:
# Turn the validated input into a one-row DataFrame whose columns are the field names.
input_record = input_data.dict()
data = pd.DataFrame([input_record])

In [54]:
# Scale the one-row frame with the fitted scaler, then classify it with the model.
data_std = sc.transform(data)
pred = model.predict(data_std)

In [56]:
# Display the predicted label for the single input row.
pred[0]

0