In [4]:
import pandas as pd

# Location of the raw data file.
# NOTE(review): this is an absolute path on one machine; the notebook will not
# run elsewhere until it is pointed at a local copy of the file.
DATA_PATH = r'C:\Users\nadhirah\StudioProjects\onco-insight\breast-cancer-wisconsin.data'

# Names for the 11 columns, in file order, based on the feature descriptions provided
COLUMN_NAMES = [
    'Sample_code_number',
    'Clump_Thickness',
    'Uniformity_of_Cell_Size',
    'Uniformity_of_Cell_Shape',
    'Marginal_Adhesion',
    'Single_Epithelial_Cell_Size',
    'Bare_Nuclei',
    'Bland_Chromatin',
    'Normal_Nucleoli',
    'Mitoses',
    'Class',
]

# The file is read without a header row, so the names are attached afterwards
data = pd.read_csv(DATA_PATH, header=None)
data.columns = COLUMN_NAMES

# Display the first few rows to confirm column renaming
print(data.head())


   Sample_code_number  Clump_Thickness  Uniformity_of_Cell_Size  \
0             1000025                5                        1   
1             1002945                5                        4   
2             1015425                3                        1   
3             1016277                6                        8   
4             1017023                4                        1   

   Uniformity_of_Cell_Shape  Marginal_Adhesion  Single_Epithelial_Cell_Size  \
0                         1                  1                            2   
1                         4                  5                            7   
2                         1                  1                            2   
3                         8                  1                            3   
4                         1                  3                            2   

  Bare_Nuclei  Bland_Chromatin  Normal_Nucleoli  Mitoses  Class  
0           1                3                1        1

In [5]:
# 'Bare_Nuclei' may contain non-numeric placeholders (assumed to be '?').
# Coercing turns anything that is not a number into NaN.
bare_nuclei_numeric = pd.to_numeric(data['Bare_Nuclei'], errors='coerce')
data['Bare_Nuclei'] = bare_nuclei_numeric

# Fill every NaN with the mean of its own column. The means are taken over the
# whole frame, so any NaN in any column would be filled the same way.
column_means = data.mean()
data = data.fillna(column_means)


In [6]:
# Store the target column 'Class' with the pandas 'category' dtype;
# all other columns keep their current dtypes.
data = data.astype({'Class': 'category'})


In [7]:
from sklearn.preprocessing import StandardScaler

# Keep only the predictor columns: the sample ID and the target are not features
features = data.drop(columns=['Sample_code_number', 'Class'])

# Learn per-column mean/std, then standardise the same rows with them.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split below, so test-set statistics influence the scaling.
scaler = StandardScaler()
scaler.fit(features)
features_scaled = scaler.transform(features)


In [8]:
from sklearn.model_selection import train_test_split

# Model inputs are the standardised features; the target is the 'Class' column
X, y = features_scaled, data['Class']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [10]:
from sklearn.ensemble import RandomForestClassifier

# Hyperparameters for the forest: 100 trees, fixed seed for repeatable fits
rf_params = {'n_estimators': 100, 'random_state': 42}

# Build the classifier and fit it on the training split
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)


In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Evaluate the Random Forest fitted in the preceding training cell.
# This cell used to build and fit a second, identical model (same
# hyperparameters and seed), which doubled the training work and left two
# copies of the training code that could drift apart.

# Predict on the test set
y_pred = model.predict(X_test)

# Report overall accuracy and the per-class precision / recall / F1 table
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9785714285714285
Classification Report:
               precision    recall  f1-score   support

           2       0.98      0.99      0.98        95
           4       0.98      0.96      0.97        45

    accuracy                           0.98       140
   macro avg       0.98      0.97      0.98       140
weighted avg       0.98      0.98      0.98       140



In [12]:
import joblib
# Persist the fitted model with joblib; as the last expression in the cell,
# the list of written file names is displayed.
joblib.dump(value=model, filename='breast_cancer_model.pkl')


['breast_cancer_model.pkl']

In [13]:
import pickle

# Serialise the model with the standard-library pickle module.
# NOTE(review): this writes to the same path as the joblib.dump cell above, so
# it replaces that file's contents.
with open('breast_cancer_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)


In [14]:
import pickle

# Read the pickled model back from disk and rebind `model` to it.
# Only unpickle files you trust: unpickling can execute arbitrary code.
with open('breast_cancer_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)


In [15]:
import pandas as pd

# A single example sample. The keys (and their order) must match the feature
# columns the model was trained on.
sample_record = {
    'Clump_Thickness': 5,
    'Uniformity_of_Cell_Size': 1,
    'Uniformity_of_Cell_Shape': 1,
    'Marginal_Adhesion': 1,
    'Single_Epithelial_Cell_Size': 2,
    'Bare_Nuclei': 1,
    'Bland_Chromatin': 3,
    'Normal_Nucleoli': 1,
    'Mitoses': 1,
}

# One-row frame: one record in, one row out, columns in the order given above
test_data = pd.DataFrame([sample_record])


In [16]:
# Example: Scaling (if you scaled the data during training)
from sklearn.preprocessing import StandardScaler

# Reuse the `scaler` that was fitted on the training features earlier in the
# notebook. Creating a new StandardScaler and calling fit_transform on this
# single row would centre every feature on its own value, producing all zeros,
# so the model would receive the same input whatever the sample contained.
# `transform` applies the already-learned mean/std without refitting.
test_data_scaled = scaler.transform(test_data)


In [17]:
# Ask the model for a class label for each row of the scaled sample
predictions = model.predict(test_data_scaled)

# Report the predicted label(s)
message = f"Predicted class: {predictions}"
print(message)


Predicted class: [2]


In [18]:
from sklearn.metrics import accuracy_score, classification_report

# Known label for the single hand-built sample (example value: replace with
# the real label). It is kept under its own name so the hold-out `y_test`
# from the train/test split is not overwritten; with `y_test` replaced by a
# one-element list, re-running the earlier evaluation cell would compare one
# label against the full set of test predictions and fail.
sample_true_labels = [2]
print("Accuracy:", accuracy_score(sample_true_labels, predictions))
print("Classification Report:\n", classification_report(sample_true_labels, predictions))


Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           2       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1



In [19]:
# Reload the model from the path this notebook actually wrote it to
# ('breast_cancer_model.pkl' in the working directory). Nothing in the
# notebook creates a 'models/' directory, so the earlier 'models/...' path
# fails on a clean Restart & Run All.
model = joblib.load('breast_cancer_model.pkl')
