In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Tunable settings for this cell, gathered in one place.
DATA_PATH = "data.csv"
RANDOM_STATE = 42       # seed used for both the 'Medium' sampling and the split
MEDIUM_FRACTION = 0.3   # share of 'Low' rows that get relabelled as 'Medium'
TEST_SIZE = 0.2         # share of rows held out for evaluation

# Load data
df = pd.read_csv(DATA_PATH)
print(df.head())


# Drop unneeded columns.
# errors='ignore' keeps the cell runnable when the CSV does not contain one of
# these columns; the non-inplace form makes the data lineage explicit.
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

# Encode target labels: M = malignant, B = benign
diagnosis_codes = df['diagnosis'].map({'M': 1, 'B': 0})
unmapped = diagnosis_codes.isna()
if unmapped.any():
    # Without this check an unexpected value becomes NaN here and would then
    # be silently labelled 'Low' below.
    raise ValueError(
        "Unexpected values in 'diagnosis' column: "
        f"{df.loc[unmapped, 'diagnosis'].unique().tolist()}"
    )
df['diagnosis'] = diagnosis_codes

# Simulate issue priority:
# - High: malignant (1)
# - Low: benign (0)
# - Medium: random subset of benign to simulate complexity
# Vectorised lookup; 'diagnosis' is guaranteed to hold only 0/1 at this point.
df['priority'] = df['diagnosis'].map({1: 'High', 0: 'Low'})

# Upgrade a portion of 'Low' to 'Medium'.
# NOTE: these rows are chosen by random sampling, not from any feature value,
# so the features can only separate 'Medium' from 'Low' by chance.
medium_idx = (
    df[df['priority'] == 'Low']
    .sample(frac=MEDIUM_FRACTION, random_state=RANDOM_STATE)
    .index
)
df.loc[medium_idx, 'priority'] = 'Medium'

# Encode 'priority' labels
le = LabelEncoder()
df['priority_encoded'] = le.fit_transform(df['priority'])

# Prepare features and labels (everything except the target-derived columns)
X = df.drop(columns=['diagnosis', 'priority', 'priority_encoded'])
y = df['priority_encoded']

# Train/test split
# NOTE(review): the split is not stratified, so class proportions may differ
# between train and test; consider stratify=y (this would change the metrics).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Check shape of the data
print("Feature set shape (X):", X.shape)
print("Label distribution (y):")
print(df['priority'].value_counts())
print("\nEncoded labels:", df['priority_encoded'].unique())


         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  texture_worst  perimeter_worst  area_worst  smoothness

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Model: random forest with a fixed seed so repeated runs give the same result
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on the held-out split
y_pred = model.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
# 'weighted' averages per-class F1 by support, so a weak small class has
# limited effect on this number; the per-class report below shows the detail.
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.2f}")
print(f"F1-score: {f1:.2f}")
print("\nClassification Report:")
# Pass the label set explicitly. Otherwise scikit-learn infers it from
# y_test/y_pred, and target_names no longer lines up if a class is missing
# from both. LabelEncoder assigns codes 0..n_classes-1 in the order of
# le.classes_, so the two sequences match position by position.
class_labels = list(range(len(le.classes_)))
print(classification_report(
    y_test,
    y_pred,
    labels=class_labels,
    target_names=le.classes_,
))


Accuracy: 0.76
F1-score: 0.72

Classification Report:
              precision    recall  f1-score   support

        High       0.95      0.95      0.95        43
         Low       0.69      0.90      0.78        50
      Medium       0.17      0.05      0.07        21

    accuracy                           0.76       114
   macro avg       0.60      0.63      0.60       114
weighted avg       0.69      0.76      0.72       114

