<a href="https://colab.research.google.com/github/JoyNjihia/WEEK-4-ASSIGNMENT-AI-FOR-SOFTWARE-ENGINEERING/blob/main/task4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer # Import SimpleImputer

# Load the dataset
# Note: Ensure the dataset is downloaded and placed in the correct directory
# The dataset can be loaded from Kaggle: https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-diagnostic-dataset
df = pd.read_csv('breast-cancer-wisconsin-data.csv')

# Data preprocessing
# Drop the 'id' column which is not relevant
df = df.drop('id', axis=1)

# Handle missing values
# Calculate the mean only for numeric columns
numeric_cols = df.select_dtypes(include=np.number).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Encode categorical variables (if any)
# In this case, 'diagnosis' is the target column, and it's categorical
# Convert the target variable to binary (0 for 'B', 1 for 'M') if needed

# Split features and target
X = df.drop('diagnosis', axis=1)
y = df['diagnosis']

# Convert 'diagnosis' to binary (0 for 'B', 1 for 'M') if needed
y = y.apply(lambda x: 1 if x == 'M' else 0)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling (optional but recommended for many machine learning models)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test) # Apply the *same* scaling to test set

# Handle class imbalance (optional)

# Apply imputation *after* scaling and *before* SMOTE
# Fit the imputer *only* on the scaled training data
imputer = SimpleImputer(strategy='mean') # Or 'median', 'most_frequent', etc.
X_train_imputed = imputer.fit_transform(X_train_scaled)

# Transform the scaled test data using the *fitted* imputer
X_test_imputed = imputer.transform(X_test_scaled)

# Now apply SMOTE to the imputed training data
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_imputed, y_train)

# Model training with Random Forest after SMOTE
model_res = RandomForestClassifier(random_state=42)
# Train the model on the resampled and imputed training data
model_res.fit(X_train_res, y_train_res)

# Predict on the *imputed* test set
y_pred_res = model_res.predict(X_test_imputed)

# Evaluate the model after SMOTE
print("\nAccuracy after SMOTE:", accuracy_score(y_test, y_pred_res))
print("\nF1 Score after SMOTE:", f1_score(y_test, y_pred_res, average='weighted'))

# The remaining code for model evaluation, tuning, etc. should now work correctly
# as it uses the properly scaled and imputed data where needed.

# Model training with Random Forest (original, before SMOTE) - Keep this if you still want to evaluate the model without SMOTE
model = RandomForestClassifier(random_state=42)
# Train the model on the scaled but *not* imputed or resampled training data (assuming no NaNs were introduced by scaling)
# Note: If scaling introduced NaNs, you would need to impute X_train_scaled before fitting this model too.
# Since your original non-SMOTE part worked, it implies scaling didn't create NaNs in the numeric columns.
model.fit(X_train_scaled, y_train)

# Predict on the scaled test set (assuming no NaNs introduced by scaling)
y_pred = model.predict(X_test_scaled)

# Evaluate the model (original)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Hyperparameter tuning (optional) - Use the scaled and imputed training data for grid search
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Fit GridSearchCV on the resampled and imputed training data
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train_res, y_train_res)


print("\nBest Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

# Predict with the best model on the imputed test set
y_pred_best = best_model.predict(X_test_imputed)
print("\nBest Model Accuracy:", accuracy_score(y_test, y_pred_best))
print("\nBest Model F1 Score:", f1_score(y_test, y_pred_best, average='weighted'))

# Feature importance analysis - Use the model trained on scaled data (before SMOTE/imputation if you want feature names)
# Note: Feature importances on SMOTE'd data might be less interpretable with original feature names.
# If using X_train_imputed for training, the columns might be different if imputation dropped one.
# Let's use the model trained on X_train_scaled (31 features) if it fitted without error.
# If you want importances from the SMOTE'd model, you'd need to map back or check column names of X_train_imputed.
# Given the original model fit X_train_scaled (31 features), let's use that.
feature_importances = pd.DataFrame({
    # X.columns are the original column names (31 features)
    'Feature': X.columns,
    # Use importances from the model trained on 31 features
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("\nFeature Importances:")
print(feature_importances)


# Save the model (optional)
import joblib
# Save the best model found from the grid search on the resampled data
joblib.dump(best_model, 'breast_cancer_model_resampled.pkl')
# You might also want to save the imputer and scaler if you need to preprocess new data for prediction
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(imputer, 'imputer.pkl')

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count



Accuracy after SMOTE: 0.9649122807017544

F1 Score after SMOTE: 0.9649122807017544

Accuracy: 0.9649122807017544
F1 Score: 0.9647382344750765

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97        71
           1       0.98      0.93      0.95        43

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114


Confusion Matrix:
[[70  1]
 [ 3 40]]
Fitting 5 folds for each of 24 candidates, totalling 120 fits

Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}

Best Model Accuracy: 0.9649122807017544

Best Model F1 Score: 0.9649122807017544

Feature Importances:
                    Feature  Importance
27     concave points_worst    0.157233
23               area_worst    0.125468
7       concave points_mean    0.121484
22          perimeter_worst    0.118

['imputer.pkl']