In [None]:
# Author: Justin Collier
# Date Created: 6/01/2024
# Last Altered: 6/01/2024

# Library Imports

# Data manipulation libraries
import pandas as pd
import numpy as np

# Counter
from collections import Counter

# MatPlotLib
import matplotlib.pyplot as plt

# Seaborn
import seaborn as sns

# ML libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



In [None]:
# Load the dataset

# Labels are built here, based on the info provided from the source: an 'ID'
# column, a 'Class' column, then thirty columns named 'Feature1' .. 'Feature30'.
column_names = ['ID', 'Class'] + [f'Feature{n}' for n in range(1, 31)]

# Read the comma-separated file. Because `names` is supplied, pandas uses these
# labels as the header and treats every line of the file as a data row.
datafile = pd.read_csv(
    "./wdbc.data",
    names=column_names,
    delimiter=',',
)

# Print a summary of every column (name, non-null count, dtype)
datafile.info()

In [None]:
# Preprocess the data

# Features: every column except the identifier and the target label
X = datafile.drop(['ID', 'Class'], axis=1)

# Target: encode the 'Class' label numerically ('M' -> 1, 'B' -> 0)
y = datafile['Class'].map({'M': 1, 'B': 0})

# Series.map turns any label that is missing from the dict into NaN without
# complaint. Stop here with an explicit message instead of carrying NaN targets
# into the split and the model.
if y.isna().any():
    unexpected = sorted(datafile.loc[y.isna(), 'Class'].astype(str).unique())
    raise ValueError(f"Unexpected values in 'Class' column: {unexpected}")

# Split the dataset into training and testing sets (20% held out for testing;
# random_state is fixed so the same split is produced on every run)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# For model 2 we are using Random Forest.
# X_train / X_test / y_train / y_test come from the split made in the
# preprocessing cell; repeating that identical call here (same inputs, same
# test_size, same random_state) was redundant, so it is not done again.
# The forest uses 100 trees, and random_state is fixed so repeated runs give
# the same model.
model = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# Train the model on the training split, then evaluate it by predicting the
# labels of the held-out test split. `fit` returns the fitted estimator itself,
# so both steps can be chained; `model` is still the trained forest afterwards.
y_pred = model.fit(X_train, y_train).predict(X_test)

In [None]:
# Calculate accuracy: compare the test labels with the model's predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Display classification report (heading and body are printed on separate
# lines by using a newline as the separator)
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

# Display confusion matrix, using the same heading-then-body layout
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")





In [None]:
# Feature importance

# Column labels of the feature matrix and the importances of the fitted forest
feature_names = X.columns
feature_importance = model.feature_importances_

# argsort yields positions in ascending order of importance; reversing it
# ranks the features from highest to lowest importance
indices = np.argsort(feature_importance)[::-1]

# Feature names in that same ranked order, as a plain list
sorted_feature_names = list(feature_names[indices])

In [None]:
# Plot feature importance as a horizontal bar chart: one bar per feature,
# ordered by the ranking computed above
plt.figure(figsize=(10, 6))
ax = sns.barplot(x=feature_importance[indices], y=sorted_feature_names)

# Title and axis labels are set on the Axes that seaborn drew into
ax.set(title="Feature Importance", xlabel="Importance", ylabel="Feature")
plt.show()