In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, r2_score
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans

# Load the dataset from CSV.
file_path = "cleaned_dataset.csv"  # Replace with the path to your CSV file
df = pd.read_csv(file_path)

# Give the label column the generic name 'target' so the rest of the script
# does not depend on the dataset-specific 'diagnosis' header.
df.rename(columns={"diagnosis": "target"}, inplace=True)

# Encode the label as integers: M (malignant) -> 0, B (benign) -> 1.
# Series.map() turns every value that is absent from the mapping into NaN, so
# verify the result and stop with a readable message instead of letting NaN
# labels flow into the train/test split and the models.
label_codes = {'M': 0, 'B': 1}
raw_labels = df['target']
df['target'] = raw_labels.map(label_codes)
unmapped = df['target'].isna()
if unmapped.any():
    unexpected = sorted(raw_labels[unmapped].astype(str).unique())
    raise ValueError(
        f"Unexpected label value(s) in 'target' column: {unexpected}; "
        f"expected one of {sorted(label_codes)}"
    )

# Features are every remaining column; the label is the encoded 'target'.
X = df.drop('target', axis=1)
y = df['target']

# Split data into training and testing sets (80% train, 20% test).
# random_state is fixed so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (important for algorithms like SVM, KNN).
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Classification models.
# Each classifier is trained on the standardized training split and scored by
# accuracy on the held-out test split. fit() returns the estimator itself, so
# training and prediction are chained into a single expression.

# Decision tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_pred = dt_model.fit(X_train_scaled, y_train).predict(X_test_scaled)
dt_accuracy = accuracy_score(y_test, dt_pred)
print(f"\nDecision Tree Accuracy: {dt_accuracy}")

# Random forest
rf_model = RandomForestClassifier(random_state=42)
rf_pred = rf_model.fit(X_train_scaled, y_train).predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, rf_pred)
print(f"\nRandom Forest Accuracy: {rf_accuracy}")

# Support vector classifier with a linear kernel
svm_model = SVC(kernel='linear', random_state=42)
svm_pred = svm_model.fit(X_train_scaled, y_train).predict(X_test_scaled)
svm_accuracy = accuracy_score(y_test, svm_pred)
print(f"\nSVM Accuracy: {svm_accuracy}")

# K-nearest neighbours (k = 5)
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_pred = knn_model.fit(X_train_scaled, y_train).predict(X_test_scaled)
knn_accuracy = accuracy_score(y_test, knn_pred)
print(f"\nKNN Accuracy: {knn_accuracy}")

# Gaussian naive Bayes
nb_model = GaussianNB()
nb_pred = nb_model.fit(X_train_scaled, y_train).predict(X_test_scaled)
nb_accuracy = accuracy_score(y_test, nb_pred)
print(f"\nNaive Bayes Accuracy: {nb_accuracy}")






# Regression models.
# Each regressor is trained on the standardized training split and scored by
# R^2 on the held-out test split.
# NOTE(review): y_train / y_test are the same 0/1 encoded class label used by
# the classifiers, so these R^2 values describe regression onto a binary
# label - confirm that this comparison is intended.

# Ordinary least squares
linear_model = LinearRegression()
linear_pred = linear_model.fit(X_train_scaled, y_train).predict(X_test_scaled)
linear_r2 = r2_score(y_test, linear_pred)
print(f"\nLinear Regression R^2: {linear_r2}")

# Decision tree regressor
dt_regressor = DecisionTreeRegressor(random_state=42)
dt_regressor_pred = dt_regressor.fit(X_train_scaled, y_train).predict(X_test_scaled)
dt_regressor_r2 = r2_score(y_test, dt_regressor_pred)
print(f"\nDecision Tree Regressor R^2: {dt_regressor_r2}")

# Random forest regressor
rf_regressor = RandomForestRegressor(random_state=42)
rf_regressor_pred = rf_regressor.fit(X_train_scaled, y_train).predict(X_test_scaled)
rf_regressor_r2 = r2_score(y_test, rf_regressor_pred)
print(f"\nRandom Forest Regressor R^2: {rf_regressor_r2}")

# Support vector regression with a linear kernel
svr_model = SVR(kernel='linear')
svr_pred = svr_model.fit(X_train_scaled, y_train).predict(X_test_scaled)
svr_r2 = r2_score(y_test, svr_pred)
print(f"\nSVR R^2: {svr_r2}")

# K-Means Clustering
# Unsupervised: the model is fitted on the standardized training features
# only (no labels) with two clusters.
# n_init is pinned explicitly because the library default differs between
# scikit-learn releases (10 historically, 'auto' in newer versions); leaving
# it unset makes the result version-dependent and triggers a FutureWarning on
# some releases.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(X_train_scaled)
print(f"\nK-Means Cluster Centers: {kmeans.cluster_centers_}")

# Assign each test sample to its nearest learned centre and report which
# cluster ids actually occur among the test predictions.
kmeans_pred = kmeans.predict(X_test_scaled)
print(f"Clusters predicted by KMeans: {np.unique(kmeans_pred)}")


# Test-set accuracy of every classifier, keyed by display name.
classification_results = {
    label: accuracy_score(y_test, predictions)
    for label, predictions in (
        ("Decision Tree", dt_pred),
        ("Random Forest", rf_pred),
        ("SVM", svm_pred),
        ("KNN", knn_pred),
        ("Naive Bayes", nb_pred),
    )
}

# Test-set R^2 of every regressor, keyed by display name.
regression_results = {
    label: r2_score(y_test, predictions)
    for label, predictions in (
        ("Linear Regression", linear_pred),
        ("Decision Tree Regressor", dt_regressor_pred),
        ("Random Forest Regressor", rf_regressor_pred),
        ("SVR", svr_pred),
    )
}

# Rank the model names by score, best first, and keep the three best as
# (name, score) pairs. sorted() is stable, so models with equal scores keep
# the order in which they were inserted above.
ranked_classifiers = sorted(
    classification_results, key=classification_results.get, reverse=True
)
top_classification = [
    (label, classification_results[label]) for label in ranked_classifiers[:3]
]

ranked_regressors = sorted(
    regression_results, key=regression_results.get, reverse=True
)
top_regression = [
    (label, regression_results[label]) for label in ranked_regressors[:3]
]

# Report both leaderboards, scores rounded to two decimals.
print("\nTop 3 Classification Models:")
for model, score in top_classification:
    print(f"{model}: Accuracy = {score:.2f}")

print("\nTop 3 Regression Models:")
for model, score in top_regression:
    print(f"{model}: R^2 = {score:.2f}")




Decision Tree Accuracy: 0.9473684210526315

Random Forest Accuracy: 0.9649122807017544

SVM Accuracy: 0.956140350877193

KNN Accuracy: 0.9473684210526315

Naive Bayes Accuracy: 0.9649122807017544

Linear Regression R^2: 0.7271016126223555

Decision Tree Regressor R^2: 0.7759580740255486

Random Forest Regressor R^2: 0.8605413691451032

SVR R^2: 0.7077728093534708

K-Means Cluster Centers: [[-0.46528158 -0.22913219 -0.48506878 -0.46017817 -0.33698586 -0.54167022
  -0.58348398 -0.59081657 -0.33044035 -0.17410864 -0.40610396 -0.02880428
  -0.41196112 -0.37605872 -0.03811366 -0.36173664 -0.32659818 -0.396698
  -0.09232869 -0.23523586 -0.50383462 -0.24916291 -0.52257079 -0.48620178
  -0.33373666 -0.50822975 -0.54425486 -0.58513965 -0.3213992  -0.35982936]
 [ 0.89178969  0.43917003  0.92971515  0.88200816  0.64588956  1.03820125
   1.1183443   1.13239843  0.63334401  0.33370822  0.77836593  0.0552082
   0.78959215  0.72077921  0.07305119  0.69332857  0.62597984  0.76033783
   0.17696332  0.