In [10]:
import joblib
import numpy as np
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [11]:
# Read the raw dataset from a CSV file located in the working directory.
data_path = 'data.csv'
data = pd.read_csv(data_path)

In [12]:
# Candidate predictor columns, grouped by the statistic encoded in the suffix.
mean_features = [
    'radius_mean',
    'texture_mean',
    'perimeter_mean',
    'area_mean',
    'smoothness_mean',
    'compactness_mean',
    'concavity_mean',
    'concave points_mean',
    'symmetry_mean',
    'fractal_dimension_mean',
]
worst_features = [
    'radius_worst',
    'texture_worst',
    'perimeter_worst',
    'area_worst',
    'smoothness_worst',
    'compactness_worst',
    'concavity_worst',
    'concave points_worst',
    'symmetry_worst',
    'fractal_dimension_worst',
]

# Full candidate list: all `_mean` columns first, then all `_worst` columns.
features = mean_features + worst_features

# Name of the column that holds the label to predict.
target = 'diagnosis'

In [13]:
# Feature matrix: only the candidate predictor columns.
X = data[features]

# Binary label: 1 where the target column equals 'M', 0 for every other value.
is_malignant = data[target].eq('M')
y = is_malignant.astype('int64')

In [14]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# partition reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=2)
X_train, X_test, y_train, y_test = split_parts

# Subset of the candidate columns that the models below are trained on.
selected_features = [
    'compactness_mean',
    'concavity_mean',
    'concave points_mean',
    'radius_worst',
    'texture_worst',
    'perimeter_worst',
    'area_worst',
    'smoothness_worst',
    'concave points_worst',
    'symmetry_worst',
]


In [15]:
# Restrict both partitions to the chosen columns.
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Learn the scaling statistics from the training partition only, then apply
# the same fitted transformation to both partitions.
scaler = StandardScaler().fit(X_train_selected)
X_train_scaled = scaler.transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

In [16]:
# Persist the fitted scaler to disk so the same scaling can be re-applied later.
joblib.dump(value=scaler, filename='rfe_scaler.pkl')

['rfe_scaler.pkl']

In [17]:
# Unfitted estimators to compare; every one receives the same seed.
model_seed = 2

log_reg = LogisticRegression(random_state=model_seed, max_iter=1000)
dt_model = DecisionTreeClassifier(random_state=model_seed)
svm_model = SVC(random_state=model_seed, kernel='linear')

FEATURE SELECTION FOR LOGISTIC REGRESSION

In [18]:
# Recursive feature elimination driven by the logistic regression estimator.
# NOTE(review): X_train_scaled is built from the 10 columns in
# `selected_features`, and RFE is asked to keep 10, so nothing is eliminated
# here — confirm whether RFE was meant to run on the full candidate set.
rfe_log_reg = RFE(estimator=log_reg, n_features_to_select=10)
X_train_log_reg_selected = rfe_log_reg.fit_transform(X_train_scaled, y_train)
X_test_log_reg_selected = rfe_log_reg.transform(X_test_scaled)

# Fit the estimator on the RFE output and score it on the held-out rows.
log_reg.fit(X_train_log_reg_selected, y_train)
log_reg_predictions = log_reg.predict(X_test_log_reg_selected)
log_reg_accuracy = accuracy_score(y_test, log_reg_predictions)

FEATURE SELECTION FOR DECISION TREE

In [19]:
# Recursive feature elimination driven by the decision tree estimator.
# NOTE(review): X_train_scaled is built from the 10 columns in
# `selected_features`, and RFE is asked to keep 10, so nothing is eliminated
# here — confirm whether RFE was meant to run on the full candidate set.
rfe_dt = RFE(estimator=dt_model, n_features_to_select=10)
X_train_dt_selected = rfe_dt.fit_transform(X_train_scaled, y_train)
X_test_dt_selected = rfe_dt.transform(X_test_scaled)

# Fit the estimator on the RFE output and score it on the held-out rows.
dt_model.fit(X_train_dt_selected, y_train)
dt_predictions = dt_model.predict(X_test_dt_selected)
dt_accuracy = accuracy_score(y_test, dt_predictions)

FEATURE SELECTION FOR SVM

In [20]:
# Recursive feature elimination driven by the linear-kernel SVM estimator.
# NOTE(review): X_train_scaled is built from the 10 columns in
# `selected_features`, and RFE is asked to keep 10, so nothing is eliminated
# here — confirm whether RFE was meant to run on the full candidate set.
rfe_svm = RFE(estimator=svm_model, n_features_to_select=10)
X_train_svm_selected = rfe_svm.fit_transform(X_train_scaled, y_train)
X_test_svm_selected = rfe_svm.transform(X_test_scaled)

# Fit the estimator on the RFE output and score it on the held-out rows.
svm_model.fit(X_train_svm_selected, y_train)
svm_predictions = svm_model.predict(X_test_svm_selected)
svm_accuracy = accuracy_score(y_test, svm_predictions)

In [21]:
# Test-set accuracy of each model, one line per model, four decimal places.
accuracy_report = [
    f'Logistic Regression Accuracy: {log_reg_accuracy:.4f}',
    f'Decision Tree Accuracy: {dt_accuracy:.4f}',
    f'SVM Accuracy: {svm_accuracy:.4f}',
]
print('\n'.join(accuracy_report))

Logistic Regression Accuracy: 0.9737
Decision Tree Accuracy: 0.9123
SVM Accuracy: 0.9737


In [24]:
def _rfe_kept_features(rfe, column_names):
    """Return the positions and names of the columns a fitted RFE selector kept.

    Parameters
    ----------
    rfe : object
        Fitted selector exposing a boolean ``support_`` mask.
    column_names : list of str
        Column names, in the same order as the columns of the matrix the
        selector was fitted on.

    Returns
    -------
    tuple
        ``(indices, names)``: the integer positions where ``support_`` is
        true, and the entries of ``column_names`` at those positions.

    Raises
    ------
    ValueError
        If the mask and ``column_names`` differ in length, which would make
        the position-to-name mapping wrong or raise an opaque IndexError.
    """
    mask = np.asarray(rfe.support_)
    if mask.shape[0] != len(column_names):
        raise ValueError(
            f'support_ has {mask.shape[0]} entries but {len(column_names)} '
            'column names were given'
        )
    kept_indices = np.flatnonzero(mask)
    return kept_indices, [column_names[i] for i in kept_indices]


# The selectors were fitted on X_train_scaled, whose columns follow the order
# of `selected_features`, so mask positions map directly onto that list.

# Logistic Regression
selected_indices_log_reg, selected_features_log_reg = _rfe_kept_features(rfe_log_reg, selected_features)
print(f'Selected Features for Logistic Regression: {selected_features_log_reg}')

# Decision Tree
selected_indices_dt, selected_features_dt = _rfe_kept_features(rfe_dt, selected_features)
print(f'Selected Features for Decision Tree: {selected_features_dt}')

# SVM
selected_indices_svm, selected_features_svm = _rfe_kept_features(rfe_svm, selected_features)
print(f'Selected Features for SVM: {selected_features_svm}')

Selected Features for Logistic Regression: ['compactness_mean', 'concavity_mean', 'concave points_mean', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'concave points_worst', 'symmetry_worst']
Selected Features for Decision Tree: ['compactness_mean', 'concavity_mean', 'concave points_mean', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'concave points_worst', 'symmetry_worst']
Selected Features for SVM: ['compactness_mean', 'concavity_mean', 'concave points_mean', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'concave points_worst', 'symmetry_worst']


In [25]:
# joblib is already imported in the first cell; this re-import is redundant but harmless.
import joblib

# Save the trained SVM model (`svm_model`) to 'svm_model.pkl'.
joblib.dump(svm_model, 'svm_model.pkl')

['svm_model.pkl']

In [27]:
# Sanity check: show the dimensions of the scaled training matrix.
print(X_train_scaled.shape)  # Should print (num_samples, num_features)


(455, 10)
