In [23]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.dummy import DummyClassifier

# Train and evaluate an SVM classifier on the breast-cancer dataset:
# load -> encode -> split -> scale (fit on train only) -> grid search ->
# compare against a majority-class baseline -> print a report.

data = pd.read_csv('breast-cancer.csv')
# read_csv already returns a DataFrame; no need to wrap it again.
df = data

# separate X and y
# NOTE(review): if the CSV carries a row-identifier column (e.g. 'id'), it is
# currently kept as a feature -- confirm against the file and drop it if so.
X = df.drop(columns=['diagnosis'])

# Encode the target explicitly. `map` turns any unexpected label into NaN,
# which is then reported instead of being passed silently to the model.
y = df['diagnosis'].map({'B': 0, 'M': 1})
if y.isna().any():
    unexpected = sorted(df.loc[y.isna(), 'diagnosis'].astype(str).unique())
    raise ValueError(f"unexpected diagnosis labels: {unexpected}")
y = y.astype(int)

# get dummies
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# get_dummies only replaces the listed columns and returns every other column
# unchanged, so the numeric features are already present in the result.
# Concatenating df[numerical_columns] again would duplicate each of them.
df_encoded = pd.get_dummies(X, columns=categorical_columns)
X = df_encoded

# split data BEFORE scaling so no test-set statistics leak into training
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# scale: fit on the training split only, then apply the same transform to test
# NOTE(review): the CV folds inside the grid search still share these scaler
# statistics; wrapping scaler + SVC in a sklearn Pipeline would remove that too.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# define the model
base_svc = SVC()

# define a Grid Search
param_grid = {
    'kernel': ['linear', 'rbf', 'poly'],
    'C': [0.1, 1, 10, 100],
}

# Use a classification metric. 'neg_mean_squared_error' is a regression scorer;
# it only happens to rank candidates like accuracy when labels are exactly 0/1.
grid_search = GridSearchCV(base_svc, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# get best parameters for model
best_params = grid_search.best_params_

# GridSearchCV (refit=True by default) has already refitted the best
# configuration on the full training split, so reuse it instead of fitting again.
svc = grid_search.best_estimator_

# make predictions
y_train_pred = svc.predict(X_train)
y_test_pred = svc.predict(X_test)

# calculate accuracy
accuracy_train = accuracy_score(y_train, y_train_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)

# define a baseline model that always predicts the majority class
baseline = DummyClassifier(strategy='most_frequent')
baseline.fit(X_train, y_train)
baseline_train_pred = baseline.predict(X_train)
baseline_test_pred = baseline.predict(X_test)
baseline_train_accuracy = accuracy_score(y_train, baseline_train_pred)
baseline_test_accuracy = accuracy_score(y_test, baseline_test_pred)

# SVC.score returns mean accuracy, not R^2. The names are kept only so that
# code referring to them keeps working; the values equal the accuracies above.
r_squared_train = accuracy_train
r_squared_test = accuracy_test

# print
print('accuracy train = ', accuracy_train)
print('accuracy test = ', accuracy_test)
print('----------------------')
print('accuracy baseline train = ', baseline_train_accuracy)
print('accuracy baseline test = ', baseline_test_accuracy)
print('----------------------')
target_names = ['Class 0 (B)', 'Class 1 (M)']
print(classification_report(y_test, y_test_pred, target_names=target_names))


accuracy train =  0.9868131868131869
accuracy test =  0.9824561403508771
----------------------
accuracy baseline train =  0.6285714285714286
accuracy baseline test =  0.6228070175438597
----------------------
              precision    recall  f1-score   support

 Class 0 (B)       0.97      1.00      0.99        71
 Class 1 (M)       1.00      0.95      0.98        43

    accuracy                           0.98       114
   macro avg       0.99      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114

