In [2]:
import os

import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [3]:
# Dataset preprocessing: load the CSV, draw a reproducible sample, integer-encode
# the two categorical columns, drop incomplete rows, then split features/target.

# Location of the CSV. Defaults to the original absolute path; set the
# DIABETES_CSV environment variable to run the notebook on another machine.
DATA_PATH = os.environ.get(
    'DIABETES_CSV',
    '/Users/gracechoi/Documents/y4q4 - summer/cs 148/group_proj/diabetes.csv',
)
SAMPLE_SIZE = 50000

# Integer codes for the categorical columns.
# NOTE(review): these codes reach the SVM as ordinary numeric features, so the
# model sees an ordering between categories -- consider one-hot encoding.
GENDER_CODES = {'Male': 0, 'Female': 1, 'Other': 2}
SMOKING_CODES = {
    'not current': 0,
    'former': 1,
    'No Info': 2,
    'current': 3,
    'never': 4,
    'ever': 5,
}

pre_data = pd.read_csv(DATA_PATH)

# Cap the sample at the number of available rows: sampling without replacement
# raises ValueError when n is larger than the frame.
data = pre_data.sample(n=min(SAMPLE_SIZE, len(pre_data)), random_state=42)

# Series.map yields NaN for any value that is not a key of the dict, so rows
# holding an unexpected category are removed by the dropna() below.
data['gender'] = data['gender'].map(GENDER_CODES)
data['smoking_history'] = data['smoking_history'].map(SMOKING_CODES)

# Rebind rather than mutate in place; the resulting frame is the same.
data = data.dropna()

# Every column except the target is used as a feature.
X = data.drop('diabetes', axis=1)
y = data['diabetes']

In [4]:
# Preview the preprocessed frame using the notebook's rich display.
# pandas is already imported in the notebook's first cell and `pd` is not
# referenced in this cell, so the re-import below has no effect here.
import pandas as pd
from IPython.display import display

# `data` is the sampled, encoded frame built by the preprocessing cell.
display(data)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
75721,1,13.0,0,0,2,20.82,5.8,126,0
80184,1,3.0,0,0,2,21.00,5.0,145,0
19864,0,63.0,0,0,1,25.32,3.5,200,0
76699,1,2.0,0,0,4,17.43,6.1,126,0
92991,1,33.0,0,0,0,40.08,6.2,200,1
...,...,...,...,...,...,...,...,...,...
33072,0,18.0,0,0,2,27.32,4.5,90,0
95599,0,55.0,1,0,1,23.46,6.6,140,0
28006,0,64.0,1,0,4,27.06,5.8,159,0
90487,1,21.0,0,0,2,24.27,5.7,130,0


In [5]:
# Train/test split followed by feature standardisation.

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified although the recorded test set has
# only 834 positives out of 10000 -- consider stratify=y; confirm with authors.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
# Grid search over SVC hyper-parameters with 5-fold cross-validation.

# Candidate values; kept as a plain dict so the values stay in one place.
param_grid = {
    'C': [0.1, 1, 10, 100, 300],
    'kernel': ['linear', 'rbf'],
    'gamma': [0.01, 0.1, 1, 10],
}

# SVC's gamma is the kernel coefficient for 'rbf', 'poly' and 'sigmoid'; it has
# no effect on the linear kernel. Crossing it with 'linear' would therefore fit
# every linear model four times (15 duplicate candidates, 75 fits with cv=5).
# A list of per-kernel grids searches the same distinct models without repeats.
search_space = [
    {'kernel': ['linear'], 'C': param_grid['C']},
    {'kernel': ['rbf'], 'C': param_grid['C'], 'gamma': param_grid['gamma']},
]

# n_jobs=-1 runs the independent fits on all available cores; it does not
# change the scores.
grid_search = GridSearchCV(SVC(), search_space, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print('Best Parameters:', grid_search.best_params_)
print('Best Score:', grid_search.best_score_)

Best Parameters: {'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}
Best Score: 0.9670249999999999


In [7]:
# Evaluate the tuned model on the held-out test set.

# GridSearchCV refits the best parameter combination on the full training data
# when refit=True (its default, which the search does not override), so
# best_estimator_ is already fitted. Fitting it again would only repeat that
# work, so the extra fit has been dropped.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Report per-class metrics as well as accuracy: with an imbalanced target the
# overall accuracy alone can hide weak recall on the minority class.
print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

Accuracy: 0.97
Confusion Matrix:
[[9149   17]
 [ 314  520]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      9166
           1       0.97      0.62      0.76       834

    accuracy                           0.97     10000
   macro avg       0.97      0.81      0.87     10000
weighted avg       0.97      0.97      0.96     10000

