In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn import tree
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve,  roc_auc_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import GridSearchCV


In [2]:
# Load the patient records from the CSV file (path is relative to the
# notebook's working directory).
DATA_FILE = 'cancer-patient-data-sets.csv'
cancer_data = pd.read_csv(DATA_FILE)
# Preview the first rows to confirm the columns were parsed as expected.
cancer_data.head()

Unnamed: 0,index,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,0,P1,33,1,2,4,5,4,3,2,...,3,4,2,2,3,1,2,3,4,Low
1,1,P10,17,1,3,1,5,3,4,2,...,1,3,7,8,6,2,1,7,2,Medium
2,2,P100,35,1,4,5,6,5,5,4,...,8,7,9,2,1,4,6,7,2,High
3,3,P1000,37,1,7,7,7,7,6,7,...,4,2,3,1,4,5,6,7,5,High
4,4,P101,46,1,6,8,7,7,7,6,...,3,2,4,1,4,2,4,2,3,High


In [3]:
# Collapse the three risk levels into a binary target:
# 'Low' -> 0, while 'Medium' and 'High' are both encoded as 1.
# NOTE(review): merging 'Medium' with 'High' looks intentional (binary
# low-vs-elevated risk) -- confirm this is the desired problem framing.
level_codes = {'Low': 0, 'High': 1, 'Medium': 1}

# Series.map turns every value that is not a key of the dict into NaN, so
# re-running this cell on the already-encoded column would wipe the target.
# Only encode while the column does not yet consist solely of the codes.
already_encoded = cancer_data['Level'].isin(set(level_codes.values())).all()
if not already_encoded:
    encoded_level = cancer_data['Level'].map(level_codes)
    unmapped = encoded_level.isna()
    if unmapped.any():
        # Fail loudly instead of silently carrying NaN labels downstream.
        unknown = sorted(cancer_data.loc[unmapped, 'Level'].astype(str).unique())
        raise ValueError(f"Unexpected values in 'Level' column: {unknown}")
    cancer_data['Level'] = encoded_level
cancer_data.head()

Unnamed: 0,index,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,0,P1,33,1,2,4,5,4,3,2,...,3,4,2,2,3,1,2,3,4,0
1,1,P10,17,1,3,1,5,3,4,2,...,1,3,7,8,6,2,1,7,2,1
2,2,P100,35,1,4,5,6,5,5,4,...,8,7,9,2,1,4,6,7,2,1
3,3,P1000,37,1,7,7,7,7,6,7,...,4,2,3,1,4,5,6,7,5,1
4,4,P101,46,1,6,8,7,7,7,6,...,3,2,4,1,4,2,4,2,3,1


In [4]:
# Remove the identifier columns, which carry no predictive information.
# errors='ignore' skips any label that is not present, so this cell can be
# re-run safely after the columns have already been dropped.
id_columns = ['index', 'Patient Id']
cancer_data = cancer_data.drop(columns=id_columns, errors='ignore')
cancer_data.head()

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,33,1,2,4,5,4,3,2,2,4,...,3,4,2,2,3,1,2,3,4,0
1,17,1,3,1,5,3,4,2,2,2,...,1,3,7,8,6,2,1,7,2,1
2,35,1,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,1
3,37,1,7,7,7,7,6,7,7,7,...,4,2,3,1,4,5,6,7,5,1
4,46,1,6,8,7,7,7,6,7,7,...,3,2,4,1,4,2,4,2,3,1


In [5]:
# Target column, kept separately so it can drive the stratified split.
target = cancer_data.loc[:, 'Level']

In [6]:
# Fixed seed so the train/test partition is reproducible across runs.
seed = 7
# 80/20 split of the full frame (features and 'Level' together); stratifying
# on the target keeps the class proportions equal in both partitions.
train_data, test_data = train_test_split(cancer_data, test_size=0.2, random_state=seed, stratify=target)
# shape of the datasets
print('\nShape of training data :',train_data.shape)
print('\nShape of testing data :',test_data.shape)
# class distribution of the training data
# (Series.value_counts replaces the top-level pd.value_counts, which is
# deprecated in recent pandas releases and emits a FutureWarning.)
print(train_data['Level'].value_counts())
# class distribution of the test data
print(test_data['Level'].value_counts())


Shape of training data : (800, 24)

Shape of testing data : (200, 24)
Level
1    558
0    242
Name: count, dtype: int64
Level
1    139
0     61
Name: count, dtype: int64


In [7]:
# separate the independent and target variables from training data
# Split each partition into its feature matrix (every column except 'Level')
# and its label vector ('Level').
label_column = 'Level'
train_x, train_y = train_data.drop(columns=label_column), train_data[label_column]
test_x, test_y = test_data.drop(columns=label_column), test_data[label_column]

In [15]:
# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto', 0.1, 1, 10]
}

# Initialize the SVM model
svm_model = SVC()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=svm_model, param_grid=param_grid, scoring='accuracy', cv=3)

# Perform grid search on the training data
grid_search.fit(train_x, train_y)


In [9]:
# Get the best parameters from grid search
best_params = grid_search.best_params_

# GridSearchCV refits the winning configuration on the whole training set by
# default (refit=True), so the fitted model is already available as
# best_estimator_; building and fitting an identical SVC again is redundant.
best_svm_model = grid_search.best_estimator_

# Make predictions on the held-out test set
predictions = best_svm_model.predict(test_x)

# Evaluate the model
accuracy = accuracy_score(test_y, predictions)

# Display results (the accuracy was previously computed but never shown)
print(f'Best Parameters: {best_params}')
print(f'Test Accuracy: {accuracy}')


Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}


In [13]:
# F1 score on the test set; average='weighted' averages the per-class F1
# values weighted by the number of true instances of each class.
f1 = f1_score(
    y_true=test_y,
    y_pred=predictions,
    average='weighted',
)
print("F1 Score:", f1)

F1 Score: 1.0
