In [1]:
%%capture
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go


from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [2]:
train_data_original = pd.read_csv("train.csv")
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [3]:

# Adding new column
train_data["BMI"] = train_data["Weight"] / (train_data['Height'] ** 2)

train_data['Gender'] = train_data['Gender'].map({'Male':  1, 'Female':  0})
train_data['FAVC'] = train_data['FAVC'].map({'yes':  1, 'no':  0})
train_data['SCC'] = train_data['SCC'].map({'yes':  1, 'no':  0})
train_data['SMOKE'] = train_data['SMOKE'].map({'yes':  1, 'no':  0})
train_data['family_history_with_overweight'] = train_data['family_history_with_overweight'].map({'yes':  1, 'no':  0})
train_data['CAEC'] = train_data['CAEC'].map({'no':  0, 'Sometimes':  1, 'Frequently': 2, "Always": 3})
train_data['CALC'] = train_data['CALC'].map({'no':  0, 'Sometimes':  1, 'Frequently': 2})
train_data['MTRANS'] = train_data['MTRANS'].map({'Public_Transportation':  0, 'Automobile':  1, 'Walking': 2, "Motorbike": 3})
train_data['NObeyesdad'] = train_data['NObeyesdad'].map({'Overweight_Level_II': 0, 'Normal_Weight': 1, 'Insufficient_Weight': 2, 'Obesity_Type_III': 3, 'Obesity_Type_II': 4, 'Overweight_Level_I': 5, 'Obesity_Type_I': 6})

In [4]:
train = train_data.drop(['id'], axis=1)
test = test_data.drop(['id'], axis=1)

In [5]:
test['BMI'] = test['Weight'] / (test['Height'] ** 2)

test['Gender'] = test['Gender'].map({'Male':  1, 'Female':  0})
test['FAVC'] = test['FAVC'].map({'yes':  1, 'no':  0})
test['SCC'] = test['SCC'].map({'yes':  1, 'no':  0})
test['SMOKE'] = test['SMOKE'].map({'yes':  1, 'no':  0})
test['family_history_with_overweight'] = test['family_history_with_overweight'].map({'yes':  1, 'no':  0})
test['CAEC'] = test['CAEC'].map({'no':  0, 'Sometimes':  1, 'Frequently': 2, "Always": 3})
test['CALC'] = test['CALC'].map({'no':  0, 'Sometimes':  1, 'Frequently': 2})
test['MTRANS'] = test['MTRANS'].map({'Public_Transportation':  0, 'Automobile':  1, 'Walking': 2, "Motorbike": 3})

In [6]:
X = train.drop(['NObeyesdad'], axis=1)
y = train['NObeyesdad']

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [9]:
# Confusion matrix
def plot_confusion_matrix(y_val, y_pred):
    from sklearn.metrics import confusion_matrix, classification_report
    from lightgbm import LGBMClassifier, plot_importance
    plt.figure(figsize=(15, 6))
    conf_matrix = confusion_matrix(y_val, y_pred)
    conf_labels = [f'{i}' for i in range(conf_matrix.shape[0])]
    conf_matrix_df = pd.DataFrame(conf_matrix, columns=conf_labels, index=conf_labels)
    plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
    plt.xticks(np.arange(conf_matrix.shape[0]), conf_labels, rotation=45)
    plt.yticks(np.arange(conf_matrix.shape[0]), conf_labels)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            plt.text(j, i, str(conf_matrix[i, j]), ha='center', va='center', color='black')
    plt.grid(False)
    plt.show()
    

In [19]:
def make_svm_predictions(X_train, X_val, y_train, y_val, C_parameter=None, kernel=None):
    if kernel and C_parameter is not None:
        classifier = SVC(kernel=kernel, C=C_parameter)
    else:
        classifier = SVC()
    classifier.fit(X_train, y_train)
    classifier_preds = classifier.predict(X_val)

    print(f'{kernel} classifier accuracy score with C={C_parameter} : {accuracy_score(y_val, classifier_preds) * 100:0.2f}%')
    return classifier_preds

In [17]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_val = imputer.fit_transform(X_val)

In [18]:
default_preds = make_svm_predictions(X_train, X_val, y_train, y_val)
rbf_preds = make_svm_predictions(X_train, X_val, y_train, y_val, "rbf", C_parameter=100.0)
polynomial_preds = make_svm_predictions(X_train, X_val, y_train, y_val, 'poly', 100.0)
sigmoid_preds = make_svm_predictions(X_train, X_val, y_train, y_val, 'sigmoid', 1.0)
linear_preds = make_svm_predictions(X_train, X_val, y_train, y_val, 'linear', 1.0)

None classifier accuracy score with C=None : 82.54%
rbf classifier accuracy score with C=100.0 : 86.97%
poly classifier accuracy score with C=100.0 : 86.73%
sigmoid classifier accuracy score with C=1.0 : 1.04%
linear classifier accuracy score with C=1.0 : 87.21%


In [22]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C':[1,10,100,1000],'gamma':[1,0.1,0.001,0.0001], 'kernel':['linear','rbf']}
grid = GridSearchCV(SVC(),param_grid,refit = True, verbose=2)

grid.fit(X_train,y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END ........................C=1, gamma=1, kernel=linear; total time=  11.0s
[CV] END ........................C=1, gamma=1, kernel=linear; total time=  12.9s
[CV] END ........................C=1, gamma=1, kernel=linear; total time=  10.5s
[CV] END ........................C=1, gamma=1, kernel=linear; total time=  14.0s
[CV] END ........................C=1, gamma=1, kernel=linear; total time=  15.0s
[CV] END ...........................C=1, gamma=1, kernel=rbf; total time=  43.4s
[CV] END ...........................C=1, gamma=1, kernel=rbf; total time=  38.4s
[CV] END ...........................C=1, gamma=1, kernel=rbf; total time=  39.0s
[CV] END ...........................C=1, gamma=1, kernel=rbf; total time=  35.7s
[CV] END ...........................C=1, gamma=1, kernel=rbf; total time=  35.1s
[CV] END ......................C=1, gamma=0.1, kernel=linear; total time=  10.0s
[CV] END ......................C=1, gamma=0.1, 

KeyboardInterrupt: 

In [None]:
grid.best_params_