## Importing the data and Exploring the Data

In [1]:
import pandas as pd
import requests
import zipfile
import io
from scipy.io import arff
from sklearn.model_selection import train_test_split
import warnings
import os
warnings.filterwarnings("ignore")
from sklearn.metrics import accuracy_score

In [2]:
f_zip = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00212/vertebral_column_data.zip'
r = requests.get(f_zip, stream=True)
Vertebral_zip = zipfile.ZipFile(io.BytesIO(r.content))
Vertebral_zip.extractall()

In [3]:
data = arff.loadarff('column_2C_weka.arff')
df = pd.DataFrame(data[0])

In [4]:
df.shape

(310, 7)

In [5]:
df.columns

Index(['pelvic_incidence', 'pelvic_tilt', 'lumbar_lordosis_angle',
       'sacral_slope', 'pelvic_radius', 'degree_spondylolisthesis', 'class'],
      dtype='object')

In [6]:
class_mapper = {b'Abnormal':1,b'Normal':0}
df['class']=df['class'].replace(class_mapper)

## Step 1: Preparing the data for Training

In [7]:
cols = df.columns.tolist()
cols = cols[-1:] + cols[:-1]
df = df[cols]

In [8]:
df.columns
df.head()

Unnamed: 0,class,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis
0,1,63.027817,22.552586,39.609117,40.475232,98.672917,-0.2544
1,1,39.056951,10.060991,25.015378,28.99596,114.405425,4.564259
2,1,68.832021,22.218482,50.092194,46.613539,105.985135,-3.530317
3,1,69.297008,24.652878,44.311238,44.64413,101.868495,11.211523
4,1,49.712859,9.652075,28.317406,40.060784,108.168725,7.918501


## Splitting the data

In [9]:
from sklearn.model_selection import train_test_split
train, test_and_validate = train_test_split(df, test_size=0.2, random_state=42, stratify=df['class'])

In [10]:
train

Unnamed: 0,class,pelvic_incidence,pelvic_tilt,lumbar_lordosis_angle,sacral_slope,pelvic_radius,degree_spondylolisthesis
202,1,76.314028,41.933683,93.284863,34.380345,132.267286,101.218783
178,1,80.654320,26.344379,60.898118,54.309940,120.103493,52.467552
68,1,72.076278,18.946176,51.000000,53.130102,114.213013,1.010041
118,1,65.536003,24.157487,45.775170,41.378515,136.440302,16.378086
182,1,75.437748,31.539454,89.600000,43.898294,106.829590,54.965789
...,...,...,...,...,...,...,...
282,0,53.683380,13.447022,41.584297,40.236358,113.913703,2.737035
265,0,48.170746,9.594217,39.710920,38.576530,135.623310,5.360051
180,1,37.903910,4.479099,24.710274,33.424811,157.848799,33.607027
28,1,44.551012,21.931147,26.785916,22.619865,111.072920,2.652321


In [11]:
test, validate = train_test_split(test_and_validate, test_size=0.5, random_state=42, stratify=test_and_validate['class'])

In [12]:
print(train.shape)
print(test.shape)
print(validate.shape)

(248, 7)
(31, 7)
(31, 7)


In [13]:
print(train['class'].value_counts())
print(test['class'].value_counts())
print(validate['class'].value_counts())

class
1    168
0     80
Name: count, dtype: int64
class
1    21
0    10
Name: count, dtype: int64
class
1    21
0    10
Name: count, dtype: int64


## Step 2: Training the model

In [20]:
from xgboost import XGBClassifier
model = XGBClassifier(objective='binary:logistic', eval_metric='auc', num_round=42)
print(model.fit(train.drop(['class'], axis = 1).values, train['class'].values))
print("Training Completed")

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='auc', feature_types=None,
              feature_weights=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=None,
              n_jobs=None, num_parallel_tree=None, ...)
Training Completed
