# <font color='00A9FF'>Importing Libraries & Data</font>

In [None]:
# Load the packages used throughout the notebook, then report the versions in use
import sys  # access to system parameters https://docs.python.org/3/library/sys.html
import pandas as pd  # collection of functions for data processing and analysis modeled after R dataframes with SQL like features

print("Python version: {}".format(sys.version))

print("pandas version: {}".format(pd.__version__))

Python version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0]
pandas version: 1.5.3


# <font color='00A9FF'>CSV files are downloaded from the GitHub repository</font>

In [None]:
import os
!git clone https://github.com/Jhonier-Jimenez/kidney-stone-prediction.git
os.chdir('/content/kidney-stone-prediction/fase-1')

Cloning into 'kidney-stone-prediction'...
remote: Enumerating objects: 78, done.[K
remote: Counting objects: 100% (78/78), done.[K
remote: Compressing objects: 100% (74/74), done.[K
remote: Total 78 (delta 22), reused 6 (delta 0), pack-reused 0[K
Receiving objects: 100% (78/78), 1.22 MiB | 9.90 MiB/s, done.
Resolving deltas: 100% (22/22), done.


# <font color='00A9FF'>Making our datasets available in our coding environment</font>

In [None]:
# Names of the CSV files to load, given relative to the current working directory
TRAIN_FILENAME, TEST_FILENAME = "train.csv", "test.csv"

# <font color='00A9FF'>Reading in our csv files and putting them into a dataframe object</font>

In [None]:
# Visual divider printed after each shape
divider = '-' * 50

# Load the training CSV and report its (rows, columns) shape
train_data = pd.read_csv(TRAIN_FILENAME)
print(train_data.shape, divider, sep='\n')

# Load the test CSV and report its (rows, columns) shape
test_data = pd.read_csv(TEST_FILENAME)
print(test_data.shape, divider, sep='\n')

(414, 8)
--------------------------------------------------
(276, 7)
--------------------------------------------------


# <font color='00A9FF'>Observing small samples of our datasets with .head() in order to get familiar with how our dataset looks and is organized</font>

In [None]:
# Preview the first five rows of the training set
train_data.head(n=5)

Unnamed: 0,id,gravity,ph,osmo,cond,urea,calc,target
0,0,1.013,6.19,443,14.8,124,1.45,0
1,1,1.025,5.4,703,23.6,394,4.18,0
2,2,1.009,6.13,371,24.5,159,9.04,0
3,3,1.021,4.91,442,20.8,398,6.63,1
4,4,1.021,5.53,874,17.8,385,2.21,1


In [None]:
# Preview the first five rows of the test set
test_data.head(n=5)

Unnamed: 0,id,gravity,ph,osmo,cond,urea,calc
0,414,1.017,5.24,345,11.5,152,1.16
1,415,1.02,5.68,874,29.0,385,3.46
2,416,1.024,5.36,698,19.5,354,13.0
3,417,1.02,5.33,668,25.3,252,3.46
4,418,1.011,5.87,567,29.0,457,2.36


# <font color='00A9FF'>Data Cleaning for 'id'</font>

In [None]:
# Delete the ID column from the training data.
drop_column = ['id']

# Rebind instead of mutating in place, and ignore an already-missing column:
# the original in-place drop raised KeyError when this cell was run a second time.
train_data = train_data.drop(columns=drop_column, errors='ignore')
train_data.head()

Unnamed: 0,gravity,ph,osmo,cond,urea,calc,target
0,1.013,6.19,443,14.8,124,1.45,0
1,1.025,5.4,703,23.6,394,4.18,0
2,1.009,6.13,371,24.5,159,9.04,0
3,1.021,4.91,442,20.8,398,6.63,1
4,1.021,5.53,874,17.8,385,2.21,1


# <font color='00A9FF'>Evaluate XGBoost Classifier Models With Train and Test Sets</font><a class='anchor' id='top'></a>

<div class="alert alert-block alert-info">

📌 Note 1:

We implement a bare-minimum XGBClassifier model here, simply to get a feel for splitting the data into train and test sets and running a classifier on it.

</div>

<div class="alert alert-block alert-info">

📌 Note 2:

MinMaxScaler() in scikit-learn is used for data normalization (a.k.a. feature scaling). Data normalization is not necessary for decision trees. Since XGBoost is built from decision trees, it does not require normalization of its inputs either.

While decision trees have a natural resistance to outliers, boosted trees are susceptible, since new trees are built off the residual. Normalization, or even just a log transform, will give you better protection from outliers.

</div>

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Separate the label column (y) from the remaining feature columns (X)
y = train_data['target']
X = train_data.drop(columns=['target'])

# Hold out 24% of the rows for evaluation; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.24, random_state=7
)

# Fit a model with default settings on the training data
model = XGBClassifier()
model.fit(X_train, y_train)

# Predict the held-out rows and round every prediction to an integer
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))

# Evaluate the predictions against the true labels
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 68.00%


# <font color='00A9FF'>Save the model using pickle</font>

In [None]:
# Directory that the serialized model is written to
model_folder = '/content/kidney-stone-prediction/model'

# makedirs with exist_ok=True replaces the exists()/mkdir() pair: it also creates
# missing parent directories and does nothing when the folder is already there.
os.makedirs(model_folder, exist_ok=True)

In [1]:
import pickle

# Serialize the fitted classifier into the model folder
model_path = model_folder + '/model.plk'
with open(model_path, "wb") as model_file:
    pickle.dump(model, model_file)

AttributeError: ignored

# <font color='00A9FF'>Load the saved model</font>

In [None]:
# Read the pickled classifier back from the model folder
model_path = model_folder + '/model.plk'
with open(model_path, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# <font color='00A9FF'>Test the loaded model</font>

In [None]:
# Predict the same held-out rows with the reloaded model
new_y_pred = loaded_model.predict(X_test)
new_predictions = list(map(round, new_y_pred))
# Score the reloaded model's predictions against the true labels
accuracy = accuracy_score(y_test, new_predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 68.00%
