## Model Validation Techniques

In [1]:
import warnings
warnings.filterwarnings('ignore')

### 1. Evaluate using a train and a test split

In [3]:
from pandas import read_csv
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
filename='diabetes.csv'
data = read_csv(filename)
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [13]:
data.shape

(768, 9)

In [15]:
array= data.values
x = array[:,0:8]
y = array[:,8]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
model = LogisticRegression()
model.fit(x_train, y_train)
train_result = model.score(x_train, y_train)
test_result = model.score(x_test, y_test)

In [16]:
train_result*100, test_result*100

(76.35009310986965, 77.92207792207793)

In [17]:
array= data.values
x = array[:,0:8]
y = array[:,8]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)
model = LogisticRegression()
model.fit(x_train, y_train)
train_result = model.score(x_train, y_train)
test_result = model.score(x_test, y_test)

In [18]:
train_result*100, test_result*100

(77.28119180633148, 78.35497835497836)

In [19]:
array= data.values
x = array[:,0:8]
y = array[:,8]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2)
model = LogisticRegression()
model.fit(x_train, y_train)
train_result = model.score(x_train, y_train)
test_result = model.score(x_test, y_test)

In [20]:
train_result*100, test_result*100

(78.58472998137802, 76.19047619047619)

In [21]:
array= data.values
x = array[:,0:8]
y = array[:,8]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=3)
model = LogisticRegression()
model.fit(x_train, y_train)
train_result = model.score(x_train, y_train)
test_result = model.score(x_test, y_test)

In [22]:
train_result*100, test_result*100

(79.14338919925513, 75.32467532467533)

##### As you can see as the randome state value changes the accuracy changes every time you run the code so this cannot be a good method to use as we cannot control the randomness of the selection of data from the dataset for the training and testing size

### 2. Evaluate using Cross Validation

In [23]:
from sklearn.model_selection import KFold, cross_val_score
kfold = KFold(n_splits=10, shuffle = True, random_state=42)
model = LogisticRegression(max_iter=400)
results = cross_val_score(model, x, y , cv = kfold)

In [24]:
results

array([0.7012987 , 0.80519481, 0.72727273, 0.84415584, 0.83116883,
       0.67532468, 0.85714286, 0.77922078, 0.69736842, 0.78947368])

In [26]:
results.mean()*100

77.0762132604238

In [27]:
results.var()*100

0.39315957318574507

In [28]:
results.std()*100

6.270243800569042

##### when Variance in scores is low it indicates that the model can perform or predict well on unseen data but the opposite indicates that the model is dependent on training data and will not predict good results on testing data

### 3. Evaluate using Leave One Out Validation

In [29]:
from sklearn.model_selection import LeaveOneOut
loocv = LeaveOneOut()
model = LogisticRegression(max_iter=300)
results = cross_val_score(model, x , y , cv = loocv)

In [35]:
results

array([1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 1., 0., 1., 1., 0., 0.,
       0., 1., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1.,
       1., 1., 0., 0., 0., 1., 0., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1.,
       1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 0., 1.,
       1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 0., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1.,
       1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1.,
       0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1.,
       0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
       0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 1.,
       1., 1., 0., 1., 1.

In [30]:
results.mean()*100

77.60416666666666

In [37]:
results.std()*100

41.68944689773287

In [38]:
results.var()*100

17.380099826388886