<a href="https://colab.research.google.com/github/Mulat-K/Machine-Learning-Mastery-with-Python/blob/main/EPMLAWR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Evaluate the Performance of Machine
 Learning Algorithms with Resampling**

# **Split into Train and Test Sets**

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Load dataset
filename = '/content/sample_data/pima-indians-diabetes.data.csv'
column_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pd.read_csv(filename, names=column_names, header=None)

# Ensure all values are numeric and handle any missing data
dataframe = dataframe.apply(pd.to_numeric, errors='coerce')
dataframe.dropna(inplace=True)

# Split into input (X) and output (Y)
X = dataframe.iloc[:, 0:8].values
Y = dataframe.iloc[:, 8].values

# Create train/test split
test_size = 0.33
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

# Train and evaluate Logistic Regression model
model = LogisticRegression(max_iter=200)
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)

# Output accuracy
print(f"Accuracy: {result * 100:.3f}%")

Accuracy: 78.740%


# ***K-fold Cross-Validation***

In [4]:
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression

# Load dataset
filename = '/content/sample_data/pima-indians-diabetes.data.csv'
column_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pd.read_csv(filename, names=column_names, header=None)

# Ensure all values are numeric and drop missing values
dataframe = dataframe.apply(pd.to_numeric, errors='coerce')
dataframe.dropna(inplace=True)

# Separate into input and output
X = dataframe.iloc[:, 0:8].values
Y = dataframe.iloc[:, 8].values

# Setup cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
model = LogisticRegression(max_iter=200)

# Evaluate model
results = cross_val_score(model, X, Y, cv=kfold)
print(f"Accuracy: {results.mean() * 100:.3f}% ({results.std() * 100:.3f}%)")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 77.216% (4.968%)


# **Leave One Out Cross-Validation**

In [5]:
import pandas as pd
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.linear_model import LogisticRegression

# Load dataset
filename = '/content/sample_data/pima-indians-diabetes.data.csv'
column_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pd.read_csv(filename, names=column_names, header=None)

# Ensure all values are numeric and handle missing values
dataframe = dataframe.apply(pd.to_numeric, errors='coerce')
dataframe.dropna(inplace=True)

# Split into input (X) and output (Y)
X = dataframe.iloc[:, 0:8].values
Y = dataframe.iloc[:, 8].values

# Setup Leave-One-Out Cross Validation (LOOCV)
loocv = LeaveOneOut()
model = LogisticRegression(max_iter=200)

# Evaluate the model using LOOCV
results = cross_val_score(model, X, Y, cv=loocv)
print(f"Accuracy: {results.mean() * 100:.3f}% ({results.std() * 100:.3f}%)")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracy: 77.604% (41.689%)


# **Repeated Random Test-Train Splits**

In [6]:
import pandas as pd
from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.linear_model import LogisticRegression

# Load dataset
filename = '/content/sample_data/pima-indians-diabetes.data.csv'
column_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pd.read_csv(filename, names=column_names, header=None)

# Ensure all values are numeric and handle missing values
dataframe = dataframe.apply(pd.to_numeric, errors='coerce')
dataframe.dropna(inplace=True)

# Split into input (X) and output (Y)
X = dataframe.iloc[:, 0:8].values
Y = dataframe.iloc[:, 8].values

# Setup Shuffle Split Cross Validation
n_splits = 10
test_size = 0.33
seed = 7
shuffle_split = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=seed)

# Initialize the model
model = LogisticRegression(max_iter=200)

# Evaluate the model using Shuffle Split Cross Validation
results = cross_val_score(model, X, Y, cv=shuffle_split)
print(f"Accuracy: {results.mean() * 100:.3f}% ({results.std() * 100:.3f}%)")

Accuracy: 76.535% (2.235%)
