**EC9630 Machine Learning - Laboratory 03**

**Task: NONLINEAR MODELS**

**Name: Lakshan W.G.**

**Reg No: 2020/E/079**

**Date: 02 Aug 2024**

**Time: 08.00 a.m**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

# Import necessary libraries
import pandas as pd

# Define the path to your dataset
file_path = '/content/drive/MyDrive/Labs/diabetes_012_health_indicators_BRFSS2015.csv'

# Load the dataset
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
data.head()



Mounted at /content/drive


Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


**Handling Missing values**

In [2]:
# Check for missing values
missing_values = data.isnull().sum()
print("Missing Values:\n", missing_values)

# Check data types of the variables
data_types = data.dtypes
print("Data Types:\n", data_types)


Missing Values:
 Diabetes_012            0
HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
dtype: int64
Data Types:
 Diabetes_012            float64
HighBP                  float64
HighChol                float64
CholCheck               float64
BMI                     float64
Smoker                  float64
Stroke                  float64
HeartDiseaseorAttack    float64
PhysActivity            float64
Fruits                  float64
Veggies                 float64
HvyAlcoholConsump       float64
A

### There is no missing values and categorical variables. Therefore we can skip this steps

### Divide the dataset into Input (X) and Target (y).

In [3]:
# Separate the features and the target variable
X = data.drop(columns=['Diabetes_012'])
y = data['Diabetes_012']

# Display the shapes of X and y
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)


Shape of X: (253680, 21)
Shape of y: (253680,)


### Create the training, validation and test datasets separately. (Use 70%, 20%, 10% respectively)

In [4]:
# Import the train_test_split function
from sklearn.model_selection import train_test_split

# Split the data into training (70%) and temporary (30%) datasets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

# Split the temporary dataset into validation (20% of total) and test (10% of total) datasets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.33, random_state=42)

# Display the shapes of the datasets
print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)


Shape of X_train: (177576, 21)
Shape of X_val: (50989, 21)
Shape of X_test: (25115, 21)


### Fit a nonlinear model using Radial Basis Functions (RBF) on the training data.

In [5]:
# Implement RBF kernel functions

from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDClassifier

# Create an RBF sampler
rbf_feature = RBFSampler(gamma=1, random_state=42)

# Transform the training data
X_train_features = rbf_feature.fit_transform(X_train)
X_val_features = rbf_feature.transform(X_val)
X_test_features = rbf_feature.transform(X_test)


In [6]:
# Train the model using appropriate optimization techniques

# Train an SGD classifier with the RBF features
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train_features, y_train)

# Validate the model
val_score = sgd_clf.score(X_val_features, y_val)
print("Validation Score with RBF Kernel:", val_score)


Validation Score with RBF Kernel: 0.8424169919002138


In [7]:
# Tune hyperparameters such as the number of basic functions and regularization strength

from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'rbfsampler__gamma': [0.1, 0.5, 1, 2, 5],
    'sgdclassifier__alpha': [0.0001, 0.001, 0.01, 0.1]
}

# Create a pipeline with RBF Sampler and SGD Classifier
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('rbfsampler', RBFSampler(random_state=42)),
    ('sgdclassifier', SGDClassifier(random_state=42))
])

# Perform Grid Search
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Cross-Validation Score:", best_score)


Best Parameters: {'rbfsampler__gamma': 0.1, 'sgdclassifier__alpha': 0.0001}
Best Cross-Validation Score: 0.8420225706176511


### Apply a Perceptron on this training data

---



In [8]:
from sklearn.linear_model import Perceptron

# Train a Perceptron
perceptron = Perceptron(random_state=42)
perceptron.fit(X_train, y_train)

# Validate the Perceptron model
val_score = perceptron.score(X_val, y_val)
print("Validation Score with Perceptron:", val_score)


Validation Score with Perceptron: 0.844574319951362


### Apply a Multi-Layer Perceptron (MLP) on the training data

In [9]:
# Design the architecture of the MLP network.

from sklearn.neural_network import MLPClassifier

# Train an MLP Classifier
mlp_clf = MLPClassifier(hidden_layer_sizes=(100,), max_iter=5, random_state=42)
mlp_clf.fit(X_train, y_train)

# Validate the MLP model
val_score = mlp_clf.score(X_val, y_val)
print("Validation Score with MLP:", val_score)


Validation Score with MLP: 0.8485163466630058




### Implement Forward Propagation and Backpropagation Algorithms handled internally by MLPClassifier.

### Train the Model using Gradient Descent or Its Variants handled internally by MLPClassifier.

In [10]:
# Experiment with different activation functions and network structures

from sklearn.neural_network import MLPClassifier

# Experiment with different activation functions and network structures
activation_functions = ['identity', 'logistic', 'tanh', 'relu']
hidden_layer_structures = [(50,), (100,), (50, 50), (100, 50, 25)]

for activation in activation_functions:
    for structure in hidden_layer_structures:
        print(f"Training MLP with activation={activation} and hidden_layer_sizes={structure}")
        mlp_clf = MLPClassifier(hidden_layer_sizes=structure, activation=activation, max_iter=5, random_state=42)
        mlp_clf.fit(X_train, y_train)
        val_score = mlp_clf.score(X_val, y_val)
        print(f"Validation Score with activation={activation} and hidden_layer_sizes={structure}: {val_score}")


Training MLP with activation=identity and hidden_layer_sizes=(50,)




Validation Score with activation=identity and hidden_layer_sizes=(50,): 0.8355331542097315
Training MLP with activation=identity and hidden_layer_sizes=(100,)




Validation Score with activation=identity and hidden_layer_sizes=(100,): 0.8448488889760537
Training MLP with activation=identity and hidden_layer_sizes=(50, 50)




Validation Score with activation=identity and hidden_layer_sizes=(50, 50): 0.8444370354390163
Training MLP with activation=identity and hidden_layer_sizes=(100, 50, 25)




Validation Score with activation=identity and hidden_layer_sizes=(100, 50, 25): 0.8424562160465983
Training MLP with activation=logistic and hidden_layer_sizes=(50,)




Validation Score with activation=logistic and hidden_layer_sizes=(50,): 0.8493204416638883
Training MLP with activation=logistic and hidden_layer_sizes=(100,)




Validation Score with activation=logistic and hidden_layer_sizes=(100,): 0.8486144070289671
Training MLP with activation=logistic and hidden_layer_sizes=(50, 50)




Validation Score with activation=logistic and hidden_layer_sizes=(50, 50): 0.8494185020298496
Training MLP with activation=logistic and hidden_layer_sizes=(100, 50, 25)




Validation Score with activation=logistic and hidden_layer_sizes=(100, 50, 25): 0.848771303614505
Training MLP with activation=tanh and hidden_layer_sizes=(50,)




Validation Score with activation=tanh and hidden_layer_sizes=(50,): 0.8479868206868149
Training MLP with activation=tanh and hidden_layer_sizes=(100,)




Validation Score with activation=tanh and hidden_layer_sizes=(100,): 0.8481241051991606
Training MLP with activation=tanh and hidden_layer_sizes=(50, 50)




Validation Score with activation=tanh and hidden_layer_sizes=(50, 50): 0.8473592343446625
Training MLP with activation=tanh and hidden_layer_sizes=(100, 50, 25)




Validation Score with activation=tanh and hidden_layer_sizes=(100, 50, 25): 0.8465943634901646
Training MLP with activation=relu and hidden_layer_sizes=(50,)




Validation Score with activation=relu and hidden_layer_sizes=(50,): 0.8462609582458962
Training MLP with activation=relu and hidden_layer_sizes=(100,)




Validation Score with activation=relu and hidden_layer_sizes=(100,): 0.8485163466630058
Training MLP with activation=relu and hidden_layer_sizes=(50, 50)




Validation Score with activation=relu and hidden_layer_sizes=(50, 50): 0.845554923610975
Training MLP with activation=relu and hidden_layer_sizes=(100, 50, 25)
Validation Score with activation=relu and hidden_layer_sizes=(100, 50, 25): 0.8453980270254369




In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50, 25)],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'learning_rate_init': [0.001, 0.01, 0.1],
    'batch_size': [16, 32, 64]
}

# Create an MLP Classifier
mlp_clf = MLPClassifier(max_iter=5, random_state=42)

# Perform Grid Search
grid_search = GridSearchCV(mlp_clf, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Cross-Validation Score:", best_score)

# Validate the best model on the validation set
best_model = grid_search.best_estimator_
val_score = best_model.score(X_val, y_val)
print("Validation Score with Best Model:", val_score)




### Evaluate the performance of the nonlinear models on the test data.