<a href="https://colab.research.google.com/github/HathawayQAQ/COMP551-Machine-Learning/blob/main/Assignment1/code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import pandas as pd

!pip install ucimlrepo
# Import the required library
from ucimlrepo import fetch_ucirepo


Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


# Task1: Acquire, preprocess, and analyze the data
## Data Preparation
1. Dataset 1: Infrared Thermography Temperature (regression): [link](https://archive.ics.uci.edu/dataset/925/infrared+thermography+temperature+dataset)
2. Dataset 2: CDC Diabetes Health Indicators (classification): [link](https://archive.ics.uci.edu/dataset/891/cdc+diabetes+health+indicators)

## Infrared Thermography Temperature

In [4]:
# Download the Infrared Thermography Temperature dataset (UCI repository id 925).
infrared_thermography_temperature = fetch_ucirepo(id=925)

# data (as pandas dataframes)
X = infrared_thermography_temperature.data.features
y = infrared_thermography_temperature.data.targets

# Keep dataset-specific handles as well: the CDC loading cell further down
# rebinds the generic names `X` and `y`, after which the infrared frames would
# otherwise be unreachable without fetching the dataset again.
X_infrared, y_infrared = X, y

# metadata
print(infrared_thermography_temperature.metadata)

# variable information
print(infrared_thermography_temperature.variables)


{'uci_id': 925, 'name': 'Infrared Thermography Temperature', 'repository_url': 'https://archive.ics.uci.edu/dataset/925/infrared+thermography+temperature+dataset', 'data_url': 'https://archive.ics.uci.edu/static/public/925/data.csv', 'abstract': 'The Infrared Thermography Temperature Dataset contains temperatures read from various locations of inferred images about patients, with the addition of oral temperatures measured for each individual. The 33 features consist of gender, age, ethnicity, ambiant temperature, humidity, distance, and other temperature readings from the thermal images. The dataset is intended to be used in a regression task to predict the oral temperature using the environment information as well as the thermal image readings. ', 'area': 'Health and Medicine', 'tasks': ['Regression'], 'characteristics': ['Tabular'], 'num_instances': 1020, 'num_features': 33, 'feature_types': ['Real', 'Categorical'], 'demographics': ['Gender', 'Age', 'Ethnicity'], 'target_col': ['aveO

## CDC Diabetes Health Indicators

In [5]:
# Download the CDC Diabetes Health Indicators dataset (UCI repository id 891).
cdc_diabetes_health_indicators = fetch_ucirepo(id=891)

# Features and targets (as pandas dataframes).
# NOTE: this rebinds `X` and `y`, replacing the infrared frames assigned by the
# earlier loading cell; every later cell that reads `X` / `y` therefore sees
# the CDC data when the notebook is run top to bottom.
X, y = (
    cdc_diabetes_health_indicators.data.features,
    cdc_diabetes_health_indicators.data.targets,
)

# Dataset metadata followed by the per-variable description table.
print(
    cdc_diabetes_health_indicators.metadata,
    cdc_diabetes_health_indicators.variables,
    sep="\n",
)


{'uci_id': 891, 'name': 'CDC Diabetes Health Indicators', 'repository_url': 'https://archive.ics.uci.edu/dataset/891/cdc+diabetes+health+indicators', 'data_url': 'https://archive.ics.uci.edu/static/public/891/data.csv', 'abstract': 'The Diabetes Health Indicators Dataset contains healthcare statistics and lifestyle survey information about people in general along with their diagnosis of diabetes. The 35 features consist of some demographics, lab test results, and answers to survey questions for each patient. The target variable for classification is whether a patient has diabetes, is pre-diabetic, or healthy. ', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Tabular', 'Multivariate'], 'num_instances': 253680, 'num_features': 21, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Sex', 'Age', 'Education Level', 'Income'], 'target_col': ['Diabetes_binary'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_

# Task2: Implement the Models

## 1. Linear Regression (Analytical Solution)

In [6]:
import numpy as np

class LinearRegression:
    """Least-squares linear regression with an intercept, solved analytically.

    The coefficients are obtained with ``np.linalg.lstsq`` rather than by
    explicitly inverting ``X_b.T @ X_b``: an explicit inverse raises
    ``LinAlgError`` when features are collinear (e.g. duplicated columns),
    whereas the least-squares solver returns the minimum-norm solution in
    that case.
    """

    def __init__(self):
        # Fitted coefficients; index 0 is the intercept. None until fit() runs.
        self.theta = None

    @staticmethod
    def _add_intercept(X):
        """Return ``X`` as a float array with a leading column of ones."""
        X = np.asarray(X, dtype=float)
        return np.c_[np.ones((X.shape[0], 1)), X]

    def fit(self, X, y):
        """Estimate the coefficients from training data.

        Parameters
        ----------
        X : array-like, one row per sample.
        y : array-like with one entry (or one row) per sample.

        After the call ``self.theta`` has one row per column of the
        intercept-augmented design matrix; it is 1-D when ``y`` is 1-D and
        2-D (one column per target) when ``y`` is 2-D.
        """
        X_b = self._add_intercept(X)
        targets = np.asarray(y, dtype=float)

        # Least-squares solution of X_b @ theta = y; rcond=None selects
        # NumPy's machine-precision based cutoff for small singular values.
        self.theta = np.linalg.lstsq(X_b, targets, rcond=None)[0]

    def predict(self, X):
        """Return the model's predictions for the rows of ``X``."""
        return self._add_intercept(X).dot(self.theta)


## 2. Logistic Regression (with Gradient Descent)

In [7]:
class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    learning_rate : float
        Step size applied to the gradient at every iteration.
    n_iter : int
        Number of gradient-descent iterations performed by ``fit``.
    """

    def __init__(self, learning_rate=0.01, n_iter=1000):
        self.learning_rate = learning_rate
        self.n_iter = n_iter
        # Fitted coefficients; index 0 is the intercept. None until fit() runs.
        self.theta = None

    def sigmoid(self, z):
        """Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

        Only ``exp(-|z|)`` is ever computed, which lies in (0, 1], so large
        negative inputs no longer overflow. For z >= 0 the expression is the
        textbook one; for z < 0 the algebraically equal form
        exp(z) / (1 + exp(z)) is used.
        """
        z = np.asarray(z, dtype=float)
        decay = np.exp(-np.abs(z))
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def cost_function(self, X, y):
        """Mean binary cross-entropy of the current ``theta`` on ``(X, y)``.

        ``X`` may be the raw feature matrix (the intercept column is then
        added here) or a design matrix that already carries the intercept
        column; the two are told apart by comparing the column count with
        the length of ``theta``. ``y`` may be 1-D or a single column.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        if X.shape[1] == self.theta.shape[0] - 1:
            X = np.c_[np.ones((X.shape[0], 1)), X]

        m = len(y)
        # Keep probabilities strictly inside (0, 1) so neither log is -inf.
        eps = np.finfo(float).eps
        predictions = np.clip(self.sigmoid(X.dot(self.theta)), eps, 1 - eps)
        cost = (-1 / m) * (y.dot(np.log(predictions)) + (1 - y).dot(np.log(1 - predictions)))
        return cost

    def fit(self, X, y):
        """Run ``n_iter`` gradient-descent steps starting from ``theta = 0``.

        ``y`` is flattened to 1-D first: a column-shaped target (an ``(m, 1)``
        array or a one-column DataFrame) would otherwise broadcast against
        the 1-D prediction vector instead of subtracting element-wise.

        Raises
        ------
        ValueError
            If ``y`` does not contain exactly one label per row of ``X``.
        """
        X = np.asarray(X, dtype=float)
        # Add intercept to X
        X_b = np.c_[np.ones((X.shape[0], 1)), X]
        m, n = X_b.shape

        y = np.asarray(y, dtype=float).ravel()
        if y.shape[0] != m:
            raise ValueError(
                f"y must contain one label per sample: got {y.shape[0]} labels for {m} samples"
            )

        self.theta = np.zeros(n)

        for _ in range(self.n_iter):
            gradients = (1 / m) * X_b.T.dot(self.sigmoid(X_b.dot(self.theta)) - y)
            self.theta -= self.learning_rate * gradients

    def predict(self, X):
        """Return a boolean array: True where the predicted probability is >= 0.5."""
        X = np.asarray(X, dtype=float)
        X_b = np.c_[np.ones((X.shape[0], 1)), X]  # Add intercept
        return self.sigmoid(X_b.dot(self.theta)) >= 0.5  # Returns True/False


## 3. Mini-Batch Stochastic Gradient Descent

In [8]:
class MiniBatchSGD:
    """Mini-batch stochastic gradient descent for linear or logistic models.

    Parameters
    ----------
    learning_rate : float
        Step size applied to every mini-batch gradient.
    n_iter : int
        Number of passes (epochs) over the training data.
    batch_size : int
        Number of samples per mini-batch; the last batch of an epoch may be
        smaller.
    random_state : int or None
        Seed for the per-epoch shuffle. With ``None`` (the default) the
        global ``np.random`` state is used, exactly as before this parameter
        existed.
    """

    # Values accepted for the `model_type` argument of fit() and predict().
    _MODEL_TYPES = ('linear', 'logistic')

    def __init__(self, learning_rate=0.01, n_iter=1000, batch_size=32, random_state=None):
        self.learning_rate = learning_rate
        self.n_iter = n_iter
        self.batch_size = batch_size
        self.random_state = random_state
        # Fitted coefficients; index 0 is the intercept. None until fit() runs.
        self.theta = None

    def _check_model_type(self, model_type):
        """Raise ValueError unless ``model_type`` is a supported model name."""
        if model_type not in self._MODEL_TYPES:
            raise ValueError(
                f"model_type must be one of {self._MODEL_TYPES}, got {model_type!r}"
            )

    def fit(self, X, y, model_type='linear'):
        """Fit ``theta`` by mini-batch SGD, starting from zeros.

        ``y`` is converted to a flat NumPy array before shuffling. Indexing a
        pandas object with the permutation (``y[indices]``) is label-based,
        so it fails or misaligns rows whenever the index is not 0..m-1 (for
        example after a train/test split), and a column-shaped target would
        broadcast incorrectly in the gradient.

        Raises
        ------
        ValueError
            If ``model_type`` is not supported, or ``y`` does not contain
            exactly one value per row of ``X``.
        """
        self._check_model_type(model_type)

        X = np.asarray(X, dtype=float)
        X_b = np.c_[np.ones((X.shape[0], 1)), X]  # Add intercept
        m, n = X_b.shape

        y = np.asarray(y, dtype=float).ravel()
        if y.shape[0] != m:
            raise ValueError(
                f"y must contain one value per sample: got {y.shape[0]} values for {m} samples"
            )

        self.theta = np.zeros(n)

        # Seeded generator when requested, otherwise the global NumPy state.
        if self.random_state is None:
            shuffler = np.random
        else:
            shuffler = np.random.RandomState(self.random_state)

        for _ in range(self.n_iter):
            # Fresh sample order every epoch.
            indices = shuffler.permutation(m)
            X_b_shuffled = X_b[indices]
            y_shuffled = y[indices]

            for start in range(0, m, self.batch_size):
                xi = X_b_shuffled[start:start + self.batch_size]
                yi = y_shuffled[start:start + self.batch_size]

                if model_type == 'linear':
                    # Gradient of the batch mean squared error.
                    gradients = 2 / len(xi) * xi.T.dot(xi.dot(self.theta) - yi)
                else:
                    # Gradient of the batch mean cross-entropy.
                    gradients = (1 / len(xi)) * xi.T.dot(self.sigmoid(xi.dot(self.theta)) - yi)

                self.theta -= self.learning_rate * gradients

    def predict(self, X, model_type='linear'):
        """Predict for the rows of ``X``.

        Returns real-valued predictions for ``'linear'`` and a boolean array
        (probability >= 0.5) for ``'logistic'``.

        Raises
        ------
        ValueError
            If ``model_type`` is not supported.
        """
        self._check_model_type(model_type)

        X = np.asarray(X, dtype=float)
        X_b = np.c_[np.ones((X.shape[0], 1)), X]
        if model_type == 'linear':
            return X_b.dot(self.theta)
        return self.sigmoid(X_b.dot(self.theta)) >= 0.5

    def sigmoid(self, z):
        """Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

        Only ``exp(-|z|)`` is computed, which lies in (0, 1]; for z < 0 the
        algebraically equal form exp(z) / (1 + exp(z)) is used.
        """
        z = np.asarray(z, dtype=float)
        decay = np.exp(-np.abs(z))
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))


# Task3: Run Experiments

## 1. Train/Test Split
80% for training and 20% for testing

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score

# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable. `X` and `y` are whichever dataset was bound most recently
# (the CDC data when the notebook is run top to bottom).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## 2. Linear Regression Performance

In [10]:
# Fit the analytical linear-regression model on the training split.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# Predict for the training rows first, then for the held-out rows.
y_train_pred, y_test_pred = (
    lin_reg.predict(features) for features in (X_train, X_test)
)

# Mean squared error on each split, in the same order.
train_mse, test_mse = (
    mean_squared_error(actual, predicted)
    for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
)

print(f"Linear Regression - Train MSE: {train_mse}")
print(f"Linear Regression - Test MSE: {test_mse}")


Linear Regression - Train MSE: 0.10075293892728054
Linear Regression - Test MSE: 0.09954843602165149


## 3. Logistic Regression Performance

In [None]:
# Fit logistic regression by gradient descent on the training split.
log_reg = LogisticRegression(learning_rate=0.01, n_iter=1000)
log_reg.fit(X_train, y_train)

# Predict labels for the training rows first, then for the held-out rows.
y_train_pred, y_test_pred = (
    log_reg.predict(features) for features in (X_train, X_test)
)

# Fraction of correctly classified rows on each split, in the same order.
train_accuracy, test_accuracy = (
    accuracy_score(actual, predicted)
    for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
)

print(f"Logistic Regression - Train Accuracy: {train_accuracy}")
print(f"Logistic Regression - Test Accuracy: {test_accuracy}")


## 4. Growing Training Subsets (20%, 30%, 40%,...,80%)

In [None]:
import matplotlib.pyplot as plt

# Fractions of the training split used to fit the model. The test split is
# the same for every fraction.
train_sizes = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
train_errors, test_errors = [], []

for train_size in train_sizes:
    # Keep only the requested fraction of the training split; the remainder
    # returned by the splitter is discarded.
    X_partial_train, _, y_partial_train, _ = train_test_split(
        X_train, y_train, train_size=train_size, random_state=42
    )

    # Refit the shared `lin_reg` instance on the subset, then score it on the
    # subset itself and on the held-out test split.
    lin_reg.fit(X_partial_train, y_partial_train)
    y_partial_train_pred = lin_reg.predict(X_partial_train)
    y_test_pred = lin_reg.predict(X_test)

    train_errors.append(mean_squared_error(y_partial_train, y_partial_train_pred))
    test_errors.append(mean_squared_error(y_test, y_test_pred))

# Plot both error curves against the subset fraction.
fig, ax = plt.subplots()
ax.plot(train_sizes, train_errors, label='Train MSE')
ax.plot(train_sizes, test_errors, label='Test MSE')
ax.set_xlabel('Training Set Size')
ax.set_ylabel('Mean Squared Error')
ax.set_title('Linear Regression Performance with Growing Training Set')
ax.legend()
plt.show()


## 5. Experiment with Mini-Batch Sizes

In [None]:
# Mini-batch sizes to compare; learning rate and epoch count are held fixed.
batch_sizes = [8, 16, 32, 64, 128]
train_mse_results, test_mse_results = [], []

for batch_size in batch_sizes:
    # A fresh optimiser per batch size, fitted as a linear model.
    mb_sgd = MiniBatchSGD(learning_rate=0.01, n_iter=1000, batch_size=batch_size)
    mb_sgd.fit(X_train, y_train, model_type='linear')

    # Score the fitted model on both splits.
    y_train_pred = mb_sgd.predict(X_train, model_type='linear')
    y_test_pred = mb_sgd.predict(X_test, model_type='linear')

    train_mse_results.append(mean_squared_error(y_train, y_train_pred))
    test_mse_results.append(mean_squared_error(y_test, y_test_pred))

# Plot both error curves against the batch size.
fig, ax = plt.subplots()
ax.plot(batch_sizes, train_mse_results, label='Train MSE')
ax.plot(batch_sizes, test_mse_results, label='Test MSE')
ax.set_xlabel('Batch Size')
ax.set_ylabel('Mean Squared Error')
ax.set_title('Performance with Different Mini-Batch Sizes')
ax.legend()
plt.show()


## 6. Vary Learning Rates

In [None]:
# Step sizes to compare; the iteration count is held fixed.
learning_rates = [0.001, 0.01, 0.1]
train_accuracies, test_accuracies = [], []

for lr in learning_rates:
    # A fresh classifier per learning rate.
    log_reg = LogisticRegression(learning_rate=lr, n_iter=1000)
    log_reg.fit(X_train, y_train)

    # Score the fitted classifier on both splits.
    y_train_pred = log_reg.predict(X_train)
    y_test_pred = log_reg.predict(X_test)

    train_accuracies.append(accuracy_score(y_train, y_train_pred))
    test_accuracies.append(accuracy_score(y_test, y_test_pred))

# Plot both accuracy curves against the learning rate.
fig, ax = plt.subplots()
ax.plot(learning_rates, train_accuracies, label='Train Accuracy')
ax.plot(learning_rates, test_accuracies, label='Test Accuracy')
ax.set_xlabel('Learning Rate')
ax.set_ylabel('Accuracy')
ax.set_title('Logistic Regression with Varying Learning Rates')
ax.legend()
plt.show()


## 7. Compare Analytical Linear Regression and Mini-Batch SGD

In [None]:
# Analytical solution for linear regression.
# The model is created here rather than reused from an earlier cell, so this
# cell does not depend on which experiment last refitted `lin_reg`.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred_analytical = lin_reg.predict(X_test)
mse_analytical = mean_squared_error(y_test, y_pred_analytical)

# Mini-Batch SGD for linear regression.
# Previously this reused the `mb_sgd` left over from the final iteration of
# the batch-size sweep (batch_size=128, learning_rate=0.01, n_iter=1000); the
# same hyperparameters are now stated explicitly so the comparison no longer
# relies on that cell having been run last.
mb_sgd = MiniBatchSGD(learning_rate=0.01, n_iter=1000, batch_size=128)
mb_sgd.fit(X_train, y_train, model_type='linear')
y_pred_sgd = mb_sgd.predict(X_test, model_type='linear')
mse_sgd = mean_squared_error(y_test, y_pred_sgd)

print(f"Analytical Linear Regression MSE: {mse_analytical}")
print(f"Mini-Batch SGD Linear Regression MSE: {mse_sgd}")
