In [None]:
# Author: Kevin Richard 
# TensorFlow spam filter
# Support Vector Machine (SVM) model

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder, StandardScaler


In [None]:
#Download and import the spambase.data from the url.
!wget = https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data
filename = 'spambase.data'

--2023-02-09 01:47:02--  http://=/
Resolving = (=)... failed: Name or service not known.
wget: unable to resolve host address ‘=’
--2023-02-09 01:47:02--  https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 702942 (686K) [application/x-httpd-php]
Saving to: ‘spambase.data’


2023-02-09 01:47:03 (850 KB/s) - ‘spambase.data’ saved [702942/702942]

FINISHED --2023-02-09 01:47:03--
Total wall clock time: 1.4s
Downloaded: 1 files, 686K in 0.8s (850 KB/s)


In [None]:
# from google.colab import files
# files.upload()  # Choose File "spambase.data"

In [None]:
# Training and testing data allocation.
testRatio = 0.5
dataset = np.loadtxt(filename, dtype='float32', delimiter=',')
target = dataset[:, -1]      # last column is used as the class label
labels = LabelEncoder().fit_transform(target)
features = dataset[:, 0:-1]  # every other column is a feature

# Split BEFORE scaling: the scaler must only see training rows, otherwise the
# test set's mean/variance leak into the training features.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    features, labels, test_size=testRatio, random_state=0)

scaler = StandardScaler().fit(x_train_raw)
x_train = scaler.transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

# Whole dataset standardised with the training statistics (kept for
# compatibility with code that refers to feature_std).
feature_std = scaler.transform(features)

In [None]:
# Report how many rows and feature columns the train and test splits contain.
(train_size, num_features), (test_size, test_features) = x_train.shape, x_test.shape

print(
    f"Train data size : {train_size}",
    f"Number of Features : {num_features}",
    f"Test data size : {test_size}",
    f"Number of Features : {test_features}",
    sep="\n",
)

Train data size : 2300
Number of Features : 57
Test data size : 2301
Number of Features : 57


In [None]:
#Support Vector Model
class SupportVectorModel(object):
    """Linear classifier trained with a regularised hinge loss (linear SVM).

    The model computes raw decision scores ``X @ W + b``; the sign of a score
    is the predicted side of the separating hyperplane.
    """

    def __init__(self, num_features):
        """Create randomly initialised parameters.

        Args:
            num_features: number of input columns; W has shape
                [num_features, 1] and b has shape [1, 1].
        """
        self.W = tf.Variable(tf.random.normal(shape=[num_features, 1]))
        self.b = tf.Variable(tf.random.normal(shape=[1, 1]))
        # Weight applied to the regularisation term in compute_loss.
        self.C = 0.001

    def __call__(self, X):
        """Shorthand for ``likelihood(X)``."""
        return self.likelihood(X)

    def likelihood(self, X):
        """Return the raw decision scores ``X @ W + b`` (not probabilities)."""
        raw_Y = tf.matmul(X, self.W) + self.b
        return raw_Y

    def predict(self, X):
        """Return the raw decision scores for X; threshold at 0 to get a class."""
        return self.likelihood(X)

    # the loss function
    def compute_loss(self, y_pred, y_true):
        """Regularised hinge loss: ``C * 0.5 * ||W||^2 + mean(max(0, 1 - y * f(x)))``.

        Args:
            y_pred: raw decision scores, as returned by ``likelihood``.
            y_true: labels encoded as -1 / +1, with as many elements as y_pred.

        Returns:
            Scalar tensor holding the loss.
        """
        y_pred = tf.convert_to_tensor(y_pred)
        # Labels may arrive as an integer NumPy array; bring them to the dtype
        # and shape of the scores so the product is element-wise (a 1-D label
        # vector against [N, 1] scores would otherwise broadcast to [N, N]).
        y_true = tf.reshape(tf.cast(y_true, y_pred.dtype), tf.shape(y_pred))

        # tf.matrix_square_root is not available in TensorFlow 2; use the
        # standard squared L2 norm of the weights as the regulariser.
        c_loss = 0.5 * tf.reduce_sum(tf.square(self.W))
        hinge_loss = tf.maximum(0.0, 1.0 - y_true * y_pred)

        return self.C * c_loss + tf.reduce_mean(hinge_loss)


In [None]:
# accuracy function
def accuracy(Output_values, y_data):
  """Fraction of samples whose thresholded score matches its label.

  A score >= 0 is counted as class 1, anything else as class 0.

  Args:
    Output_values: decision scores (tensor, array or nested list); flattened
      to one value per sample.
    y_data: true labels encoded as 0 / 1, one per sample.

  Returns:
    Accuracy in [0, 1] as a Python float.

  Raises:
    ValueError: if the number of scores and labels differ.
    ZeroDivisionError: if y_data is empty.
  """
  scores = np.asarray(Output_values).reshape(-1)
  labels = np.asarray(y_data).reshape(-1)  # compare with the argument, not a global
  if scores.size != labels.size:
    raise ValueError(
        f"got {scores.size} predictions but {labels.size} labels")

  predicted = np.where(scores >= 0, 1, 0)
  correct = int(np.count_nonzero(predicted == labels))
  return correct / len(labels)

In [None]:
# Define a training loop:
def training_loop(epochs, BATCH_SIZE, learning_rate, log_every=10):
  """Train a fresh SupportVectorModel with mini-batch gradient descent.

  Reads the notebook-level x_train / y_train / x_test / y_test arrays and the
  train_size / num_features values.

  Args:
    epochs: iterable of integer step indices (e.g. ``range(500)``); one
      mini-batch update is performed per element.
    BATCH_SIZE: number of training rows drawn (with replacement) per step.
    learning_rate: step size applied to the gradients.
    log_every: print and record the loss whenever ``i % log_every == 0``.

  Returns:
    Tuple ``(test_accuracy, epoch_plot, loss_plot)`` where the two lists hold
    the recorded step indices and the loss values at those steps.
  """
  epoch_plot = []
  loss_plot = []
  model = SupportVectorModel(num_features)
  for i in epochs:
    # Draw BATCH_SIZE random row indices (np.random.choice samples with replacement).
    rand_index = np.random.choice(train_size, size=BATCH_SIZE)
    X = x_train[rand_index]

    # The hinge loss expects labels in {-1, +1}; store them as a float32
    # column so they multiply cleanly with the [BATCH_SIZE, 1] scores.
    Y_label = np.where(y_train[rand_index] <= 0, -1.0, 1.0)
    Y_label = Y_label.astype(np.float32).reshape(-1, 1)

    with tf.GradientTape() as tape:
      loss = model.compute_loss(model(X), Y_label)

    dW, db = tape.gradient(loss, [model.W, model.b])
    model.W.assign_sub(learning_rate * dW)
    model.b.assign_sub(learning_rate * db)

    # Print and record the loss every `log_every` steps.
    if i % log_every == 0:
      print("=> epoch %2d: loss= %.2f" %(i, loss.numpy()))
      epoch_plot.append(i)
      loss_plot.append(loss.numpy())

  # Training is finished: score the held-out test set. The model returns raw
  # decision scores, which accuracy() thresholds at 0.
  predicted_values = model.predict(x_test)

  # Compute the accuracy once and reuse it for both the printout and the return value.
  test_accuracy = accuracy(predicted_values, y_test)
  print("accuracy:", test_accuracy)
  return test_accuracy, epoch_plot, loss_plot


In [None]:
# epoch vs loss plot
def epoch_loss_plot(epoch_plot1, loss_plot1, lr_1, epoch_plot2, loss_plot2, lr_2, epoch_plot3, loss_plot3, lr_3):
  """Draw three loss-vs-epoch curves on one figure, one per learning rate.

  Each (epoch_plotN, loss_plotN, lr_N) triple supplies the x values, y values
  and legend label of one curve; the curves are blue, orange and green in
  argument order.
  """
  curves = (
      (epoch_plot1, loss_plot1, lr_1, 'blue'),
      (epoch_plot2, loss_plot2, lr_2, 'orange'),
      (epoch_plot3, loss_plot3, lr_3, 'green'),
  )
  for steps, losses, rate, colour in curves:
    plt.plot(steps, losses, label = f'{rate}', linewidth=2, color=colour)

  plt.xlabel('epoch')
  plt.ylabel("loss")
  plt.title("epoch vs loss")
  plt.locator_params(axis='x', nbins=12)
  plt.legend(title = 'Learning Rate')
  plt.show()

In [None]:
# Train three models that differ only in learning rate, then plot their
# epoch-vs-loss curves together.

ep_1, bs_1, lr_1 = 500, 100, 1000
ep_2, bs_2, lr_2 = 500, 100, 100
ep_3, bs_3, lr_3 = 500, 100, 1

# Runs are executed in the order 1, 2, 3.
run_settings = ((ep_1, bs_1, lr_1), (ep_2, bs_2, lr_2), (ep_3, bs_3, lr_3))
run_results = [training_loop(range(n_steps), batch, rate)
               for n_steps, batch, rate in run_settings]

(accuracy1, epoch_plot1, loss_plot1), \
    (accuracy2, epoch_plot2, loss_plot2), \
    (accuracy3, epoch_plot3, loss_plot3) = run_results

epoch_loss_plot(epoch_plot1, loss_plot1, lr_1,
                epoch_plot2, loss_plot2, lr_2,
                epoch_plot3, loss_plot3, lr_3)


In [None]:
#Learning rate vs accuracy plot

def lr_accuracy_plot(accuracies, epos):
  """Plot accuracy (y axis) against the values in `epos` (x axis, labelled 'lr')."""
  axes = plt.gca()
  axes.plot(epos, accuracies, linewidth=2, color='blue')
  axes.set_xlabel('lr')
  axes.set_ylabel("accuracy")
  axes.set_title("lr vs accuracy")
  axes.locator_params(axis='x', nbins=12)
  plt.show()

In [None]:
# Learning rate vs test accuracy for the SVM, over a sweep of learning rates.

accuracies=[]
lrs = []
epo = 1000
lr = 1
for i in range(10):
  # The increment itself grows by 10 each pass, so the rates tried are
  # 1, 11, 31, 61, 101, 151, 211, 281, 361, 451.
  lr = lr + i*10
  acc, _ , _ = training_loop(range(epo),100,lr)
  accuracies.append(acc)
  lrs.append(lr)

# The model trained above is the hinge-loss SVM, not logistic regression.
print("Support Vector Machine")
lr_accuracy_plot(accuracies, lrs )



