In [1]:
import numpy as np
import random

# Implementation of softmax 

In [16]:
#generate a list of random numbers of 5 elements
elem_list= np.array([random.randint(100,150) for i in range(5)])
print(f'The generated sequence: {elem_list}')

#To avoid expoding exponents, we will use a bit of mathematical manipulation
logits= elem_list - np.max(elem_list)
print(f'The modified sequence: {logits}')

# This would get all elements to (min-max(elem_list)) to 0
#Now we calculate the exponents
numerator= np.exp(logits)
denominator= sum(np.exp(logits))
print(f'e^x = {numerator}')
print(f'Sum of all e^x = {denominator}')

output_of_softmax= numerator/denominator
print(f'Softmax output: {output_of_softmax}')


# Couple of theory pointers-
# 1) Subtracting max gives the same output as not subtracting, as the -max(elem_list) power gets cut out with each other when we consider 
# numerator and denominator together. I.e e^x-c is there in both num and denom, so -c gets cut out 

# 2) The reason why we take exponents is so that we assign a unique value to each element. Compared to a simple subtraction equation/division, exponents would evenly spread the distribution, and numbers close to the extremes are assigned a probability very close to 0 or 1

The generated sequence: [111 150 101 126 134]
The modified sequence: [-39   0 -49 -24 -16]
e^x = [1.15482242e-17 1.00000000e+00 5.24288566e-22 3.77513454e-11
 1.12535175e-07]
Sum of all e^x = 1.000000112572926
Softmax output: [1.15482229e-17 9.99999887e-01 5.24288507e-22 3.77513412e-11
 1.12535162e-07]


# Implementation of Sigmoid

In [35]:
# According to knowledge:
# Sigmoid: 1/1+e^(-x)

# Input list
elem_list = np.array([-10000, -100, 100, 10000])
print(f'Input List: {elem_list}')

# Method 1: Single formula (unstable)
start_time = time.time()
exponents = np.exp(-elem_list)
sigmoid_values = 1 / (1 + exponents)
end_time = time.time()
print(f'The output of using a single formula: {sigmoid_values}')
print(f'Time taken for single formula: {end_time - start_time:.6f} seconds\n')

# Resolving the overflow-
# When x is -infinity, e^-x becomes too large and 1+ e^-x is also large. Hence, when we divide 1 by this large number, it underflows
# SO instead of this form of the equation, we use something which will not underflow or overflow, i.e. e^x/(1+e^x), which is just the expanded form of the original equation, but
# in this, when x tends to -infinity, numerator will underflow, but it is not a problem as denominator compensates for it
# Insane- even this will throw an error. That is because NumPy's where function does not work like an if else condition, it will calculate for both and then pick the one based condition

# Method 2: Using numpy.where (partially stable)
start_time = time.time()
sigmoid_values = np.where(
    elem_list < 0,
    np.exp(elem_list) / (1 + np.exp(elem_list)),
    1 / (1 + np.exp(-elem_list))
)
end_time = time.time()
print(f'Using numpy.where function for different formula for +/-ve elements: {sigmoid_values}')
print(f'Time taken for numpy.where: {end_time - start_time:.6f} seconds\n')

# Method 3: Using Pythonic if-else (stable)
start_time = time.time()
sigmoid_values = []
for i in elem_list:
    if i < 0:
        sigmoid_values.append(np.exp(i) / (1 + np.exp(i)))
    else:
        sigmoid_values.append(1 / (1 + np.exp(-i)))
sigmoid_values = np.array(sigmoid_values)
end_time = time.time()
print(f'Using split formula using if-else function: {sigmoid_values}')
print(f'Time taken for Pythonic if-else: {end_time - start_time:.6f} seconds\n')

# Note:
# 1) Sigmoid always tends to 0 or 1 but never touches
# 2) Sigmoid is softmax function for 2 classes only

Input List: [-10000   -100    100  10000]
The output of using a single formula: [0.00000000e+00 3.72007598e-44 1.00000000e+00 1.00000000e+00]
Time taken for single formula: 0.000128 seconds

Using numpy.where function for different formula for +/-ve elements: [0.00000000e+00 3.72007598e-44 1.00000000e+00 1.00000000e+00]
Time taken for numpy.where: 0.000134 seconds

Using split formula using if-else function: [0.00000000e+00 3.72007598e-44 1.00000000e+00 1.00000000e+00]
Time taken for Pythonic if-else: 0.000121 seconds



  exponents = np.exp(-elem_list)
  np.exp(elem_list) / (1 + np.exp(elem_list)),
  np.exp(elem_list) / (1 + np.exp(elem_list)),
  1 / (1 + np.exp(-elem_list))


# Implementation of Min max scaling

In [5]:
# The goal is to bring a set of elements to 0 to 1 range. Intuitively, we want to bring the lowest element to 0 and biggest element to 1
# So we subtract all elements by the minimum first, which will ensure the lowest element goes to 0. Then, to ensure the largest element goest to 1, we divide this
# numerator with max-min, as if x is the max element, to result in 1, we need to divide x - min by max-min

#Generating a random list of 5 elements
elem_list = np.array([random.randint(100,1000) for _ in range(5)])
print(f'The original list is: {elem_list}')

#Storing the min value so that we don't do double calculation
list_min= np.min(elem_list)

# Plugging into the formula
scaled_list= (elem_list- list_min)/(np.max(elem_list)-list_min)

print(f'The scaled list is: {np.round(scaled_list,3)}')

The original list is: [589 292 425 942 408]
The scaled list is: [0.457 0.    0.205 1.    0.178]


In [17]:
# Euclidean distance

# Euclidean distance is the most basic distance measure (after manhattan distance) that one could implement. This should be super quick

# Formula: sqrt( x distance squared + y distance squared)
# Oh but wait, for n dimensions, it would be (sum(xi-yi)^2)^0.5

# generate a 4 dimensional array
random_4d_array_1 = np.random.rand(1,1,1,2,2)
random_4d_array_2= np.random.rand(1,1,1,2,2)

print("Array 1 contents:\n", random_4d_array_1)
print("Array 2 contents:\n", random_4d_array_2)

euclidean_dist= np.sqrt(np.sum(np.square(random_4d_array_1-random_4d_array_2)))
print(f'The euclidean distance between these n-dim arrays is: {euclidean_dist}')

# print(arrays)

Array 1 contents:
 [[[[[0.9521847  0.27736915]
    [0.02765174 0.30297017]]]]]
Array 2 contents:
 [[[[[0.79863372 0.31426173]
    [0.80371021 0.96800607]]]]]
The euclidean distance between these n-dim arrays is: 1.0341559216259615


# Implementation of Mahalanobis distance

In [63]:


# THis is a fancy distance taught in data mining class and used mainly to accoud for correlation plust different scales between two variables. Let's try to implement
# it using numpy

# Some theory- Mahalanobis distance essentially measures the distance between two points relative to a dataset. Rather than relying on Euclidean estimates, 
# which can fail in a lot of cases, this is a more customized distance which helps to determine how far data points are based on correlation and scale of features

# Formula: Square root of (x-y (transpose) . inverse of covariance . x-y)

# generating n random 3d arrays:
n=5
random_4d_array = np.random.rand(n,3)

print("Array 1 contents:\n", random_4d_array)

# calculating covariance between the arrays (we do row_var=False as we want to consider our data set in the column format as covariance is between columns)
covariance= np.cov(random_4d_array, rowvar=False)
print(f'The covariance between all the features: {covariance}')

# Let's say we want to find distance between point 1 and point 2
point1 = random_4d_array[0]
point2 = random_4d_array[1]
diff_bw_input_arr= point1-point2

# Multiple learnings-
# In python: @ is for dot product
# np.linalg.inv is required for inverting a matrix since the covariance is not a fixed number
# There are two ways to transpose, .T or np.transpose()

mahalanobis_dist= np.sqrt(diff_bw_input_arr @ np.linalg.inv(covariance) @ diff_bw_input_arr)
print(f'The mahalanobis distance is: {mahalanobis_dist}')



# Use cases:
# outlier detection between financial transactions
# outlier detection in manufacturing (sensor data)
# outlier detection in patient health data


Array 1 contents:
 [[0.24866596 0.89507646 0.6725381 ]
 [0.51994643 0.32368479 0.9081165 ]
 [0.29786276 0.11547373 0.05686165]
 [0.79839683 0.87496723 0.7025378 ]
 [0.46081942 0.84149323 0.37804283]]
The covariance between all the features: [[0.0472313  0.02315534 0.03179219]
 [0.02315534 0.13289947 0.04422086]
 [0.03179219 0.04422086 0.10977048]]
The mahalanobis distance is: 2.495460926259866


# Implementation of Cross Entropy Loss

In [6]:
# Cross entropy loss fits well with classification problems and deep learning networks, as they couple with softmax outputs
# The reason is that cross entropy penalizes wrong predictions significantly (higher the different between actual and predicted output, higher the amplification 
# in the loss function. This amplification happens because of the log function in the equation. This is also why cross entropy allows faster training, as amplified gradients speed up the process

# Formula: - sum( actual output value (y) x log predicted probability (y_hat)

# Generating 2 arrays, one with the actual output, one with the predicted output
actual_output= [0, 0, 1, 0]
predicted_output = [0.8, 0.1, 0.05, 0.05]

# One drawback of the normal formula is that when probability tends to 0, the log function can go really low and the loss value can shoot up a lot
# So we keep a cap
predicted_output= np.clip(predicted_output, a_max= 1, a_min=1e-12)

# Loss calculation
cross_entropy_loss = -(np.sum(np.array(actual_output) * np.log(np.array(predicted_output))))
print(cross_entropy_loss)

2.995732273553991


# Gradient Descent/Stochastic Gradient Descent

In [27]:
# Gradient descent is relatively simple but super powerful
# However, implementation gets complex one we have multiple features 

import numpy as np

# Generating a random dataset (3 samples, 1 feature)
data = np.random.rand(3,2)
print(f'Original Data: \n{data}')

x = data[:, 0].reshape(-1, 1)  # Shape (3,1)
y = data[:, 1]  # Shape (3,)

# weight and bias
W = np.random.rand(1, 1) 
bias = 0.2
print(f'Old weights: {W}')

# Predicted output
y_pred = (x @ W) + bias

# Mean Squared Error (MSE)
mse = np.mean(np.square(y - y_pred))
print(f'Mean Squared Error: {mse}')

# Compute gradient of MSE w.r.t. W
gradient_W = -(2/len(x)) * np.sum(x * (y - y_pred))  # Correct formula
gradient_b = -(2/len(x)) * np.sum(y - y_pred)  # Gradient for bias

print(f'The gradient w.r.t W: {gradient_W}')
print(f'The gradient w.r.t bias: {gradient_b}')

# Gradient Descent Step
learning_rate = 0.001
W_new = W - learning_rate * gradient_W  # Update weight
bias_new = bias - learning_rate * gradient_b  # Update bias

print(f'New weights: {W_new}')
print(f'New bias: {bias_new}')



Original Data: 
[[0.67298596 0.38498319]
 [0.10279585 0.25651577]
 [0.49949157 0.52126978]]
Old weights: [[0.98223625]]
Mean Squared Error: 0.11951138274323655
The gradient w.r.t W: 0.9221518240504133
The gradient w.r.t bias: 1.3797019884386448
New weights: [[0.9813141]]
New bias: 0.19862029801156136
