## Task 1: Dataset Selection

Data Preprocessing:
- Use visualizations (histograms, scatter plots, bar graphs, etc.) to understand the
distribution of features and identify any potential patterns/dependencies or
outliers.
- Identify the data types of each feature (numeric, categorical, text, etc.). For
numeric data, show its characteristics like mean, median, standard deviation, etc.
- Identify and handle missing values (null values) in the data. This could involve
removing rows with missing values, fixing missing values with appropriate
strategies, etc.

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import random
import math

### Loading Data

In [7]:
# Read the dataset from a CSV file expected in the current working directory.
df = pd.read_csv("HCV-Egy-Data.csv")
# Bare expression: in a notebook this renders the frame as the cell output.
df

Unnamed: 0,Age,Gender,BMI,Fever,Nausea/Vomting,Headache,Diarrhea,Fatigue & generalized bone ache,Jaundice,Epigastric pain,...,ALT 36,ALT 48,ALT after 24 w,RNA Base,RNA 4,RNA 12,RNA EOT,RNA EF,Baseline histological Grading,Baselinehistological staging
0,56,1,35,2,1,1,1,2,2,2,...,5,5,5,655330,634536,288194,5,5,13,2
1,46,1,29,1,2,2,1,2,2,1,...,57,123,44,40620,538635,637056,336804,31085,4,2
2,57,1,33,2,2,2,2,1,1,1,...,5,5,5,571148,661346,5,735945,558829,4,4
3,49,2,33,1,2,1,2,1,2,1,...,48,77,33,1041941,449939,585688,744463,582301,10,3
4,59,1,32,1,1,2,1,2,2,2,...,94,90,30,660410,738756,3731527,338946,242861,11,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1380,44,1,29,1,2,2,2,1,1,1,...,63,44,45,387795,55938,5,5,5,15,4
1381,55,1,34,1,2,2,1,1,1,1,...,97,64,41,481378,152961,393339,73574,236273,10,2
1382,42,1,26,2,2,1,1,1,2,1,...,87,39,24,612664,572756,806109,343719,160457,6,2
1383,52,1,29,2,1,1,2,2,2,1,...,48,81,43,139872,76161,515730,2460,696074,15,3


In [9]:
class Neuron:
    """A single sigmoid unit with randomly initialised weights and bias.

    The bias is drawn uniformly from [-5, 5] and each weight from [-10, 10].
    """

    def __init__(self, number_of_inputs,inputs):
        """Create a neuron.

        Args:
            number_of_inputs: How many weights the neuron owns.
            inputs: Initial input vector (may be empty; see ``setInputs``).
        """
        self.number_of_inputs = number_of_inputs
        self.inputs = inputs
        # Bias is drawn before the weights so seeded runs stay reproducible.
        self.bias = random.uniform(-5, 5)
        self.weights = [random.uniform(-10, 10) for _ in range(number_of_inputs)]

    def setInputs(self,inputs):
        """Replace the input vector used by the next evaluation."""
        self.inputs = inputs

    def Z(self):
        """Return the pre-activation: sum(weights[i] * inputs[i]) + bias."""
        z = 0
        for i in range(self.number_of_inputs):
            z += self.weights[i] * self.inputs[i]
        z += self.bias
        return z

    def sigmoid(self):
        """Return the logistic activation of ``Z()``.

        Uses the algebraically equivalent form ``e^z / (1 + e^z)`` for
        negative z, because ``math.exp(-z)`` raises OverflowError once z is
        strongly negative (roughly below -709).
        """
        z = self.Z()
        if z >= 0:
            return 1 / (1 + math.exp(-z))
        growth = math.exp(z)
        return growth / (1 + growth)

    def differentiationOfSigmoid(self):
        """Return the derivative of the activation w.r.t. Z: s * (1 - s)."""
        activation = self.sigmoid()  # evaluated once instead of twice
        return activation * (1 - activation)

    def differentiationOfZ(self,target):
        """Return the partial derivative of Z w.r.t. one weight or one input.

        Args:
            target: A letter followed by a 1-based index, e.g. ``"w3"``,
                ``"x5"`` or ``"y8"``. ``w`` selects a weight (the derivative is
                the matching input); ``x``/``y`` select an input (the
                derivative is the matching weight).

        Raises:
            ValueError: If the letter is not one of w, x, y (or the index is
                not an integer).
            IndexError: If the index is outside ``1..number_of_inputs``.
        """
        index = int(target[1:]) - 1
        if not 0 <= index < self.number_of_inputs:
            # Index 0 would otherwise silently wrap around to the last element.
            raise IndexError(f"index out of range in target {target!r}")
        kind = target.lower()
        if "w" in kind:
            return self.inputs[index]
        if "y" in kind or "x" in kind:
            return self.weights[index]
        raise ValueError(f"unknown differentiation target {target!r}")

### Data Exploration

In [None]:
# Draw one histogram per column of df on a single large figure.
df.hist(figsize=(40,20))

In [None]:
# Large canvas so the annotated cells of the heatmap stay readable.
plt.figure(figsize=(28, 28))  # Adjust the width and height as needed

# Pairwise correlation matrix of df, each cell annotated with two decimals.
sns.heatmap(df.corr(), cmap='Reds', annot=True, fmt='.2f')

In [None]:
# Count the missing (null) entries in each column.
df.isnull().sum()

In [None]:
# One density subplot per column of df. The count is taken from the frame
# itself so the cell keeps working if columns are added or dropped.
num_features = len(df.columns)
# squeeze=False always returns a 2-D array of Axes (even for one column);
# ravel() flattens it so axes[i] is a single Axes object.
fig, axes = plt.subplots(nrows=num_features, ncols=1, figsize=(10, 5 * num_features), squeeze=False)
axes = axes.ravel()

# Loop through each column to create its density (PDF) plot
for i, column in enumerate(df.columns):
    ax = axes[i]

    # Plotting the density
    df[column].plot(kind='density', ax=ax, color='blue', alpha=0.5, label='PDF')

    # Calculate mean, median, and mode (first mode when there are several)
    mean = df[column].mean()
    median = df[column].median()
    mode = df[column].mode()[0]

    # Marking mean, median, and mode with vertical dashed lines
    ax.axvline(x=mean, color='red', linestyle='--', label=f'Mean: {mean:.2f}')
    ax.axvline(x=median, color='green', linestyle='--', label=f'Median: {median:.2f}')
    ax.axvline(x=mode, color='purple', linestyle='--', label=f'Mode: {mode:.2f}')

    # Adding titles and labels
    ax.set_title(f'Probability Density Function for {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Density')
    ax.legend()
    ax.grid()

# Adjust layout and show the plot
plt.tight_layout()
plt.show()

In [None]:
# Remove the row limit on pandas output so every entry is shown.
# This changes a global pandas display option for the rest of the session.
pd.set_option('display.max_rows', None)
# Data type of each column.
df.dtypes

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

In [None]:
# Print column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# One box plot per column, each with its own axes (no shared scales),
# arranged on a 10 x 3 grid, to make outliers visible.
df.plot(kind='box', subplots=True, sharex=False, sharey=False, layout=(10,3), figsize=(15,25))

In [None]:
# Drop rows lying outside the 1.5 * IQR fences, first for 'ALT after 24 w'
# and then, on the already filtered frame, for 'RNA 12'.

# --- 'ALT after 24 w' ---
Q1, Q3 = np.percentile(df['ALT after 24 w'], [25, 75])
IQR = Q3 - Q1
df = df[df['ALT after 24 w'].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)]

# --- 'RNA 12' (quartiles are recomputed on the filtered data) ---
Q1, Q3 = np.percentile(df['RNA 12'], [25, 75])
IQR = Q3 - Q1
df = df[df['RNA 12'].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)]

# Redraw the per-column box plots to inspect the effect of the filtering.
df.plot(kind='box', subplots=True, sharex=False, sharey=False, layout=(10,3), figsize=(15,25))

# Task 2: Update NN weights using Genetic Algorithm

### Shuffle and split data

In [8]:
# Split df into features/target and then into train / validation / test sets.
#
# Names defined by this cell:
#   shuffled_data            - a row-shuffled copy of df
#   X, Y                     - feature matrix (all columns but the last) and
#                              target vector (last column)
#   X_train, Y_train         - training part (70 %)
#   m_test, my_test          - intermediate hold-out part (30 %)
#   X_validate, Y_validate   - 2/3 of the hold-out part
#   X_test, Y_test           - remaining 1/3 of the hold-out part
#
# NOTE(review): shuffled_data is not used below; X and Y are sliced from df.
shuffled_data = df.sample(frac=1)

X = df.iloc[:, :-1].values
Y = df.iloc[:, -1].values

# Fixed random_state keeps both splits reproducible between runs.
X_train, m_test, Y_train, my_test = train_test_split(
    X, Y, test_size=0.30, random_state=0
)

X_validate, X_test, Y_validate, Y_test = train_test_split(
    m_test, my_test, test_size=1/3, random_state=0
)


### Build Neural Network

In [None]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x) of a scalar or array.

    Only e^-|x| is ever evaluated, so large-magnitude inputs cannot overflow:
    for x >= 0 the result is 1 / (1 + e^-x); for x < 0 it is the equivalent
    e^x / (1 + e^x).

    Args:
        x: A number or array-like of numbers.

    Returns:
        The element-wise sigmoid, with the same shape as ``x``.
    """
    x = np.asarray(x, dtype=float)
    decay = np.exp(-np.abs(x))  # always in (0, 1]
    return np.where(x >= 0, 1.0, decay) / (1.0 + decay)

# sigmoid is already element-wise, so no np.vectorize wrapper (a Python-level
# loop) is needed; the name is kept for existing callers.
sigs = sigmoid

def targetToVector(target):
    """
    Build the length-4 one-hot list for a class label.

    Args:
        target: Class label; labels 1..4 map to positions 0..3.

    Returns:
        list: Four integers, 1 at the position whose label equals ``target``
        and 0 elsewhere (all zeros when ``target`` matches none of 1..4).

    Example:
        >>> targetToVector(3)
        [0, 0, 1, 0]
    """
    oneHot = []
    for classLabel in range(1, 5):
        oneHot.append(1 if classLabel == target else 0)
    return oneHot

def fitness(predictedVector, targetVector):
    """
    Return the summed absolute difference between prediction and target.

    A lower value means the prediction is closer to the target.

    Args:
        predictedVector: Predicted values; its length drives the iteration.
        targetVector: Target values, indexed position by position.

    Returns:
        The sum of |target[i] - predicted[i]| over every predicted position.
    """
    total = 0
    for position, guess in enumerate(predictedVector):
        total += abs(targetVector[position] - guess)
    return total


def randomPopulation(size=500):
    """
    Draw random parameters for ``size`` two-layer networks (28 -> 10 -> 4).

    Weights are uniform in [-10, 10] and biases uniform in [-5, 5].

    Args:
        size (int): Number of networks to generate. Default is 500.

    Returns:
        tuple: ``(W1s, W2s, B1s, B2s)`` where, per network,
            - W1s holds a 10 x 28 first-layer weight matrix,
            - W2s holds a 4 x 10 second-layer weight matrix,
            - B1s holds 10 first-layer biases,
            - B2s holds 4 second-layer biases.
    """
    def weightMatrix(rows, cols):
        # rows x cols nested list of weights in [-10, 10]
        return [[random.uniform(-10, 10) for _ in range(cols)] for _ in range(rows)]

    def biasVector(length):
        # flat list of biases in [-5, 5]
        return [random.uniform(-5, 5) for _ in range(length)]

    members = range(size)
    W1s = [weightMatrix(10, 28) for _ in members]
    W2s = [weightMatrix(4, 10) for _ in members]
    B1s = [biasVector(10) for _ in members]
    B2s = [biasVector(4) for _ in members]
    return W1s, W2s, B1s, B2s

def crossoverWeights(rate, population):
    """
    Perform single-point crossover on the weight rows of a population.

    Parents are drawn in pairs, without replacement, from ``population``. For
    each pair and for each of the two weight layers (gene[0] and gene[1]), one
    randomly chosen row of each parent is cut at a shared random point and the
    tails are swapped. Biases (gene[2], gene[3]) are left as they are.

    The parents themselves are never modified: crossover is applied to deep
    copies. This matters because the same gene object can appear more than
    once in a population (``selection`` returns duplicate references), and
    editing it in place would silently change every alias.

    Args:
        rate (float): Fraction of the population that undergoes crossover;
            ``int(rate / 2 * len(population))`` pairs are formed, capped at
            the number of pairs the population can supply.
        population (list): Genes of the form ``(W1, W2, B1, B2)`` where W1 and
            W2 are lists of weight rows (lists).

    Returns:
        list: The crossed children followed by the untouched remaining genes;
        same length as ``population``.
    """
    import copy

    def cross(g1, g2):
        # Swap the tails of two rows at one random cut point.
        cp = random.randrange(0, len(g1))
        return g1[:cp] + g2[cp:], g2[:cp] + g1[cp:]

    tmp_pop = population.copy()
    crossed_pop = []

    # Never ask for more pairs than the pool can provide (e.g. rate > 1).
    pairCount = min(int(rate / 2 * len(population)), len(population) // 2)

    for _ in range(pairCount):
        # pop() removes exactly the chosen element; list.remove() compares by
        # equality and could drop a different, equal-valued gene instead.
        child1 = copy.deepcopy(tmp_pop.pop(random.randrange(0, len(tmp_pop))))
        child2 = copy.deepcopy(tmp_pop.pop(random.randrange(0, len(tmp_pop))))

        for layer in (0, 1):
            idx1 = random.randrange(0, len(child1[layer]))
            idx2 = random.randrange(0, len(child2[layer]))
            child1[layer][idx1], child2[layer][idx2] = cross(
                child1[layer][idx1], child2[layer][idx2]
            )

        crossed_pop.append(child1)
        crossed_pop.append(child2)

    return crossed_pop + tmp_pop

def mutationWeights(rate, weightCount, population):
    """
    Apply random weight mutations to a population of network genes.

    Each gene is selected with probability ``rate``; at most
    ``rate * len(population)`` genes are mutated, and scanning stops at the
    first selected gene that would exceed that budget. A mutated gene gets
    ``weightCount`` random weights in each weight layer (gene[0] and gene[1])
    replaced by fresh values drawn uniformly from [-10, 10].

    Mutation is applied to a deep copy, so neither ``population`` nor the
    genes it contains are modified.

    Args:
        rate (float): Mutation probability per gene, between 0 and 1.
        weightCount (int): Number of weights to redraw per layer.
        population (list): Genes whose first two entries are lists of weight
            rows.

    Returns:
        list: A new population list; mutated genes are replaced by their
        mutated copies, all other entries are the original objects.
    """
    import copy

    newPop = population.copy()
    limit = rate * len(population)  # mutation budget for this call
    for i, gene in enumerate(population):
        if random.random() > rate:
            continue  # gene not selected for mutation
        if limit <= 0:
            break  # budget exhausted
        limit -= 1

        mutant = copy.deepcopy(gene)
        for _ in range(weightCount):
            for layer in (0, 1):
                row = random.randrange(0, len(mutant[layer]))
                col = random.randrange(0, len(mutant[layer][row]))
                mutant[layer][row][col] = random.uniform(-10, 10)
        newPop[i] = mutant

    return newPop

def mutationBias(rate,  population):
    """
    Apply random bias mutations to a population of network genes.

    Each gene is selected with probability ``rate``; at most
    ``rate * len(population)`` genes are mutated, and scanning stops at the
    first selected gene that would exceed that budget. A mutated gene gets one
    randomly chosen entry of each bias vector (gene[2] and gene[3]) replaced
    by a fresh value drawn uniformly from [-5, 5].

    Mutation is applied to a deep copy, so neither ``population`` nor the
    genes it contains are modified.

    Args:
        rate (float): Mutation probability per gene, between 0 and 1.
        population (list): Genes of the form ``(W1, W2, B1, B2)`` where B1 and
            B2 are lists of biases.

    Returns:
        list: A new population list; mutated genes are replaced by their
        mutated copies, all other entries are the original objects.
    """
    import copy

    newPop = population.copy()
    limit = rate * len(population)  # mutation budget for this call
    for i, gene in enumerate(population):
        if random.random() > rate:
            continue  # gene not selected for mutation
        if limit <= 0:
            break  # budget exhausted
        limit -= 1

        mutant = copy.deepcopy(gene)
        for biasSlot in (2, 3):
            # Index range comes from the bias vector itself rather than from
            # the matching weight matrix.
            position = random.randrange(0, len(mutant[biasSlot]))
            mutant[biasSlot][position] = random.uniform(-5, 5)
        newPop[i] = mutant

    return newPop
   
def selection(rate, population, fitness):
    """
    Keep the best-scoring half of the population and duplicate it.

    Individuals are ranked by ascending fitness score (lowest first; ties keep
    their original order). The result is the top half followed by the top half
    again; for an odd-sized population the second part holds one extra
    individual so the output has the same length as the input. The returned
    list contains references to the same objects, not copies.

    Args:
        rate (float): Unused by this implementation.
        population (list): The individuals.
        fitness (list): Score of each individual, aligned with ``population``.

    Returns:
        list: The selected individuals, ``len(population)`` entries long.
    """
    ranking = sorted(range(len(population)), key=lambda position: fitness[position])
    ranked = [population[position] for position in ranking]

    half = len(population) // 2
    if len(population) % 2 == 0:
        refill = ranked[:half]
    else:
        refill = ranked[:half + 1]
    return ranked[:half] + refill

def genPopulationTuples(W1s, W2s, B1s, B2s):
    """
    Build every (w1, w2, b1, b2) combination of the four input lists.

    This is the full Cartesian product, so the result holds
    ``len(W1s) * len(W2s) * len(B1s) * len(B2s)`` tuples; W1s varies slowest
    and B2s fastest.

    Parameters:
    W1s (list): First-layer weight candidates.
    W2s (list): Second-layer weight candidates.
    B1s (list): First-layer bias candidates.
    B2s (list): Second-layer bias candidates.

    Returns:
    list: Tuples ``(w1, w2, b1, b2)`` with one element taken from each list.
    """
    return [
        (w1, w2, b1, b2)
        for w1 in W1s
        for w2 in W2s
        for b1 in B1s
        for b2 in B2s
    ]

def forwardPass(population, dataPoint, target, fitness_results):
    """
    Evaluate every network on one sample and accumulate its fitness score.

    For each ``(w1, w2, b1, b2)`` in ``population`` the sample is pushed
    through both sigmoid layers, and the ``fitness`` value of the output
    against the one-hot encoding of ``target`` is added to the matching slot
    of ``fitness_results``.

    Args:
        population (list of tuples): Network parameters,
            ``[(w1, w2, b1, b2), ...]``.
        dataPoint (numpy array): Input sample fed to every network.
        target (int or float): Class label of the sample.
        fitness_results (list of floats): Running scores, one per network;
            updated in place.

    Returns:
        None
    """
    expected = targetToVector(target)

    for slot, (w1, w2, b1, b2) in enumerate(population):
        hiddenActivation = sigs(np.dot(w1, dataPoint) + b1)
        outputActivation = sigs(np.dot(w2, hiddenActivation) + b2)
        fitness_results[slot] += fitness(outputActivation, expected)

def calculateErrorOf1Input(predictedVector, targetVector):
    """
    Return half the sum of squared errors for one sample.

    E = 1/2 * sum((target[i] - predicted[i]) ** 2), iterating over the
    positions of ``targetVector``.
    Example: target [0, 0, 1, 0] against predicted [0, 0, 0, 0] gives 0.5.
    """
    squaredGaps = (
        (wanted - predictedVector[position]) ** 2
        for position, wanted in enumerate(targetVector)
    )
    return sum(squaredGaps) * 0.5

def train(data , expectedOutput , hiddenLayer, outputLayer):
    """
    Run one sample forward through both layers and measure its error.

    Every neuron is handed its inputs via ``setInputs`` before its ``sigmoid``
    is read, so after the call the neurons still hold this sample's inputs.
    No weights are changed here.

    Args:
        data: Input vector for the hidden layer.
        expectedOutput: Class label, one-hot encoded with ``targetToVector``.
        hiddenLayer: Neurons of the hidden layer.
        outputLayer: Neurons of the output layer; they receive the hidden
            activations as input.

    Returns:
        tuple: ``(error, hiddenLayerOutputs, outputLayerOutputs)`` where
        ``error`` comes from ``calculateErrorOf1Input``.
    """
    def activate(layer, layerInputs):
        # Feed the same input vector to each neuron and collect activations.
        activations = []
        for unit in layer:
            unit.setInputs(layerInputs)
            activations.append(unit.sigmoid())
        return activations

    hotY = targetToVector(expectedOutput)
    hiddenLayerOutputs = activate(hiddenLayer, data)
    outputLayerOutputs = activate(outputLayer, hiddenLayerOutputs)
    sampleError = calculateErrorOf1Input(outputLayerOutputs, hotY)
    return sampleError, hiddenLayerOutputs, outputLayerOutputs
      

      


   

def test_AymanAndOmarNour(labeled_input_matrix, weights_matrix):
    """
    Evaluate one network on a labelled data set.

    Args:
        labeled_input_matrix: A pair ``(inputs, labels)``; element 0 is the
            sequence of input vectors and element 1 the sequence of class
            labels, aligned by position.
        weights_matrix: Network parameters indexed as
            [0] input-to-hidden weights, [1] hidden-to-output weights,
            [2] hidden-layer biases, [3] output-layer biases.

    Returns:
        tuple: ``(Error_Vector, Error)`` where ``Error_Vector`` holds the
        ``fitness`` value (summed absolute difference to the one-hot label)
        of each sample and ``Error`` is their total.
    """
    x = labeled_input_matrix[0]
    y = labeled_input_matrix[1]

    Error_Vector = []
    Error = 0

    for position, label in enumerate(y):
        wanted = targetToVector(label)

        hiddenActivation = sigs(np.dot(weights_matrix[0], x[position]) + weights_matrix[2])
        outputActivation = sigs(np.dot(weights_matrix[1], hiddenActivation) + weights_matrix[3])

        sampleError = fitness(outputActivation, wanted)
        Error_Vector.append(sampleError)
        Error += sampleError

    return Error_Vector, Error

def error_AymanAndOmarNour(Actual_Outputs_Vector, Target_Output_Vector):
    """
    Compute element-wise and total absolute error against a target vector.

    Args:
        Actual_Outputs_Vector (list): Produced output values.
        Target_Output_Vector (list): Expected values; its length drives the
            iteration.

    Returns:
        tuple: ``(Error_Vector, Error)`` where ``Error_Vector[i]`` is
        |actual[i] - target[i]| and ``Error`` is the sum of those values.
    """
    Error_Vector = []
    for position, wanted in enumerate(Target_Output_Vector):
        Error_Vector.append(abs(Actual_Outputs_Vector[position] - wanted))

    return Error_Vector, sum(Error_Vector)
      
def create_layer(numberOfNeurons, numberOfInputs):
    """Return a list of ``numberOfNeurons`` new Neuron objects.

    Each neuron is built for ``numberOfInputs`` inputs and starts with its own
    empty input list.
    """
    return [Neuron(numberOfInputs, []) for _ in range(numberOfNeurons)]




In [None]:
# Mini-batch gradient descent for the 28 -> 10 -> 4 sigmoid network.
#
# Error per sample:      E = 1/2 * sum_i (t_i - o_i)^2
# Output delta:          d_i = -(t_i - o_i) * o_i * (1 - o_i)
# Hidden delta:          d_j = (sum_i d_i * w_ij) * h_j * (1 - h_j)
# Weight/bias gradient:  dE/dw = delta * (input on that weight),  dE/db = delta
# Gradients are summed over a batch and applied once per batch, scaled by
# learningRate / batch size.
hiddenLayer = create_layer(10, 28)
outputLayer = create_layer(4, 10)
learningRate = 0.1
batchSize = 69
epochs = 10

exitCondition = False
counter = 0
while (counter < epochs):
    counter += 1

    # Shuffle by index so every feature row stays paired with its own label
    # (shuffling X_train alone would detach the rows from Y_train).
    shuffledOrder = np.random.permutation(len(X_train))
    # Batches are cut from the actual data length; the last one may be smaller.
    batches = [
        [(X_train[i], Y_train[i]) for i in shuffledOrder[start:start + batchSize]]
        for start in range(0, len(shuffledOrder), batchSize)
    ]

    number_of_batches = len(batches)

    for batch in batches:
        error = 0

        # Gradient accumulators, one slot per weight / bias.
        outputWeightGrads = [[0.0] * len(neuron.weights) for neuron in outputLayer]
        outputBiasGrads = [0.0] * len(outputLayer)
        hiddenWeightGrads = [[0.0] * len(neuron.weights) for neuron in hiddenLayer]
        hiddenBiasGrads = [0.0] * len(hiddenLayer)

        for dataPoint in batch:
            singleError, hiddenLayerOutputs, outputLayerOutputs = train(dataPoint[0], dataPoint[1], hiddenLayer, outputLayer)
            error += singleError
            hotY = targetToVector(dataPoint[1])  # compare against the one-hot target, not the raw label

            # Deltas of both layers are computed from the current weights
            # before any weight is changed.
            outputDeltas = [
                -(hotY[i] - out) * out * (1 - out)
                for i, out in enumerate(outputLayerOutputs)
            ]
            hiddenDeltas = []
            for j, hid in enumerate(hiddenLayerOutputs):
                backflow = sum(outputDeltas[i] * outputLayer[i].weights[j] for i in range(len(outputLayer)))
                hiddenDeltas.append(backflow * hid * (1 - hid))

            for i, delta in enumerate(outputDeltas):
                outputBiasGrads[i] += delta
                for j, hid in enumerate(hiddenLayerOutputs):
                    outputWeightGrads[i][j] += delta * hid

            for j, delta in enumerate(hiddenDeltas):
                hiddenBiasGrads[j] += delta
                for l, feature in enumerate(dataPoint[0]):
                    hiddenWeightGrads[j][l] += delta * feature

        # after finishing the batch: apply the averaged gradients
        step = learningRate / len(batch)
        for layer, weightGrads, biasGrads in (
            (outputLayer, outputWeightGrads, outputBiasGrads),
            (hiddenLayer, hiddenWeightGrads, hiddenBiasGrads),
        ):
            for neuron, gradRow, gradBias in zip(layer, weightGrads, biasGrads):
                neuron.bias -= step * gradBias
                for position, grad in enumerate(gradRow):
                    neuron.weights[position] -= step * grad
                

        
#     errVec, errAgg = test_AymanAndOmarNour((X_validate, Y_validate), bestGene[0])
#     print("Epoch total error: ", errAgg)
 
# print("===================================================")

# # code for testing
# errVec, errAgg = test_AymanAndOmarNour((X_test, Y_test), bestGene[0])
# test_std_dev = np.std(errVec)
# print(f"Test total error: {errAgg}, Accuracy: {100 - (errAgg/(len(Y_test)*4))*100}, Std Dev: {test_std_dev}")

# # code for validation
# errVec, errAgg = test_AymanAndOmarNour((X_validate, Y_validate), bestGene[0])
# validate_std_dev = np.std(errVec)
# print(f"Validate total error: {errAgg}, Accuracy: {100 - (errAgg/(len(Y_validate)*4))*100}, Std Dev: {validate_std_dev}")

# # code for training
# errVec, errAgg = test_AymanAndOmarNour((X_train, Y_train), bestGene[0])
# train_std_dev = np.std(errVec)
# print(f"Train total error: {errAgg}, Accuracy: {100 - (errAgg/(len(Y_train)*4))*100}, Std Dev: {train_std_dev}")
