## Task 1: Dataset Selection

Data Preprocessing:
- Use visualizations (histograms, scatter plots, bar graphs, etc.) to understand the
distribution of features and identify any potential patterns/dependencies or
outliers.
- Identify the data types of each feature (numeric, categorical, text, etc.). For
numeric data, show its characteristics like mean, median, standard deviation, etc.
- Identify and handle missing values (null values) in the data. This could involve
removing rows with missing values, fixing missing values with appropriate
strategies, etc.

In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import random

### Loading  Data 

In [50]:
df = pd.read_csv("HCV-Egy-Data.csv")
df

Unnamed: 0,Age,Gender,BMI,Fever,Nausea/Vomting,Headache,Diarrhea,Fatigue & generalized bone ache,Jaundice,Epigastric pain,...,ALT 36,ALT 48,ALT after 24 w,RNA Base,RNA 4,RNA 12,RNA EOT,RNA EF,Baseline histological Grading,Baselinehistological staging
0,56,1,35,2,1,1,1,2,2,2,...,5,5,5,655330,634536,288194,5,5,13,2
1,46,1,29,1,2,2,1,2,2,1,...,57,123,44,40620,538635,637056,336804,31085,4,2
2,57,1,33,2,2,2,2,1,1,1,...,5,5,5,571148,661346,5,735945,558829,4,4
3,49,2,33,1,2,1,2,1,2,1,...,48,77,33,1041941,449939,585688,744463,582301,10,3
4,59,1,32,1,1,2,1,2,2,2,...,94,90,30,660410,738756,3731527,338946,242861,11,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1380,44,1,29,1,2,2,2,1,1,1,...,63,44,45,387795,55938,5,5,5,15,4
1381,55,1,34,1,2,2,1,1,1,1,...,97,64,41,481378,152961,393339,73574,236273,10,2
1382,42,1,26,2,2,1,1,1,2,1,...,87,39,24,612664,572756,806109,343719,160457,6,2
1383,52,1,29,2,1,1,2,2,2,1,...,48,81,43,139872,76161,515730,2460,696074,15,3


### Data Exploration

In [1]:
df.hist(figsize=(40,20))

NameError: name 'df' is not defined

In [None]:
plt.figure(figsize=(28, 28))  # Adjust the width and height as needed

sns.heatmap(df.corr(), cmap='Reds', annot=True, fmt='.2f')

In [None]:
df.isnull().sum()

In [None]:
num_features = 29
fig, axes = plt.subplots(nrows=num_features, ncols=1, figsize=(10, 5 * num_features))

# Loop through each feature to create a PDF plot
for i, column in enumerate(df):
    # Plotting the density
    df[column].plot(kind='density', ax=axes[i], color='blue', alpha=0.5, label='PDF')

    # Calculate mean, median, and mode
    mean = df[column].mean()
    median = df[column].median()
    mode = df[column].mode()[0]

    # Marking mean, median, and mode with horizontal lines
    axes[i].axvline(x=mean , color='red', linestyle='--', label=f'Mean: {mean:.2f}')  # Adjusted y-value for visibility
    axes[i].axvline(x=median, color='green', linestyle='--', label=f'Median: {median:.2f}')  # Adjusted y-value for visibility
    axes[i].axvline(x=mode , color='purple', linestyle='--', label=f'Mode: {mode:.2f}')  # Adjusted y-value for visibility

    # Adding titles and labels
    axes[i].set_title(f'Probability Density Function for {column}')
    axes[i].set_xlabel(column)
    axes[i].set_ylabel('Density')
    axes[i].legend()
    axes[i].grid()

# Adjust layout and show the plot
plt.tight_layout()
plt.show()

In [None]:
pd.set_option('display.max_rows', None)
df.dtypes

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
df.plot(kind='box', subplots=True, sharex=False, sharey=False, layout=(10,3), figsize=(15,25))

In [None]:
Q1 = np.percentile(df['ALT after 24 w'], 25)
Q3 = np.percentile(df['ALT after 24 w'], 75)
IQR = Q3 - Q1
df = df[(df['ALT after 24 w'] >= Q1 - 1.5 * IQR) & (df['ALT after 24 w'] <= Q3 + 1.5 * IQR)]

Q1 = np.percentile(df['RNA 12'], 25)
Q3 = np.percentile(df['RNA 12'], 75)
IQR = Q3 - Q1
df = df[(df['RNA 12'] >= Q1 - 1.5 * IQR) & (df['RNA 12'] <= Q3 + 1.5 * IQR)]

df.plot(kind='box', subplots=True, sharex=False, sharey=False, layout=(10,3), figsize=(15,25))

# Task 2: Update NN weights using Genetic Algorithm

### Shuffle and split data

In [51]:
shuffled_data = df.sample(frac=1)

X = df.iloc[:,:-1].values
Y = df.iloc[:,-1].values

X_train, m_test, Y_train, my_test = train_test_split(X, Y, test_size = 0.30, random_state = 0)

X_validate, X_test, Y_validate, Y_test = train_test_split(m_test, my_test, test_size = 1/3, random_state = 0)


### Build Neural Network

In [95]:
def sigmoid(x):
  return 1 / (1 + np.exp(-x))

sigs = np.vectorize(sigmoid)

def targetToVector(target):
  return [1 if i+1 == target else 0 for i in range(4)]

def fitness(predictedVector, targetVector):
    return sum([abs(targetVector[i] - predictedVector[i]) for i in range(len(predictedVector))])


def randomPopulation(size=500):
   W1s = [[[random.uniform(-10, 10) for i in range(28)] for j in range(10)] for count in range(size)]
   W2s = [[[random.uniform(-10, 10) for i in range(10)] for j in range(4)] for count in range(size)]
   B1s = [[random.uniform(-5, 5) for i in range(10)] for count in range(size)]
   B2s = [[random.uniform(-5, 5) for i in range(4)] for count in range(size)]
   return W1s, W2s, B1s, B2s

def crossoverWeights(rate, population):
    tmp_pop = population.copy()
    crossed_pop = []

    def cross(g1, g2):
        cp = random.randrange(0, len(g1))
        tg1_1 = g1[:cp]
        tg1_2 = g1[cp:]
        tg2_1 = g2[:cp]
        tg2_2 = g2[cp:]
        return (tg1_1 + tg2_2, tg2_1 + tg1_2)
    
    for i in range(int(rate/2*len(population))):
        parentGene1 = tmp_pop[random.randrange(0, len(tmp_pop))]
        tmp_pop.remove(parentGene1)
        parentGene2 = tmp_pop[random.randrange(0, len(tmp_pop))]
        tmp_pop.remove(parentGene2)


        idx1, idx2 = random.randrange(0, len(parentGene1[0])), random.randrange(0, len(parentGene1[0]))
        parentWeightVector1 = parentGene1[0][idx1]
        parentWeightVector2 = parentGene2[0][idx2]                           
         
        new_w1_p1, new_w1_p2 = cross(parentWeightVector1 , parentWeightVector2)

        parentGene1[0][idx1] = new_w1_p1
        parentGene2[0][idx2] = new_w1_p2

        idx1, idx2 = random.randrange(0, len(parentGene1[1])), random.randrange(0, len(parentGene1[1]))

        parentWeightVector1 = parentGene1[1][idx1]
        parentWeightVector2 = parentGene2[1][idx2]                           
         
        new_w2_p1, new_w2_p2 = cross(parentWeightVector1 , parentWeightVector2)

        parentGene1[1][idx1] = new_w2_p1
        parentGene2[1][idx2] = new_w2_p2


        #print("crossed: ", (new_w1_p1, new_w2_p1, parentGene1[2], parentGene1[3]))
        crossed_pop.append(parentGene1)
        crossed_pop.append(parentGene2)
        #print(len(crossed_pop), len(tmp_pop), len(crossed_pop + tmp_pop))
    return crossed_pop + tmp_pop

def mutationWeights(rate, weightCount, population):
   newPop = population.copy()
   limit = rate*(len(population))
   for i, gene in enumerate(population):
      #print(gene)
      if (random.random() <= rate):
         if (limit > 0):
            limit -=1
            for j in range(weightCount):
              newG = gene
              idx = random.randrange(0, len(gene[0]))
              rndW = random.randrange(0, len(gene[0][idx]))
              newG[0][idx][rndW] = random.uniform(-10, 10)
              idx = random.randrange(0, len(gene[1]))
              rndW = random.randrange(0, len(gene[1][idx]))
              newG[1][idx][rndW] = random.uniform(-10, 10)
              
              newPop[i] = newG
         else:
            break 
         
   return newPop

def mutationBias(rate,  population):
   newPop = population.copy()
   limit = rate*(len(population))
   for i, gene in enumerate(population):
      if (random.random() <= rate):
         if (limit > 0):
            limit -=1
            newG = gene
            idx = random.randrange(0, len(gene[0]))
            newG[2][idx] = random.uniform(-5, 5)
            idx = random.randrange(0, len(gene[1]))
            newG[3][idx] = random.uniform(-5, 5)
         else:
            break 
         
   return newPop
   
def selection(rate, population, fitness):
   #print("pop in select", len(population))
   full = list(map(lambda y: y[0], sorted([(population[i], fitness[i]) for i in range(len(population))], key = lambda x: x[1])))
   #print("in select: ",len(full[:int(len(population)/2)] + (full[:int(len(population)/2)] if len(population) % 2 == 0 else full[int(len(population)/2):])))
   #print(len(full[:int(len(population)/2)]), len(full[:int(len(population)/2)] if len(population) % 2 == 0 else full[int(len(population)/2):]), len(full[:int(len(population)/2)] + (full[:int(len(population)/2)] if len(population) % 2 == 0 else full[:int(len(population)/2)+1])))
   return full[:int(len(population)/2)] + (full[:int(len(population)/2)] if len(population) % 2 == 0 else full[:int(len(population)/2)+1])

def genPopulationTuples(W1s, W2s, B1s, B2s):
   tuplePopulation = []

   for w1 in W1s:
      for w2 in W2s:
         for b1 in B1s:
            for b2 in B2s:  
               tuplePopulation.append((w1, w2, b1, b2))

   return tuplePopulation

def forwardPass(population, dataPoint, target, fitness_results):
   idx = 0

   tuplePopulation = []

   for w1, w2, b1, b2 in population :  
      midResult = np.dot(w1, dataPoint)
      actv_midResult = sigs(midResult + b1)
               
      finalResult = np.dot(w2, actv_midResult)
      actv_finalResult = sigs(finalResult + b2)
      #print(actv_finalResult)
      fitness_results[idx] += fitness(actv_finalResult, targetToVector(target))
      idx+=1

def test_AymanAndOmarNour(labeled_input_matrix, weights_matrix):
   idx = 0

   x = labeled_input_matrix[0]
   y = labeled_input_matrix[1]

   Error_Vector = [0] * len(y)
   Error = 0

   for i in range(len(y)):
      #print(len(lblEd))
      yV = targetToVector(y[i])
      
      midResult = np.dot(weights_matrix[0], x[i])
      actv_midResult = sigs(midResult + weights_matrix[2])
               
      finalResult = np.dot(weights_matrix[1], actv_midResult)
      actv_finalResult = sigs(finalResult + weights_matrix[3])
      Error_Vector[idx] = fitness(actv_finalResult, yV)
      Error += fitness(actv_finalResult, yV)
      idx += 1 

   return Error_Vector, Error

def error_AymanAndOmarNour(Actual_Outputs_Vector, Target_Output_Vector):
   Error_Vector = [abs(Actual_Outputs_Vector[i] - Target_Output_Vector[i]) for i in range(len(Target_Output_Vector))]
   Error = sum(Error_Vector)

   return Error_Vector, Error





In [196]:
W1s, W2s, B1s, B2s = randomPopulation(2)

tuplePopulation = genPopulationTuples(W1s, W2s, B1s, B2s)

counterDP = 0

exitCondition = False
counter = 0
while (counter < 1):
    counter += 1
    np.random.shuffle(X_train)
    batches = [[(X_train[bf * 69 + i], Y_train[bf * 69 + i]) for i in range(69) ] for bf in range(14)]

    number_of_batches = len(batches)

    for batch in batches:
        
        resultsW2 = []
        
        fitnessResults = [0] * (len(W1s) * len(W2s) * len(B1s) * len(B2s)) 

        for dataPoint in batch:
           forwardPass(tuplePopulation, dataPoint[0], dataPoint[1], fitnessResults)
           #print(counterDP)
           counterDP+=1

        # code for updating weights and biases
        fitnessResults = list(map(lambda x: x/69, fitnessResults))

        #print("tp>>>>", len(tuplePopulation))

        newPopulation = selection(0.5, tuplePopulation, fitnessResults)
        #print("pop after select>>>>", len(newPopulation))
        #print("np>>>>", len(newPopulation))
        newPopulation = crossoverWeights(0.65, newPopulation)
        #print("pop after crossOver>>>>", len(newPopulation))
        newPopulation = mutationWeights(0.5, 2, newPopulation)
        #print("pop after mutateWeights>>>>", len(newPopulation))
        newPopulation = mutationBias(0.3, newPopulation)

        bestGene = sorted([(tuplePopulation[i], fitnessResults[i]) for i in range(len(tuplePopulation))], key = lambda x: x[1])[0]
        #print("fitness: ", bestGene[1])

        #print("pop after batch>>>>", len(newPopulation))

        tuplePopulation = newPopulation

    bestGene = sorted([(tuplePopulation[i], fitnessResults[i]) for i in range(len(tuplePopulation))], key = lambda x: x[1])[0]

    #print(bestGene)
    
    # code for validation
    errVec, errAgg = test_AymanAndOmarNour((X_validate, Y_validate), bestGene[0])
    print("Epoch total error: ", errAgg)
 
print("===================================================")

# code for testing
errVec, errAgg = test_AymanAndOmarNour((X_test, Y_test), bestGene[0])
test_std_dev = np.std(errVec)
print(f"Test total error: {errAgg}, Accuracy: {100 - (errAgg/(len(Y_test)*4))*100}, Std Dev: {test_std_dev}")

# code for validation
errVec, errAgg = test_AymanAndOmarNour((X_validate, Y_validate), bestGene[0])
validate_std_dev = np.std(errVec)
print(f"Validate total error: {errAgg}, Accuracy: {100 - (errAgg/(len(Y_validate)*4))*100}, Std Dev: {validate_std_dev}")

# code for training
errVec, errAgg = test_AymanAndOmarNour((X_train, Y_train), bestGene[0])
train_std_dev = np.std(errVec)
print(f"Train total error: {errAgg}, Accuracy: {100 - (errAgg/(len(Y_train)*4))*100}, Std Dev: {train_std_dev}")


  return 1 / (1 + np.exp(-x))


Epoch total error:  295.1010894004438
Test total error: 149.88494244444323, Accuracy: 73.0422765387692, Std Dev: 0.1554426097317241
Validate total error: 295.1010894004438, Accuracy: 73.36632767144009, Std Dev: 0.1625503083866556
Train total error: 1026.9171335292963, Accuracy: 73.50574990894489, Std Dev: 0.16187712014278757
