# Prepare Data

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import math

In [3]:
# Load seaborn's "titanic" example dataset into a DataFrame and display it.
titanic = sns.load_dataset('titanic')
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [4]:
# Keep only the rows that have no missing value in any column. Rebinding the
# name (instead of inplace=True) leaves the loaded frame object untouched.
# NOTE(review): this runs before the unused columns are dropped, so rows are
# discarded for missing values in columns such as `deck` that are removed in a
# later cell; only 182 rows remain. Consider dropping unused columns first.
titanic = titanic.dropna()
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,4.0,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False
872,0,1,male,33.0,0,0,5.0000,S,First,man,True,B,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [5]:
# Summarise the remaining rows: non-null counts and dtype of every column.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Index: 182 entries, 1 to 889
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     182 non-null    int64   
 1   pclass       182 non-null    int64   
 2   sex          182 non-null    object  
 3   age          182 non-null    float64 
 4   sibsp        182 non-null    int64   
 5   parch        182 non-null    int64   
 6   fare         182 non-null    float64 
 7   embarked     182 non-null    object  
 8   class        182 non-null    category
 9   who          182 non-null    object  
 10  adult_male   182 non-null    bool    
 11  deck         182 non-null    category
 12  embark_town  182 non-null    object  
 13  alive        182 non-null    object  
 14  alone        182 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 18.2+ KB


In [6]:
# Remove the text/categorical columns so that only numeric and boolean
# features (plus the `survived` target) remain for the regression.
# `drop(columns=...)` with a rebinding replaces the inplace/axis form.
titanic = titanic.drop(columns=['deck', 'embark_town', 'alive','who','embarked','sex','class'])
titanic

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone
1,1,1,38.0,1,0,71.2833,False,False
3,1,1,35.0,1,0,53.1000,False,False
6,0,1,54.0,0,0,51.8625,True,True
10,1,3,4.0,1,1,16.7000,False,False
11,1,1,58.0,0,0,26.5500,False,True
...,...,...,...,...,...,...,...,...
871,1,1,47.0,1,1,52.5542,False,False
872,0,1,33.0,0,0,5.0000,True,True
879,1,1,56.0,0,1,83.1583,False,False
887,1,1,19.0,0,0,30.0000,False,True


<h1 style="color: #FF0000;">Henceforth, y = titanic['survived'] is a global variable</h1>

In [7]:
# Target vector; parent_score() further down reads this module-level name directly.
y = titanic['survived']

# Define Linear Regression

## Define Normalize

In [8]:
def normalize(X):
    """Return a z-score standardised copy of ``X``.

    Every non-boolean column is replaced by ``(col - mean) / std`` (pandas'
    sample standard deviation). Boolean columns are passed through unchanged.

    The input frame is NOT modified: the work is done on a copy, so callers
    can normalise the same feature subset repeatedly without side effects
    (and without SettingWithCopy warnings when ``X`` is a slice of a larger
    frame).

    Columns whose standard deviation is zero or undefined (constant column,
    fewer than two rows) carry no information and would otherwise become
    NaN/inf; they are set to 0.0 instead.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns (numeric or boolean).

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``X``.
    """
    X = X.copy()
    for col in X.columns:
        if X[col].dtype == 'bool':
            # Indicator columns are kept as they are.
            continue
        spread = X[col].std()
        if pd.isna(spread) or spread == 0:
            # Avoid 0/0 (NaN) or x/0 (inf) for degenerate columns.
            X[col] = 0.0
        else:
            X[col] = (X[col] - X[col].mean()) / spread
    return X
# normalize(titanic)

## Define Gradient Descent

In [9]:
def gradient_descend(X, y, learning_rate = 0.01, n_iter = 1000):
    """Fit linear-regression weights with batch gradient descent.

    The features are standardised with ``normalize`` and an intercept column
    of ones is prepended. Starting from all-zero weights, each iteration
    moves ``theta`` against the gradient ``X.T @ (X @ theta - y) / m``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns (numeric or boolean); may have zero columns, in which
        case only the intercept is fitted.
    y : array-like
        Target values, one per row of ``X`` (matched by position).
    learning_rate : float
        Step size of every update.
    n_iter : int
        Number of update steps.

    Returns
    -------
    numpy.ndarray
        Float weights of length ``X.shape[1] + 1``; index 0 is the intercept.
    """
    # A frame that mixes bool and float columns converts to dtype=object,
    # which would push every matrix product through Python-object arithmetic.
    # Cast to float once up front.
    features = np.asarray(normalize(X), dtype=float)
    design = np.c_[np.ones(features.shape[0]), features]
    target = np.asarray(y, dtype=float)

    m = design.shape[0]
    theta = np.zeros(design.shape[1])

    for _ in range(n_iter):
        residual = design @ theta - target
        gradient = design.T @ residual / m
        theta = theta - (learning_rate * gradient)
    return theta

### Testing if the regression works

In [10]:
# x = (titanic.drop('survived', axis = 1))
# y = (titanic['survived'])
# theta = gradient_descend(X=x, y=y, learning_rate = 0.01, n_iter = 1000)
# theta



# Define Fitness Function
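Here we use the Akaike information criterion (AIC), defined as

$$\mathrm{AIC} = 2k - 2\ln(\hat{L})$$

where $k$ is the number of estimated parameters and $\hat{L}$ is the maximised value of the likelihood function.

For a least-squares fit with Gaussian errors this becomes, up to an additive constant, $\mathrm{AIC} = 2k + n\ln(\mathrm{RSS}/n)$, which is the form computed below.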
 

In [11]:
def rss(X,y,theta):
    """Residual sum of squares of a fitted linear model.

    ``X`` is standardised with ``normalize`` and given a leading intercept
    column, mirroring how the weights are fitted, then the squared
    differences between ``X @ theta`` and ``y`` are summed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns the model was fitted on.
    y : array-like
        Observed target values, one per row of ``X`` (matched by position).
    theta : array-like
        Weights, intercept first, of length ``X.shape[1] + 1``.

    Returns
    -------
    numpy.float64
        Sum of squared residuals.
    """
    # Force float arrays: mixed bool/float frames (and weights fitted on them)
    # can otherwise arrive as dtype=object and slow the arithmetic down.
    features = np.asarray(normalize(X), dtype=float)
    design = np.c_[np.ones(features.shape[0]), features]
    residuals = design @ np.asarray(theta, dtype=float) - np.asarray(y, dtype=float)
    # Vectorised reduction instead of the builtin sum() over a Series.
    return np.sum(residuals ** 2)

def calculate_aic(X,y,theta):
    """Akaike information criterion of a least-squares fit.

    Computed as ``2 * k + n * ln(RSS / n)`` where ``k`` is the number of
    feature columns in ``X`` (the intercept is not counted), ``n`` is the
    number of observations and ``RSS`` comes from ``rss``. Lower is better.
    """
    n_obs = len(y)
    complexity_penalty = 2 * X.shape[1]
    fit_term = n_obs * np.log(rss(X,y,theta) / n_obs)
    return complexity_penalty + fit_term

In [12]:
# x = (titanic.drop('survived', axis = 1))
# y = (titanic['survived'])
# theta = gradient_descend(X=x, y=y, learning_rate = 0.01, n_iter = 1000)
# calculate_aic(x,y,theta)

# Generate Parent Population

In [14]:
def parent_pops(data,size,y):
    """Create the initial population of random feature subsets.

    Parameters
    ----------
    data : pandas.DataFrame
        Full data set including the target column.
    size : int
        Number of individuals to generate.
    y : label
        Name of the target column; it is removed before sampling.

    Returns
    -------
    list of pandas.DataFrame
        ``size`` frames, each holding a random selection of the feature
        columns (every column is kept or dropped with equal probability, so
        a frame may have no columns at all).
    """
    features = data.drop(y, axis = 1)
    n_features = features.shape[1]

    def _random_subset():
        # One independent True/False draw per feature column.
        keep_mask = np.random.choice([True,False],size = n_features)
        return features.iloc[:, np.where(keep_mask)[0]]

    return [_random_subset() for _ in range(size)]

# Smoke test: draw 10 random feature subsets, with the 'survived' target excluded.
parent_pops(titanic,10,'survived')

[     parch
 1        0
 3        0
 6        0
 10       1
 11       0
 ..     ...
 871      1
 872      0
 879      1
 887      0
 889      0
 
 [182 rows x 1 columns],
       age  parch  alone
 1    38.0      0  False
 3    35.0      0  False
 6    54.0      0   True
 10    4.0      1  False
 11   58.0      0   True
 ..    ...    ...    ...
 871  47.0      1  False
 872  33.0      0   True
 879  56.0      1  False
 887  19.0      0   True
 889  26.0      0   True
 
 [182 rows x 3 columns],
      sibsp  parch     fare  adult_male  alone
 1        1      0  71.2833       False  False
 3        1      0  53.1000       False  False
 6        0      0  51.8625        True   True
 10       1      1  16.7000       False  False
 11       0      0  26.5500       False   True
 ..     ...    ...      ...         ...    ...
 871      1      1  52.5542       False  False
 872      0      0   5.0000        True   True
 879      0      1  83.1583       False  False
 887      0      0  30.0000     

## Calculate fitness score for parents

In [15]:
def parent_score(keys):
    """Score every individual of a population.

    For each feature-subset frame in ``keys`` a linear model is fitted
    against the module-level target ``y`` and its AIC is negated, so that a
    higher score means a fitter individual.

    Returns
    -------
    dict
        Maps the position of each individual in ``keys`` to its score.
    """
    def _negated_aic(candidate):
        weights = gradient_descend(candidate, y, learning_rate = 0.01, n_iter = 1000)
        return -1*calculate_aic(candidate,y,weights)

    return {position: _negated_aic(candidate) for position, candidate in enumerate(keys)}

In [16]:
# parent_score(parent_pops(titanic,10,'survived'))

## Select fittest n parents

In [17]:
# population = parent_pops(titanic,100,'survived')
def fittest_parents(population,lamda):
    """Select the best-scoring individuals of a population.

    Parameters
    ----------
    population : list of pandas.DataFrame
        Candidate feature subsets.
    lamda : int
        How many individuals to keep.

    Returns
    -------
    list of pandas.DataFrame
        Up to ``lamda`` members of ``population``, ordered from the highest
        ``parent_score`` downwards.
    """
    scores = parent_score(population)
    # Positions ranked by score, best first; ties keep their original order.
    ranked_positions = sorted(scores, key = scores.get, reverse = True)
    return [population[position] for position in ranked_positions[:lamda]]

# Define crossover
We start with uniform crossover.

In [17]:
def uniform_crossover(parent1, parent2):
    """Combine two feature subsets with uniform crossover.

    Each parent is a DataFrame whose columns are its selected features. For
    every feature present in at least one parent, a fair coin decides which
    parent's "gene" is inherited: the child receives the feature only if
    the chosen parent has it. Features shared by both parents are therefore
    always inherited; features in neither parent never appear.

    Parameters
    ----------
    parent1, parent2 : pandas.DataFrame
        Feature subsets of the same data set (assumed to share one index).

    Returns
    -------
    pandas.DataFrame
        New frame on ``parent1``'s index holding the inherited columns
        (possibly none). Neither parent is modified.
    """
    # Candidate genes: parent1's columns first, then parent2's extras.
    gene_pool = list(parent1.columns) + [
        name for name in parent2.columns if name not in parent1.columns
    ]
    from_first = np.random.random(len(gene_pool)) < 0.5

    inherited = {}
    for name, use_first in zip(gene_pool, from_first):
        donor = parent1 if use_first else parent2
        if name in donor.columns:
            inherited[name] = donor[name]
    return pd.DataFrame(inherited, index=parent1.index)

# Define Mutation
We start with random bit-flip mutation.

In [18]:
def rbf_mutation(parent1, parent2):
    """Random bit-flip mutation of ``parent1``.

    One feature is picked uniformly at random from the features present in
    either parent and its membership in ``parent1`` is flipped: a feature
    ``parent1`` already has is removed, a feature it lacks is added using
    ``parent2``'s column as the data source.

    NOTE(review): the two-parent signature was given by the original stub;
    using ``parent2`` as the donor for switched-on features is this
    implementation's interpretation of it. Confirm it matches the intent.

    Parameters
    ----------
    parent1 : pandas.DataFrame
        Individual to mutate (not modified).
    parent2 : pandas.DataFrame
        Donor of columns that ``parent1`` does not have (assumed to share
        ``parent1``'s index).

    Returns
    -------
    pandas.DataFrame
        Copy of ``parent1`` with exactly one feature added or removed, or an
        unchanged copy when neither parent has any feature.
    """
    gene_pool = list(parent1.columns) + [
        name for name in parent2.columns if name not in parent1.columns
    ]
    if not gene_pool:
        # Nothing to flip.
        return parent1.copy()

    flipped = gene_pool[np.random.randint(len(gene_pool))]
    if flipped in parent1.columns:
        return parent1.drop(columns=[flipped])

    child = parent1.copy()
    child[flipped] = parent2[flipped]
    return child


# Define Genetic Algorithm
We follow the (µ, λ) evolution strategy: each new generation is built only from the offspring of the selected parents.


In [19]:
def gen_alg(data,no_parents_selected,pop_size,target_variable,iterations): 
    """Feature selection with a (mu, lambda)-style genetic algorithm.

    A random population of feature subsets is created; in every generation
    the ``no_parents_selected`` fittest individuals are kept as parents and
    each unordered pair of distinct parents produces one child, by
    ``uniform_crossover`` or ``rbf_mutation`` with equal probability. The
    children replace the whole population.

    Parameters
    ----------
    data : pandas.DataFrame
        Full data set including the target column.
    no_parents_selected : int
        Number of parents kept per generation (at least 2 are needed to
        produce offspring).
    pop_size : int
        Size of the initial random population.
    target_variable : label
        Name of the target column in ``data``.
    iterations : int
        Number of generations to run.

    Returns
    -------
    list of pandas.DataFrame
        Result of ``fittest_parents(population, 1)`` on the last population,
        i.e. a list holding the single best feature subset.
    """
    population = parent_pops(data,pop_size,target_variable)

    for _ in range(iterations):
        parents = fittest_parents(population,no_parents_selected)
        new_gen = []
        # Visit every unordered pair (j, k) with j < k exactly once; the inner
        # index starts after j so no parent is paired with itself and no pair
        # is produced twice.
        for j in range(len(parents)):
            for k in range(j+1,len(parents)):
                # Fair coin between the two variation operators.
                if np.random.random() < 0.5:
                    child = uniform_crossover(parents[j],parents[k])
                else:
                    child = rbf_mutation(parents[j],parents[k])
                # An operator that is not implemented yet returns None; such
                # "children" cannot be scored, so they are skipped.
                if child is not None:
                    new_gen.append(child)
        if not new_gen:
            # No offspring (fewer than two parents, or no working operator):
            # keep the current population instead of replacing it with nothing.
            break
        population = new_gen

    return fittest_parents(population,1)