# Prepare Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


In [2]:
# Load seaborn's 'titanic' example dataset into a DataFrame.
titanic = sns.load_dataset('titanic')
# Bare last expression: the notebook renders the frame as this cell's output.
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [3]:
# Keep only the rows that have no missing value in any column.
# NOTE(review): this also filters on columns that are removed in a later cell
# (e.g. `deck`, which is blank for several of the rows shown above), so many
# rows are discarded for a column the model never uses -- consider dropping the
# unused columns before calling dropna().
titanic = titanic.dropna()  # rebind rather than inplace=True: no hidden in-place mutation
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,4.0,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False
872,0,1,male,33.0,0,0,5.0000,S,First,man,True,B,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [4]:
# Print the index range, per-column non-null counts and dtypes of the filtered frame.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Index: 182 entries, 1 to 889
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     182 non-null    int64   
 1   pclass       182 non-null    int64   
 2   sex          182 non-null    object  
 3   age          182 non-null    float64 
 4   sibsp        182 non-null    int64   
 5   parch        182 non-null    int64   
 6   fare         182 non-null    float64 
 7   embarked     182 non-null    object  
 8   class        182 non-null    category
 9   who          182 non-null    object  
 10  adult_male   182 non-null    bool    
 11  deck         182 non-null    category
 12  embark_town  182 non-null    object  
 13  alive        182 non-null    object  
 14  alone        182 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 18.2+ KB


In [5]:
# Remove the object/category columns so only int, float and bool columns remain
# as regression inputs.
# errors='ignore' keeps the cell re-runnable: once the columns are gone a second
# execution is a no-op instead of a KeyError.
titanic = titanic.drop(
    columns=['deck', 'embark_town', 'alive', 'who', 'embarked', 'sex', 'class'],
    errors='ignore',
)
titanic

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone
1,1,1,38.0,1,0,71.2833,False,False
3,1,1,35.0,1,0,53.1000,False,False
6,0,1,54.0,0,0,51.8625,True,True
10,1,3,4.0,1,1,16.7000,False,False
11,1,1,58.0,0,0,26.5500,False,True
...,...,...,...,...,...,...,...,...
871,1,1,47.0,1,1,52.5542,False,False
872,0,1,33.0,0,0,5.0000,True,True
879,1,1,56.0,0,1,83.1583,False,False
887,1,1,19.0,0,0,30.0000,False,True


# Define Linear Regression

## Define Normalize

In [6]:
def normalize(X):
    """Standardise the non-boolean columns of a feature table.

    Every column whose dtype is not ``bool`` is shifted to zero mean and scaled
    by its sample standard deviation; boolean columns are passed through
    unchanged.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. It is NOT modified; the result is a new frame.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` with the non-boolean columns standardised.
    """
    # Work on a copy so the caller's frame (or a slice of a larger frame) is
    # never overwritten as a side effect of fitting or scoring a model.
    X = X.copy()
    for col in X.columns:
        if X[col].dtype == 'bool':
            continue
        centered = X[col] - X[col].mean()
        spread = X[col].std()
        # A constant (or single-row) column has a zero / NaN standard deviation.
        # Dividing by it would fill the column with NaN and poison every later
        # matrix product, so such a column is only centred.
        if spread > 0:
            centered = centered / spread
        X[col] = centered
    return X

## Define Gradient Descent

In [7]:
def gradient_descend(X, y, learning_rate = 0.01, n_iter = 1000):
    """Fit linear-regression coefficients with batch gradient descent.

    The features are standardised with ``normalize`` and a leading column of
    ones is prepended, so ``theta[0]`` is the intercept and ``theta[1:]`` are
    the weights of the standardised columns of ``X``, in column order.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table, one row per sample. It is not modified.
    y : array-like of length n_samples
        Target values, matched to the rows of ``X`` by position.
    learning_rate : float, default 0.01
        Step size applied to the gradient on every iteration.
    n_iter : int, default 1000
        Number of gradient steps to take.

    Returns
    -------
    numpy.ndarray
        float64 vector of length ``X.shape[1] + 1`` (intercept first).
    """
    # Hand normalize() its own copy so the standardisation step can never
    # alter the caller's frame.
    # Cast to float64 once: a frame mixing bool and float columns would
    # otherwise become an object-dtype array and every matrix product below
    # would run element-by-element on Python objects.
    features = np.asarray(normalize(X.copy()), dtype=float)
    design = np.c_[np.ones(features.shape[0]), features]
    target = np.asarray(y, dtype=float)

    theta = np.zeros(design.shape[1])
    m = design.shape[0]

    for _ in range(n_iter):
        residual = design @ theta - target
        gradient = design.T @ residual / m
        theta = theta - (learning_rate * gradient)
    return theta



## Calculate Residual Sum of Squares

In [8]:
def rss(X,y,theta):
    """Residual sum of squares of a fitted linear model.

    ``X`` is standardised with ``normalize`` and given a leading column of
    ones, mirroring the design matrix used during fitting, then the squared
    differences between ``design @ theta`` and ``y`` are summed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table, one row per sample. It is not modified.
    y : array-like of length n_samples
        Observed target values, matched to the rows of ``X`` by position.
    theta : array-like of length ``X.shape[1] + 1``
        Coefficients, intercept first.

    Returns
    -------
    float
        Sum of squared residuals.
    """
    # Give normalize() a private copy so scoring never rewrites the caller's frame.
    # Cast to float64 so mixed bool/float frames do not become object arrays.
    features = np.asarray(normalize(X.copy()), dtype=float)
    design = np.c_[np.ones(features.shape[0]), features]
    residual = design @ np.asarray(theta, dtype=float) - np.asarray(y, dtype=float)
    # Dot product of the residual with itself == sum of squares, computed in
    # NumPy rather than with the Python-level builtin sum().
    return float(residual @ residual)


### Testing if the regression works

In [9]:
# Smoke test: fit on every remaining feature and report the training RSS.
x = titanic.drop(columns='survived')
y = titanic['survived']
theta = gradient_descend(X=x, y=y, learning_rate=0.01, n_iter=1000)
# Only the last expression of a cell is displayed, so the bare `theta` that
# used to sit above this line had no effect and was removed.
rss(x, y, theta)


26.21043043776363

# Define Fitness Function
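Here we use the Akaike information criterion (AIC) as the fitness function. It is given by

$$\mathrm{AIC} = 2k - 2\ln(\hat{L})$$

where

- $k$ = number of estimated parameters
- $\hat{L}$ = maximised value of the likelihood function

For a least-squares fit with Gaussian errors, $-2\ln(\hat{L}) = n\ln(\mathrm{RSS}/n)$ up to an additive constant, so the code below computes $\mathrm{AIC} = 2k + n\ln(\mathrm{RSS}/n)$, where $n$ is the number of observations and $\mathrm{RSS}$ is the residual sum of squares. Lower values indicate a better model.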
 

In [10]:
def calculate_aic(X,y,theta):
    """Akaike information criterion of a fitted model, least-squares form.

    Evaluates ``2 * k + n * ln(RSS / n)`` where ``n`` is ``len(y)``, ``RSS`` is
    the value returned by ``rss(X, y, theta)`` and ``k`` is the number of
    columns of ``X`` (the intercept term is not counted).

    Parameters
    ----------
    X : feature table exposing ``.shape``; forwarded to ``rss``.
    y : observed targets; forwarded to ``rss``.
    theta : fitted coefficients; forwarded to ``rss``.

    Returns
    -------
    The AIC score.
    """
    n_obs = len(y)
    n_features = X.shape[1]
    mean_squared_residual = rss(X, y, theta) / n_obs
    return 2 * n_features + n_obs * np.log(mean_squared_residual)

In [11]:
# Rebuild the feature table and target, refit, then score the full model with AIC.
x = titanic.drop(columns='survived')
y = titanic['survived']
theta = gradient_descend(X=x, y=y, learning_rate=0.01, n_iter=1000)
calculate_aic(x, y, theta)

-338.68856299444826

# Generate Parent Population

In [13]:
def parent_pops(data,size,y,rng=None):
    """Build the initial population: ``size`` random feature subsets of ``data``.

    For each individual, every feature column (all columns except ``y``) is
    kept or dropped by an independent fair coin flip. An individual may end up
    with no columns at all.

    Parameters
    ----------
    data : pandas.DataFrame
        Full table including the target column. It is not modified.
    size : int
        Number of individuals to generate.
    y : label
        Name of the target column to exclude from the candidate features.
    rng : object with a NumPy-style ``choice`` method, optional
        Random source, e.g. ``np.random.default_rng(seed)``, for reproducible
        populations. Defaults to NumPy's global random state.

    Returns
    -------
    list of pandas.DataFrame
        ``size`` frames, each holding all rows of ``data`` and a random subset
        of its feature columns in their original order.
    """
    features = data.drop(columns=y)
    chooser = np.random if rng is None else rng
    population = []
    for _ in range(size):
        keep = chooser.choice([True, False], size=features.shape[1])
        # .copy() makes each individual an independent frame, so later column
        # assignments on it cannot warn about, or leak into, the source table.
        population.append(features.loc[:, keep].copy())
    return population

# Demo: draw ten random feature subsets, excluding the 'survived' target column.
parent_pops(data=titanic, size=10, y='survived')

[     pclass  sibsp  parch     fare  adult_male
 1         1      1      0  71.2833       False
 3         1      1      0  53.1000       False
 6         1      0      0  51.8625        True
 10        3      1      1  16.7000       False
 11        1      0      0  26.5500       False
 ..      ...    ...    ...      ...         ...
 871       1      1      1  52.5542       False
 872       1      0      0   5.0000        True
 879       1      0      1  83.1583       False
 887       1      0      0  30.0000       False
 889       1      0      0  30.0000        True
 
 [182 rows x 5 columns],
      sibsp  parch  adult_male
 1        1      0       False
 3        1      0       False
 6        0      0        True
 10       1      1       False
 11       0      0       False
 ..     ...    ...         ...
 871      1      1       False
 872      0      0        True
 879      0      1       False
 887      0      0       False
 889      0      0        True
 
 [182 rows x 3 columns]

# Define crossover