In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
train_data=pd.read_csv("LOG_TRAIN_MANAS.csv")
test_data=pd.read_csv("LOG_TEST_MANAS.csv")

In [3]:
train_data.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


## Let's have a look at the data first

In [4]:
train_data.describe()

Unnamed: 0,Survived,Pclass,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
count,577.0,577.0,577.0,577.0,577.0,577.0
mean,0.39688,2.310225,29.336222,0.54766,0.376083,31.942309
std,0.489675,0.834863,14.208585,1.114098,0.778832,46.816156
min,0.0,1.0,0.75,0.0,0.0,0.0
25%,0.0,2.0,21.0,0.0,0.0,7.925
50%,0.0,3.0,28.0,0.0,0.0,14.5
75%,1.0,3.0,37.0,1.0,0.0,30.5
max,1.0,3.0,71.0,8.0,5.0,512.3292


### From the data given, we know that "Survived" class is the thing that we need to predict but we also know, that not all the other columns would be contributing to that <br>

### For instance, Although "Pclass(Passenger class)" Plays an important role in deciding the survival rate, "Fare" Doesn't. Also, "Sex" also plays an important role but it is in a different dtype

In [5]:
train_data["Sex"].dtype #Shows Object and is in str format, we need to convert it into integer

dtype('O')

In [6]:
#Simple enough, we can represent males as 0, females as 1, I am not sure if representing them by any 
#other number makes a difference but we will see.
train_data["Sex"]=(train_data["Sex"].replace("female",1)).replace("male",0)

#### So everything except "Fare" ,"Survived" and "Name" seems appropriate for our training data


In [8]:
def Normalize(data):
    return (data-data.mean())/data.std()

In [78]:
def get_data(train,test):
    X_train=Normalize(train.drop(["Survived","Fare","Name"],axis=1))
    Y_train=Normalize(train["Survived"])
    X_test=Normalize(test.drop(["Survived","Fare","Name"],axis=1))
    Y_test=Normalize(test["Survived"])
    
    #Adding a column for the Bias term
    X_train.insert(0,"Ones",np.ones(len(X_train)))
    X_test.insert(0,"Ones",np.ones(len(X_test)))
    return X_train,Y_train,X_test,Y_test

## The reason Normalization is preffered in deep learning, is because with Normalized data, the derivatives converge faster.

In [10]:
#Activation function
def sigmoid(z):
    return 1/(1+np.exp(-z))
#So what the sigmoid function does is take whatever the output was and map in the range(0,1), which 
#very crucial in case of Logistic Regression, as it allows our final result be interpreted as a 
#probability

def log_loss(y_pred,y_true):
    return -np.mean(y_true*np.log(y_pred)+(1-y_true)*np.log(1-y_pred))

In [80]:
def Logistic_Reg(X,Y,epochs,lr):
    num_samples,num_features=X.shape
    weights=np.random.randn(num_features)
    for epoch in range(epochs):
        z=X@weights
        act=sigmoid(z)
        loss=log_loss(act,Y_train)
        gradients=((act-Y_train)@X) * (1.0/num_samples)
        weights-=lr*gradients
    return weights    

def accuracy(Y_pred,Y_true):
    return sum(Y_pred==Y_true)/len(Y_true)

def train(X,Y,epochs,lr):
    weights=Logistic_Reg(X,Y,epochs,lr)
    return weights

def get_preds(X,weights):
    preds=sigmoid(X@weights)
    preds[preds>=0.5]=1
    preds[preds<0.5]=0
    return preds

## First Iteration, we are training with "female"=1 and "male"=0

In [83]:
tr1=train_data.copy()
ts1=test_data.copy()
tr1["Sex"]=(tr1["Sex"].replace("female",1)).replace("male",0)
ts1["Sex"]=(ts1["Sex"].replace("female",1)).replace("male",0)

In [85]:
X_train,Y_train,X_test,Y_test=get_data(tr1,ts1)

In [90]:
w=train(X_train,Y_train,1000,0.001)
accuracy(get_preds(X_train,w),tr1["Survived"])

0.7019064124783362

In [92]:
w=train(X_train,Y_train,1000,0.1)
accuracy(get_preds(X_train,w),tr1["Survived"])
##Makes sense given the lr is higher

0.7972270363951474

In [94]:
accuracy(get_preds(X_test,w),ts1["Survived"])

0.8064516129032258

## Let's test some variations of the training data, for example let's assign some other numbers to the male and female data
### female=0 , male=1

In [95]:
tr2=train_data.copy()
ts2=test_data.copy()
tr2["Sex"]=(tr2["Sex"].replace("female",0)).replace("male",1)
ts2["Sex"]=(ts2["Sex"].replace("female",0)).replace("male",1)

In [96]:
X_train,Y_train,X_test,Y_test=get_data(tr2,ts2)

In [98]:
w=train(X_train,Y_train,1000,0.1)
accuracy(get_preds(X_train,w),tr2["Survived"])

0.7972270363951474

In [99]:
accuracy(get_preds(X_test,w),ts2["Survived"])

0.8064516129032258

## Didn't change really what about large difference in the values that we assign

In [102]:
tr3=train_data.copy()
ts3=train_data.copy()
tr3["Sex"]=(tr3["Sex"].replace("female",100)).replace("male",50)
ts3["Sex"]=(ts3["Sex"].replace("female",100)).replace("male",50)

In [103]:
X_train,Y_train,X_test,Y_test=get_data(tr3,ts3)

In [113]:
w=train(X_train,Y_train,1000,0.1)
accuracy(get_preds(X_train,w),tr3["Survived"])

0.7972270363951474

In [115]:
accuracy(get_preds(X_test,w),ts3["Survived"])# Not a big change really

0.7972270363951474

## A lil googling tells me I can instead use something known as One-hot encoding which would work something like this:
### There are two Genders here Female and Male,
### So we can represent them with an array of numbers and If a data point says Female, the array looks like [1,0] else it looks like [0,1]. Simple

In [116]:
genders=train_data["Sex"].unique()

In [117]:
(train_data["Sex"][0]==genders).astype(int) #We could bundle this up in a function and map it to the "Sex" Column

array([1, 0])

In [118]:
def tfms(x):
    return (x==genders).astype(int)
list(map(tfms,train_data["Sex"])) ##Hurray Works!!

[array([1, 0]),
 array([0, 1]),
 array([0, 1]),
 array([0, 1]),
 array([1, 0]),
 array([1, 0]),
 array([1, 0]),
 array([1, 0]),
 array([0, 1]),
 array([0, 1]),
 array([0, 1]),
 array([0, 1]),
 array([1, 0]),
 array([1, 0]),
 array([0, 1]),
 array([0, 1]),
 array([1, 0]),
 array([1, 0]),
 array([0, 1]),
 array([0, 1]),
 array([1, 0]),
 array([1, 0]),
 array([0, 1]),
 array([1, 0]),
 array([0, 1]),
 array([0, 1]),
 array([1, 0]),
 array([1, 0]),
 array([0, 1]),
 array([1, 0]),
 array([1, 0]),
 array([0, 1]),
 array([0, 1]),
 array([1, 0]),
 array([1, 0]),
 array([1, 0]),
 array([1, 0]),
 array([1, 0]),
 array([0, 1]),
 array([0, 1]),
 array([0, 1]),
 array([0, 1]),
 array([0, 1]),
 array([0, 1]),
 array([1, 0]),
 array([1, 0]),
 array([0, 1]),
 array([1, 0]),
 array([0, 1]),
 array([1, 0]),
 array([1, 0]),
 array([0, 1]),
 array([0, 1]),
 array([1, 0]),
 array([1, 0]),
 array([0, 1]),
 array([1, 0]),
 array([0, 1]),
 array([1, 0]),
 array([1, 0]),
 array([0, 1]),
 array([1, 0]),
 array([

In [130]:
tr4=train_data.copy()
ts4=train_data.copy()
tr4=pd.get_dummies(tr4,columns=["Sex"])
ts4=pd.get_dummies(ts4,columns=["Sex"])

In [131]:
X_train,Y_train,X_test,Y_test=get_data(tr4,ts4)

In [132]:
X_train.head() #well guess Normalization doesn't work on arrays(Obviously)

Unnamed: 0,Ones,Pclass,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Sex_female,Sex_male
0,1.0,0.826213,-0.516323,0.406014,-0.482881,-0.775715,0.775715
1,1.0,-1.56939,0.609757,0.406014,-0.482881,1.2869,-1.2869
2,1.0,0.826213,-0.234803,-0.491573,-0.482881,1.2869,-1.2869
3,1.0,-1.56939,0.398617,0.406014,-0.482881,1.2869,-1.2869
4,1.0,0.826213,0.398617,-0.491573,-0.482881,-0.775715,0.775715


In [135]:
w=train(X_train,Y_train,1000,0.1)
accuracy(get_preds(X_train,w),tr4["Survived"])
## Slight Slight improvement

0.8058925476603119

In [136]:
accuracy(get_preds(X_test,w),ts4["Survived"])# Not a big change really

0.8058925476603119