# 0.Load Data

## 0.1 Import Libraries

In [25]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import datasets

## 0.2 Load Dataset

In [26]:
df=sns.load_dataset('titanic')
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


# 1. Explore Data

## 1.1 Basic information about data

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


## 1.2 Check for Null Values

In [28]:
df.isnull().sum()

Unnamed: 0,0
survived,0
pclass,0
sex,0
age,177
sibsp,0
parch,0
fare,0
embarked,2
class,0
who,0


# 2. Data Cleaning

## 2.1 Remove unwanted columns

In [29]:
columns_to_drop=['deck','class','who','adult_male','embark_town','alone','alive']
df=df.drop(columns_to_drop,axis=1)
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


## 2.2 Deal with Null Values

In [30]:
df['age']=df.groupby(['sex','pclass'])['age'].transform(lambda x: x.fillna(x.mean()))

In [31]:
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,male,22.00,1,0,7.2500,S
1,1,1,female,38.00,1,0,71.2833,C
2,1,3,female,26.00,0,0,7.9250,S
3,1,1,female,35.00,1,0,53.1000,S
4,0,3,male,35.00,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.00,0,0,13.0000,S
887,1,1,female,19.00,0,0,30.0000,S
888,0,3,female,21.75,1,2,23.4500,S
889,1,1,male,26.00,0,0,30.0000,C


# 3. Logistic Regression

## 3.1 Define Logistic Regression Class

In [32]:
class LogisticRegression:
  # constructor
  # default learning rate is 0.01
  # default iterations are 1000
    def __init__(self, lr=0.01, n_iters=3000):
      self.lr = lr
      self.n_iters = n_iters
      self.weights = None
      self.bias = None

    def fit(self,X,y):
      n_samples,n_features=X.shape
      self.weights=np.random.rand(n_features) #weight is initialized to random number
      self.bias = 0 # bias initilized to 0

      for i in range(self.n_iters):

            # y_pred shape should be N, 1
            # predicting value such that y = m*X + b
            linear_model = np.dot(X, self.weights) + self.bias
            y_pred=self.sigmoid(linear_model)

            # gradient of the cost function with respect to the weights
            dw = (1 / n_samples) * np.dot(X.T, y_pred - y)

            # gradient of the cost function with respect to the bias
            db = (1 / n_samples) * np.sum(y_pred - y)

            # decrease the weight by error cost
            # lr make sure that we take smaller steps
            self.weights = self.weights - self.lr * dw
            self.bias = self.bias - self.lr*db

      return self

    def sigmoid(self,z):
      return 1/(1+np.exp(-z))

    def predict(self,X):
      linear_model=np.dot(X,self.weights)+self.bias#linear regression
      y_pred=self.sigmoid(linear_model)#apply sigmoid function to linear regression
      y_pred=[1 if s>0.5 else 0 for s in y_pred]
      return y_pred

## 3.2 Prepare the Data for Modelling

Convert Categorical 'sex' column to numerical

In [33]:
df['sex']=[0 if s=='male' else 1 for s in df['sex'] ]

In [34]:
df= pd.get_dummies(df, columns=['embarked'], drop_first=True)

In [35]:
df['embarked_Q']=[0 if s==False else 1 for s in df['embarked_Q'] ]
df['embarked_S']=[0 if s==False else 1 for s in df['embarked_S'] ]

In [36]:
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked_Q,embarked_S
0,0,3,0,22.00,1,0,7.2500,0,1
1,1,1,1,38.00,1,0,71.2833,0,0
2,1,3,1,26.00,0,0,7.9250,0,1
3,1,1,1,35.00,1,0,53.1000,0,1
4,0,3,0,35.00,0,0,8.0500,0,1
...,...,...,...,...,...,...,...,...,...
886,0,2,0,27.00,0,0,13.0000,0,1
887,1,1,1,19.00,0,0,30.0000,0,1
888,0,3,1,21.75,1,2,23.4500,0,1
889,1,1,0,26.00,0,0,30.0000,0,0


Now data is ready for regression

## 3.3 Data Modelling

Define features and target

In [37]:
X=df.drop(['survived','parch'],axis=1)
y=df['survived']

split training and testing data

In [38]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=40)

In [39]:
model=LogisticRegression()
model.fit(X_train,y_train)

<__main__.LogisticRegression at 0x7f8ea10b5a90>

In [40]:
predicted=model.predict(X_test)

In [41]:
y_test.shape

(179,)

In [42]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.90      0.58      0.71       103
           1       0.62      0.91      0.73        76

    accuracy                           0.72       179
   macro avg       0.76      0.75      0.72       179
weighted avg       0.78      0.72      0.72       179



In [43]:
from sklearn.linear_model import LogisticRegression

logistic_regression = LogisticRegression()
logistic_regression .fit(X_train, y_train)
preds = logistic_regression .predict(X_test)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [44]:
from sklearn.metrics import classification_report
print(classification_report(y_test, preds))


              precision    recall  f1-score   support

           0       0.84      0.84      0.84       103
           1       0.79      0.78      0.78        76

    accuracy                           0.82       179
   macro avg       0.81      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179

