## Logistic Regression on Titanic data set

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
data=sns.load_dataset('titanic')
data.shape

(891, 15)

In [3]:
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [4]:
data.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


### Null value Treatment

In [5]:
data.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [6]:
data.drop(['deck'],axis=1,inplace=True)
data['age'].fillna(data['age'].median(),inplace=True)
data['embark_town'].fillna(data['embark_town'].mode()[0],inplace=True)

In [7]:
data['embarked'].fillna(data['embarked'].mode()[0],inplace=True)

In [8]:
data.isna().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64

In [9]:
data.shape

(891, 14)

### Outlier Treatment

In [10]:
col=['age','fare']
for i in col:
    x=np.array(data[i])
    qr1=np.quantile(x,0.25)
    qr3=np.quantile(x,0.75)
    iqr=qr3-qr1
    utv=qr3+(1.5*iqr)
    ltv=qr1-(1.5*iqr)
    lst=[]
    for p in x:
        if p>utv or p<ltv:
            lst.append(np.median(x))
        else:
            lst.append(p)
    data[i]=lst

In [11]:
data.shape

(891, 14)

In [12]:
data.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,28.476992,0.523008,0.381594,17.383622
std,0.486592,0.836071,9.793559,1.102743,0.806057,12.713016
min,0.0,1.0,3.0,0.0,0.0,0.0
25%,0.0,2.0,23.75,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,33.0,1.0,0.0,24.15
max,1.0,3.0,54.0,8.0,6.0,65.0


### Extracting Dependent and iNdependent variables

In [13]:
Y=data['survived']
data.drop(['survived'],axis=1,inplace=True)
X=data
X.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,3,male,22.0,1,0,7.25,S,Third,man,True,Southampton,no,False
1,1,female,38.0,1,0,14.4542,C,First,woman,False,Cherbourg,yes,False
2,3,female,26.0,0,0,7.925,S,Third,woman,False,Southampton,yes,True
3,1,female,35.0,1,0,53.1,S,First,woman,False,Southampton,yes,False
4,3,male,35.0,0,0,8.05,S,Third,man,True,Southampton,no,True


In [14]:
num=[i for i in X.describe().columns]
cat=[i for i in X.columns if i not in num]

### Train test split

In [15]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(X,Y,test_size=0.30,stratify=Y)

### Encoding

#### One hot encoding

In [16]:
cat

['sex',
 'embarked',
 'class',
 'who',
 'adult_male',
 'embark_town',
 'alive',
 'alone']

In [17]:
dummies_tr=pd.get_dummies(xtrain,drop_first=True)
dummies_te=pd.get_dummies(xtest,drop_first=True)
xtrain=pd.concat([xtrain,dummies_tr],axis=1)
xtest=pd.concat([xtest,dummies_te],axis=1)

In [18]:
xtrain.drop(cat,axis=1,inplace=True)
xtest.drop(cat,axis=1,inplace=True)

In [19]:
print(xtrain.shape,xtest.shape)
print(ytrain.shape,ytest.shape)

(623, 20) (268, 20)
(623,) (268,)


### Standard scaling

In [20]:
num

['pclass', 'age', 'sibsp', 'parch', 'fare']

In [21]:
X[num].head()

Unnamed: 0,pclass,age,sibsp,parch,fare
0,3,22.0,1,0,7.25
1,1,38.0,1,0,14.4542
2,3,26.0,0,0,7.925
3,1,35.0,1,0,53.1
4,3,35.0,0,0,8.05


In [22]:
num_cont=['age','fare']
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
sc.fit(xtrain[num_cont])
xtrain[num_cont]=sc.transform(xtrain[num_cont])

sc=StandardScaler()
sc.fit(xtest[num_cont])
xtest[num_cont]=sc.transform(xtest[num_cont])

### Logistic Regression-Base Model

In [23]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()
lr.fit(xtrain,ytrain)
ytrain_pred=lr.predict(xtrain)
ytest_pred=lr.predict(xtest)

In [24]:
from sklearn.metrics import confusion_matrix
confusion_matrix(ytrain,ytrain_pred)

array([[384,   0],
       [  0, 239]], dtype=int64)

In [25]:
confusion_matrix(ytest,ytest_pred)

array([[165,   0],
       [  0, 103]], dtype=int64)

In [26]:
from sklearn.metrics import classification_report
print(classification_report(ytrain, ytrain_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       384
           1       1.00      1.00      1.00       239

    accuracy                           1.00       623
   macro avg       1.00      1.00      1.00       623
weighted avg       1.00      1.00      1.00       623



In [27]:
from sklearn.metrics import classification_report
print(classification_report(ytest, ytest_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       165
           1       1.00      1.00      1.00       103

    accuracy                           1.00       268
   macro avg       1.00      1.00      1.00       268
weighted avg       1.00      1.00      1.00       268

