In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

%matplotlib inline

#### Importing Libraries

In [2]:
#### Reading a data Set

titanic_data = pd.read_csv('titanic_full_data.csv')

### Checking columns name in dataset

titanic_data.columns


Index(['passenger_id', 'pclass', 'name', 'sex', 'age', 'sibsp', 'parch',
       'ticket', 'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest',
       'survived'],
      dtype='object')

In [3]:
titanic_data.head(2)

Unnamed: 0,passenger_id,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,survived
0,0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,,"St Louis, MO",0.0
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON",1.0


In [4]:
### Getting  dummies colums of required columns

titanic_data = pd.get_dummies(titanic_data,columns=['pclass','embarked','sex'])


titanic_data


Unnamed: 0,passenger_id,name,age,sibsp,parch,ticket,fare,cabin,boat,body,home.dest,survived,pclass_1,pclass_2,pclass_3,embarked_C,embarked_Q,embarked_S,sex_female,sex_male
0,0,"Allen, Miss. Elisabeth Walton",29.0000,0,0,24160,211.3375,B5,2,,"St Louis, MO",0.0,1,0,0,0,0,1,1,0
1,1,"Allison, Master. Hudson Trevor",0.9167,1,2,113781,151.5500,C22 C26,11,,"Montreal, PQ / Chesterville, ON",1.0,1,0,0,0,0,1,0,1
2,2,"Allison, Miss. Helen Loraine",2.0000,1,2,113781,151.5500,C22 C26,,,"Montreal, PQ / Chesterville, ON",0.0,1,0,0,0,0,1,1,0
3,3,"Allison, Mr. Hudson Joshua Creighton",30.0000,1,2,113781,151.5500,C22 C26,,135.0,"Montreal, PQ / Chesterville, ON",0.0,1,0,0,0,0,1,0,1
4,4,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",25.0000,1,2,113781,151.5500,C22 C26,,,"Montreal, PQ / Chesterville, ON",0.0,1,0,0,0,0,1,1,0
5,5,"Anderson, Mr. Harry",48.0000,0,0,19952,26.5500,E12,3,,"New York, NY",1.0,1,0,0,0,0,1,0,1
6,6,"Andrews, Miss. Kornelia Theodosia",63.0000,1,0,13502,77.9583,D7,10,,"Hudson, NY",,1,0,0,0,0,1,1,0
7,7,"Andrews, Mr. Thomas Jr",39.0000,0,0,112050,0.0000,A36,,,"Belfast, NI",,1,0,0,0,0,1,0,1
8,8,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",53.0000,2,0,11769,51.4792,C101,D,,"Bayside, Queens, NY",1.0,1,0,0,0,0,1,1,0
9,9,"Artagaveytia, Mr. Ramon",71.0000,0,0,PC 17609,49.5042,,,22.0,"Montevideo, Uruguay",,1,0,0,1,0,0,0,1


In [5]:
### Dropping useless Columns

titanic_data.drop(['ticket','pclass_1','embarked_C','sex_female','name','passenger_id','home.dest','cabin','boat','body','parch'],axis=1,inplace=True)

titanic_data

Unnamed: 0,age,sibsp,fare,survived,pclass_2,pclass_3,embarked_Q,embarked_S,sex_male
0,29.0000,0,211.3375,0.0,0,0,0,1,0
1,0.9167,1,151.5500,1.0,0,0,0,1,1
2,2.0000,1,151.5500,0.0,0,0,0,1,0
3,30.0000,1,151.5500,0.0,0,0,0,1,1
4,25.0000,1,151.5500,0.0,0,0,0,1,0
5,48.0000,0,26.5500,1.0,0,0,0,1,1
6,63.0000,1,77.9583,,0,0,0,1,0
7,39.0000,0,0.0000,,0,0,0,1,1
8,53.0000,2,51.4792,1.0,0,0,0,1,0
9,71.0000,0,49.5042,,0,0,0,0,1


In [6]:
titanic_data.isnull().sum()

age           263
sibsp           0
fare            1
survived      458
pclass_2        0
pclass_3        0
embarked_Q      0
embarked_S      0
sex_male        0
dtype: int64

In [7]:
titanic_data=titanic_data.fillna(method='ffill')

In [8]:
titanic_data.tail(5)

Unnamed: 0,age,sibsp,fare,survived,pclass_2,pclass_3,embarked_Q,embarked_S,sex_male
1304,14.5,1,14.4542,0.0,0,1,0,0,0
1305,14.5,1,14.4542,0.0,0,1,0,0,0
1306,26.5,0,7.225,0.0,0,1,0,0,1
1307,27.0,0,7.225,0.0,0,1,0,0,1
1308,29.0,0,7.875,0.0,0,1,0,1,1


In [9]:
titanic_data.isnull().sum()

age           0
sibsp         0
fare          0
survived      0
pclass_2      0
pclass_3      0
embarked_Q    0
embarked_S    0
sex_male      0
dtype: int64

In [10]:
Y = titanic_data['survived']
X = titanic_data.drop('survived',axis=1)
X

Unnamed: 0,age,sibsp,fare,pclass_2,pclass_3,embarked_Q,embarked_S,sex_male
0,29.0000,0,211.3375,0,0,0,1,0
1,0.9167,1,151.5500,0,0,0,1,1
2,2.0000,1,151.5500,0,0,0,1,0
3,30.0000,1,151.5500,0,0,0,1,1
4,25.0000,1,151.5500,0,0,0,1,0
5,48.0000,0,26.5500,0,0,0,1,1
6,63.0000,1,77.9583,0,0,0,1,0
7,39.0000,0,0.0000,0,0,0,1,1
8,53.0000,2,51.4792,0,0,0,1,0
9,71.0000,0,49.5042,0,0,0,0,1


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X_train , X_test , Y_train , Y_test = train_test_split(X,Y,test_size = .3)

logreg = LogisticRegression()

logreg.fit(X_train,Y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [12]:
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(916, 8)
(916,)
(393, 8)
(393,)


In [27]:
predictions = logreg.predict(X_test)


predictions




array([0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1.,
       1., 1., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 1., 1., 0., 1., 0., 0., 0., 1., 1., 0., 0., 1., 1.,
       1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 1.,
       0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [14]:
from sklearn.metrics import confusion_matrix,r2_score

print(r2_score(predictions,Y_test))

confusion_matrix(Y_test,predictions)

-0.3288583863408103


array([[217,  28],
       [ 73,  75]], dtype=int64)

In [15]:
from sklearn.metrics import classification_report,accuracy_score
print(accuracy_score(Y_test,predictions))

classification_report(Y_test,predictions)

0.7430025445292621


'              precision    recall  f1-score   support\n\n         0.0       0.75      0.89      0.81       245\n         1.0       0.73      0.51      0.60       148\n\n   micro avg       0.74      0.74      0.74       393\n   macro avg       0.74      0.70      0.70       393\nweighted avg       0.74      0.74      0.73       393\n'