In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
titanic_ds = pd.read_csv('train.csv')
titanic_ds

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29.0,1,0,228414,26.0000,,S,1
1,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.0500,,S,0
2,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0,0,250655,26.0000,,S,0
3,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.0750,,S,0
4,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S,0
...,...,...,...,...,...,...,...,...,...,...,...
663,2,"Ilett, Miss. Bertha",female,17.0,0,0,SO/C 14885,10.5000,,S,1
664,3,"Morrow, Mr. Thomas Rowan",male,,0,0,372622,7.7500,,Q,0
665,3,"Bing, Mr. Lee",male,32.0,0,0,1601,56.4958,,S,1
666,3,"Strandberg, Miss. Ida Sofia",female,22.0,0,0,7553,9.8375,,S,0


In [3]:
print(titanic_ds.columns)

Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'Embarked', 'Survived'],
      dtype='object')


In [32]:
titanic_ds

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Survived,Embarked_Q,Embarked_S,Sex_male
0,2,29.0,1,0,26.0000,1,False,True,False
1,3,29.0,0,0,8.0500,0,False,True,True
2,2,39.0,0,0,26.0000,0,False,True,True
3,3,29.0,0,4,21.0750,0,False,True,False
4,3,25.0,0,0,7.0500,0,False,True,True
...,...,...,...,...,...,...,...,...,...
663,2,17.0,0,0,10.5000,1,False,True,False
664,3,29.0,0,0,7.7500,0,True,False,True
665,3,32.0,0,0,56.4958,1,False,True,True
666,3,22.0,0,0,9.8375,0,False,True,False


In [31]:
#We need to do some basic preprocessing of data like filling up absent data on basis of median or filling up data in String with 0 or 1s
titanic_ds['Age'].fillna(titanic_ds['Age'].median() , inplace = True)
titanic_ds['Embarked'].fillna(titanic_ds['Embarked'].mode()[0] , inplace = True)

#Dropping the cabin column as most of its values are Not available
titanic_ds = titanic_ds.drop(columns=['Cabin'] , inplace = True)

#Converting some of them into indicator/dummy variable which will be easier to process
titanic_ds = pd.get_dummies(titanic_ds , columns = ['Embarked', 'Sex'] , drop_first = True)

#Dropping useless columns
titanic_ds.drop(columns = ['Name' , 'Ticket'] , inplace = True)


In [4]:
titanic_ds
#We are done with preprocessing here

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29.0,1,0,228414,26.0000,,S,1
1,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.0500,,S,0
2,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0,0,250655,26.0000,,S,0
3,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.0750,,S,0
4,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S,0
...,...,...,...,...,...,...,...,...,...,...,...
663,2,"Ilett, Miss. Bertha",female,17.0,0,0,SO/C 14885,10.5000,,S,1
664,3,"Morrow, Mr. Thomas Rowan",male,,0,0,372622,7.7500,,Q,0
665,3,"Bing, Mr. Lee",male,32.0,0,0,1601,56.4958,,S,1
666,3,"Strandberg, Miss. Ida Sofia",female,22.0,0,0,7553,9.8375,,S,0


In [5]:
X = titanic_ds.drop(columns=['Survived'])
Y = titanic_ds['Survived']
X , Y

(     Pclass                                            Name     Sex   Age  \
 0         2   Weisz, Mrs. Leopold (Mathilde Francoise Pede)  female  29.0   
 1         3               Williams, Mr. Howard Hugh "Harry"    male   NaN   
 2         2  Morley, Mr. Henry Samuel ("Mr Henry Marshall")    male  39.0   
 3         3     Palsson, Mrs. Nils (Alma Cornelia Berglund)  female  29.0   
 4         3                          Sutehall, Mr. Henry Jr    male  25.0   
 ..      ...                                             ...     ...   ...   
 663       2                             Ilett, Miss. Bertha  female  17.0   
 664       3                        Morrow, Mr. Thomas Rowan    male   NaN   
 665       3                                   Bing, Mr. Lee    male  32.0   
 666       3                     Strandberg, Miss. Ida Sofia  female  22.0   
 667       3                  Murphy, Miss. Katherine "Kate"  female   NaN   
 
      SibSp  Parch           Ticket     Fare Cabin Embarked  


In [6]:
X_train , X_test , Y_train , Y_test = model_selection.train_test_split(X , Y , test_size = 0.2)
X_train.shape , X_test.shape

((534, 10), (134, 10))

In [38]:
classifier = LogisticRegression(max_iter = 1000)

In [39]:
classifier.fit(X_train , Y_train)

In [45]:
Y_pred = classifier.predict(X_test)
Y_pred

array([1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0], dtype=int64)

In [52]:
accuracy = accuracy_score(Y_test , Y_pred)
conf_matrix = confusion_matrix(Y_test, Y_pred)
classification_rep = classification_report(Y_test, Y_pred)



(0.8283582089552238,
 array([[70, 11],
        [12, 41]], dtype=int64),
 '              precision    recall  f1-score   support\n\n           0       0.85      0.86      0.86        81\n           1       0.79      0.77      0.78        53\n\n    accuracy                           0.83       134\n   macro avg       0.82      0.82      0.82       134\nweighted avg       0.83      0.83      0.83       134\n')

In [53]:
print("Accuracy:", accuracy)
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)

Accuracy: 0.8283582089552238
Confusion Matrix:
[[70 11]
 [12 41]]
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.86      0.86        81
           1       0.79      0.77      0.78        53

    accuracy                           0.83       134
   macro avg       0.82      0.82      0.82       134
weighted avg       0.83      0.83      0.83       134

