# Classification Benchmark Model

In [1]:
import pandas as pd
import numpy as np

In [2]:
titanic = pd.read_csv('../titanic.csv')
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
titanic.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### Shuffling and Creating Train and Test Set

In [6]:
shuffled_indices = np.random.permutation(titanic.shape[0])

# identify division
test_size = int(titanic.shape[0] * 0.2)

# 80% percent for train and 20% for test
train = titanic.iloc[shuffled_indices[:4 * test_size]]
test = titanic.iloc[shuffled_indices[4 * test_size:]]

titanic.shape, train.shape, test.shape

((891, 12), (712, 12), (179, 12))

### Accuracy Using Mode

In [7]:
survived_mode = train.Survived.mode()[0]
survived_mode

0

In [8]:
test.insert(0, 'Survived_Mode', survived_mode)
test.head()

Unnamed: 0,Survived_Mode,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
473,0,474,1,2,"Jerwan, Mrs. Amin S (Marie Marthe Thuillard)",female,23.0,0,0,SC/AH Basle 541,13.7917,D,C
318,0,319,1,1,"Wick, Miss. Mary Natalie",female,31.0,0,2,36928,164.8667,C7,S
247,0,248,1,2,"Hamalainen, Mrs. William (Anna)",female,24.0,0,2,250649,14.5,,S
839,0,840,1,1,"Marechal, Mr. Pierre",male,,0,0,11774,29.7,C47,C
731,0,732,0,3,"Hassan, Mr. Houssein G N",male,11.0,0,0,2699,18.7875,,C


In [9]:
from sklearn.metrics import accuracy_score


survived_mode_accuracy = accuracy_score(test.Survived, test.Survived_Mode)
survived_mode_accuracy

0.6089385474860335

### Look For Correlations

In [10]:
corr_matrix = titanic.corr()
corr_matrix['Survived'].sort_values(ascending=False)

Survived       1.000000
Fare           0.257307
Parch          0.081629
PassengerId   -0.005007
SibSp         -0.035322
Age           -0.077221
Pclass        -0.338481
Name: Survived, dtype: float64

### Mode Based on Gender

In [11]:
gender_mode_crosstab = pd.crosstab(train.Survived, train.Sex)
gender_mode_crosstab

Sex,female,male
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,64,376
1,186,86


In [12]:
mode_by_gender = train.groupby('Sex')['Survived'].apply(lambda x: x.mode()[0])
mode_by_gender

Sex
female    1
male      0
Name: Survived, dtype: int64

In [19]:
pd.options.mode.chained_assignment = None # SettingWithCopy warning

test['Gender_Mode'] = test['Sex'].map(mode_by_gender)
test[['Survived', 'Sex', 'Gender_Mode']].head(7)

Unnamed: 0,Survived,Sex,Gender_Mode
473,1,female,1
318,1,female,1
247,1,female,1
839,1,male,0
731,0,male,0
629,0,male,0
66,1,female,1


In [20]:
gender_accuracy = accuracy_score(test.Survived, test.Gender_Mode)
gender_accuracy

0.776536312849162