# Fundamentals of Supervised Learning Algorithms

# Chapter 1 - Fundamentals

In [1]:
import pandas as pd
import numpy as np

## Introduction

In [2]:
df = pd.read_csv('titanic.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Cabin,Embarked,Fare,Pclass,Ticket,Age,Name,Parch,Sex,SibSp,Survived
0,0,,S,7.25,3,A/5 21171,22.0,"Braund, Mr. Owen Harris",0,male,1,0.0
1,1,C85,C,71.2833,1,PC 17599,38.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,female,1,1.0
2,2,,S,7.925,3,STON/O2. 3101282,26.0,"Heikkinen, Miss. Laina",0,female,0,1.0
3,3,C123,S,53.1,1,113803,35.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,female,1,1.0
4,4,,S,8.05,3,373450,35.0,"Allen, Mr. William Henry",0,male,0,0.0


### Description for the data

<b>survival:</b> This tells us whether a given person survived (0 = No, 1 = Yes).<br>

<b>pclass:</b> This is a proxy for socio-economic status, where first class is upper, second class is middle, and third class is lower status.<br>

<b>sex:</b> This tells us whether a given person is male or female.<br>

<b>age:</b> This is a fractional value if less than 1; for example, 0.25 is 3 months. If the age is estimated, it is in the form of xx.5.<br>

<b>sibsp:</b> A sibling is defined as a brother, sister, stepbrother, or stepsister, and a spouse is a husband or wife. <br>

<b>parch:</b> A parent is a mother or father, while a child is a daughter, son, stepdaughter, or stepson. Children that traveled only with a nanny did not travel with a parent. Thus, 0 was assigned for this field.<br>

<b>ticket:</b> This gives the person's ticket number.<br>

<b>fare:</b> This is the passenger's fare.<br>

<b>cabin:</b> This tells us the passenger's cabin number.<br>

<b>embarked:</b> The point of embarkation is the location where the passenger boarded the ship.

## Exploring data

In [4]:
# Selecting one column
df['Age'] # or df.Age

0       22.0
1       38.0
2       26.0
3       35.0
4       35.0
        ... 
1304     NaN
1305    39.0
1306    38.5
1307     NaN
1308     NaN
Name: Age, Length: 1309, dtype: float64

In [5]:
# Selecting multiple columns
df[['Name', 'Parch', 'Sex']]

Unnamed: 0,Name,Parch,Sex
0,"Braund, Mr. Owen Harris",0,male
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,female
2,"Heikkinen, Miss. Laina",0,female
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,female
4,"Allen, Mr. William Henry",0,male
...,...,...,...
1304,"Spector, Mr. Woolf",0,male
1305,"Oliva y Ocana, Dona. Fermina",0,female
1306,"Saether, Mr. Simon Sivertsen",0,male
1307,"Ware, Mr. Frederick",0,male


In [6]:
# Selecting one rows
df.iloc[0]

Unnamed: 0                          0
Cabin                             NaN
Embarked                            S
Fare                             7.25
Pclass                              3
Ticket                      A/5 21171
Age                                22
Name          Braund, Mr. Owen Harris
Parch                               0
Sex                              male
SibSp                               1
Survived                            0
Name: 0, dtype: object

In [7]:
# Selecting multiple rows
df.iloc[[0,1,2]]

Unnamed: 0.1,Unnamed: 0,Cabin,Embarked,Fare,Pclass,Ticket,Age,Name,Parch,Sex,SibSp,Survived
0,0,,S,7.25,3,A/5 21171,22.0,"Braund, Mr. Owen Harris",0,male,1,0.0
1,1,C85,C,71.2833,1,PC 17599,38.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,female,1,1.0
2,2,,S,7.925,3,STON/O2. 3101282,26.0,"Heikkinen, Miss. Laina",0,female,0,1.0


In [8]:
# Getting the column names
columns = df.columns
columns

Index(['Unnamed: 0', 'Cabin', 'Embarked', 'Fare', 'Pclass', 'Ticket', 'Age',
       'Name', 'Parch', 'Sex', 'SibSp', 'Survived'],
      dtype='object')

In [9]:
# Selecting columns
df[columns[1:4]]

Unnamed: 0,Cabin,Embarked,Fare
0,,S,7.2500
1,C85,C,71.2833
2,,S,7.9250
3,C123,S,53.1000
4,,S,8.0500
...,...,...,...
1304,,S,8.0500
1305,C105,C,108.9000
1306,,S,7.2500
1307,,S,8.0500


In [10]:
# Getting the number of rows
len(df)

1309

In [11]:
#Get the value of the Fare column in row 2 (row-centric method)
df.iloc[2]['Fare'] #or df.iloc[2].Fare

7.925

In [12]:
#Get the value of the Fare column in row 2 (column-centric method)
df['Fare'][2] #or df.Fare[2]

7.925

In [13]:
# Create a list of names and ages for those under the age of 21
child_passengers = df[df.Age < 21][['Name', 'Age']]
child_passengers.head()

Unnamed: 0,Name,Age
7,"Palsson, Master. Gosta Leonard",2.0
9,"Nasser, Mrs. Nicholas (Adele Achem)",14.0
10,"Sandstrom, Miss. Marguerite Rut",4.0
12,"Saundercock, Mr. William Henry",20.0
14,"Vestrom, Miss. Hulda Amanda Adolfina",14.0


In [14]:
# Count how many child passengers there were
len(child_passengers)

249

In [15]:
# Count how many passengers were between the ages of 21 and 30
young_adult_passengers = df.loc[(df.Age > 21) & (df.Age < 30)]
len(young_adult_passengers)

279

In [16]:
young_adult_passengers.head()

Unnamed: 0.1,Unnamed: 0,Cabin,Embarked,Fare,Pclass,Ticket,Age,Name,Parch,Sex,SibSp,Survived
0,0,,S,7.25,3,A/5 21171,22.0,"Braund, Mr. Owen Harris",0,male,1,0.0
2,2,,S,7.925,3,STON/O2. 3101282,26.0,"Heikkinen, Miss. Laina",0,female,0,1.0
8,8,,S,11.1333,3,347742,27.0,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,female,0,1.0
23,23,A6,S,35.5,1,113788,28.0,"Sloper, Mr. William Thompson",0,male,0,1.0
34,34,,C,82.1708,1,PC 17604,28.0,"Meyer, Mr. Edgar Joseph",0,male,1,0.0


In [17]:
# Find the passengers who were either first- or third-class ticket holders
df.loc[(df.Pclass == 1) | (df.Pclass == 3)]

Unnamed: 0.1,Unnamed: 0,Cabin,Embarked,Fare,Pclass,Ticket,Age,Name,Parch,Sex,SibSp,Survived
0,0,,S,7.2500,3,A/5 21171,22.0,"Braund, Mr. Owen Harris",0,male,1,0.0
1,1,C85,C,71.2833,1,PC 17599,38.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,female,1,1.0
2,2,,S,7.9250,3,STON/O2. 3101282,26.0,"Heikkinen, Miss. Laina",0,female,0,1.0
3,3,C123,S,53.1000,1,113803,35.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,female,1,1.0
4,4,,S,8.0500,3,373450,35.0,"Allen, Mr. William Henry",0,male,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1304,,S,8.0500,3,A.5. 3236,,"Spector, Mr. Woolf",0,male,0,
1305,1305,C105,C,108.9000,1,PC 17758,39.0,"Oliva y Ocana, Dona. Fermina",0,female,0,
1306,1306,,S,7.2500,3,SOTON/O.Q. 3101262,38.5,"Saether, Mr. Simon Sivertsen",0,male,0,
1307,1307,,S,8.0500,3,359309,,"Ware, Mr. Frederick",0,male,0,


In [18]:
# Find the passengers who were not holders of either first- or third-class tickets
df.loc[~((df.Pclass == 1) | (df.Pclass == 3))]

Unnamed: 0.1,Unnamed: 0,Cabin,Embarked,Fare,Pclass,Ticket,Age,Name,Parch,Sex,SibSp,Survived
9,9,,C,30.0708,2,237736,14.0,"Nasser, Mrs. Nicholas (Adele Achem)",0,female,1,1.0
15,15,,S,16.0000,2,248706,55.0,"Hewlett, Mrs. (Mary D Kingcome)",0,female,0,1.0
17,17,,S,13.0000,2,244373,,"Williams, Mr. Charles Eugene",0,male,0,1.0
20,20,,S,26.0000,2,239865,35.0,"Fynney, Mr. Joseph J",0,male,0,0.0
21,21,D56,S,13.0000,2,248698,34.0,"Beesley, Mr. Lawrence",0,male,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1278,1278,,S,13.0000,2,244346,57.0,"Ashby, Mr. John",0,male,0,
1284,1284,,S,10.5000,2,C.A. 30769,47.0,"Gilbert, Mr. William",0,male,0,
1292,1292,,S,21.0000,2,28664,38.0,"Gale, Mr. Harry",0,male,1,
1296,1296,D38,C,13.8625,2,SC/PARIS 2166,20.0,"Nourney, Mr. Alfred (Baron von Drachstedt"")""",0,male,0,


In [19]:
# Deleting the 'Unnamed: 0' column
del df['Unnamed: 0']
df.head()

Unnamed: 0,Cabin,Embarked,Fare,Pclass,Ticket,Age,Name,Parch,Sex,SibSp,Survived
0,,S,7.25,3,A/5 21171,22.0,"Braund, Mr. Owen Harris",0,male,1,0.0
1,C85,C,71.2833,1,PC 17599,38.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,female,1,1.0
2,,S,7.925,3,STON/O2. 3101282,26.0,"Heikkinen, Miss. Laina",0,female,0,1.0
3,C123,S,53.1,1,113803,35.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,female,1,1.0
4,,S,8.05,3,373450,35.0,"Allen, Mr. William Henry",0,male,0,0.0


In [20]:
# Getting statistics about the data
df.describe()

Unnamed: 0,Fare,Pclass,Age,Parch,SibSp,Survived
count,1308.0,1309.0,1046.0,1309.0,1309.0,891.0
mean,33.295479,2.294882,29.881138,0.385027,0.498854,0.383838
std,51.758668,0.837836,14.413493,0.86556,1.041658,0.486592
min,0.0,1.0,0.17,0.0,0.0,0.0
25%,7.8958,2.0,21.0,0.0,0.0,0.0
50%,14.4542,3.0,28.0,0.0,0.0,0.0
75%,31.275,3.0,39.0,0.0,1.0,1.0
max,512.3292,3.0,80.0,9.0,8.0,1.0


In [21]:
df.count()

Cabin        295
Embarked    1307
Fare        1308
Pclass      1309
Ticket      1309
Age         1046
Name        1309
Parch       1309
Sex         1309
SibSp       1309
Survived     891
dtype: int64

In [22]:
df.describe(include='all')

Unnamed: 0,Cabin,Embarked,Fare,Pclass,Ticket,Age,Name,Parch,Sex,SibSp,Survived
count,295,1307,1308.0,1309.0,1309,1046.0,1309,1309.0,1309,1309.0,891.0
unique,186,3,,,929,,1307,,2,,
top,C23 C25 C27,S,,,CA. 2343,,"Kelly, Mr. James",,male,,
freq,6,914,,,11,,2,,843,,
mean,,,33.295479,2.294882,,29.881138,,0.385027,,0.498854,0.383838
std,,,51.758668,0.837836,,14.413493,,0.86556,,1.041658,0.486592
min,,,0.0,1.0,,0.17,,0.0,,0.0,0.0
25%,,,7.8958,2.0,,21.0,,0.0,,0.0,0.0
50%,,,14.4542,3.0,,28.0,,0.0,,0.0,0.0
75%,,,31.275,3.0,,39.0,,0.0,,1.0,1.0


In [23]:
# Using groupby
embarked_grouped = df.groupby('Embarked')
print(f'There are {len(embarked_grouped)} Embarked groups')

There are 3 Embarked groups


In [24]:
# what is exactly groupby method does?
#It create a dictionary where the keys are the groups and the values are the rows that belong to the group
embarked_grouped.groups

{'C': Int64Index([   1,    9,   19,   26,   30,   31,   34,   36,   39,   42,
             ...
             1260, 1262, 1266, 1288, 1293, 1295, 1296, 1298, 1305, 1308],
            dtype='int64', length=270),
 'Q': Int64Index([   5,   16,   22,   28,   32,   44,   46,   47,   82,  109,
             ...
             1206, 1249, 1271, 1272, 1279, 1287, 1290, 1299, 1301, 1302],
            dtype='int64', length=123),
 'S': Int64Index([   0,    2,    3,    4,    6,    7,    8,   10,   11,   12,
             ...
             1289, 1291, 1292, 1294, 1297, 1300, 1303, 1304, 1306, 1307],
            dtype='int64', length=914)}

In [25]:
# we can iterate through embarked_grouped and execute computations on the individual groups
for name, group in embarked_grouped:
    print(name, group.Age.mean())

C 32.33216981132075
Q 28.63
S 29.245204603580564


In [26]:
# Using the agg method to determine the mean of each group
embarked_grouped.agg(np.mean)

Unnamed: 0_level_0,Fare,Pclass,Age,Parch,SibSp,Survived
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C,62.336267,1.851852,32.33217,0.37037,0.4,0.553571
Q,12.409012,2.894309,28.63,0.113821,0.341463,0.38961
S,27.418824,2.347921,29.245205,0.426696,0.550328,0.336957


In [27]:
# We can also pass multiple functions to agg via a list to apply the functions across the dataset
embarked_grouped.agg([lambda x: x.values[0], np.mean, np.std])

Unnamed: 0_level_0,Fare,Fare,Fare,Pclass,Pclass,Pclass,Age,Age,Age,Parch,Parch,Parch,SibSp,SibSp,SibSp,Survived,Survived,Survived
Unnamed: 0_level_1,<lambda_0>,mean,std,<lambda_0>,mean,std,<lambda_0>,mean,std,<lambda_0>,mean,std,<lambda_0>,mean,std,<lambda_0>,mean,std
Embarked,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
C,71.2833,62.336267,84.185996,1,1.851852,0.936802,38.0,32.33217,15.258092,0,0.37037,0.670579,1,0.4,0.554803,1.0,0.553571,0.498608
Q,8.4583,12.409012,13.616133,3,2.894309,0.380099,,28.63,15.045784,0,0.113821,0.531056,0,0.341463,0.885487,0.0,0.38961,0.49086
S,7.25,27.418824,37.096402,3,2.347921,0.784126,22.0,29.245205,14.047507,0,0.426696,0.943989,1,0.550328,1.161723,0.0,0.336957,0.473037


In [28]:
# Apply numpy.sum to the Fare column and the Lambda function to the Age column by passing agg a dictionary where the
# keys are the columns to apply the function to, and the values are the functions themselves to be able to apply
# different functions to different columns in the DataFrame
embarked_grouped.agg({
    'Fare': np.sum,
    'Age': lambda x: x.values[0]
})

Unnamed: 0_level_0,Fare,Age
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1
C,16830.7922,38.0
Q,1526.3085,
S,25033.3862,22.0


In [29]:
# executing the groupby method using more than one column
age_embarked_grouped = df.groupby(['Sex', 'Embarked'])
age_embarked_grouped.groups

{('male',
  'S'): Int64Index([   0,    4,    6,    7,   12,   13,   17,   20,   21,   23,
             ...
             1283, 1284, 1285, 1289, 1292, 1294, 1297, 1304, 1306, 1307],
            dtype='int64', length=623),
 ('female',
  'C'): Int64Index([   1,    9,   19,   31,   39,   43,   52,  111,  114,  128,
             ...
             1238, 1241, 1252, 1255, 1259, 1262, 1266, 1288, 1293, 1305],
            dtype='int64', length=113),
 ('female',
  'S'): Int64Index([   2,    3,    8,   10,   11,   14,   15,   18,   24,   25,
             ...
             1265, 1267, 1273, 1274, 1276, 1282, 1286, 1291, 1300, 1303],
            dtype='int64', length=291),
 ('male',
  'Q'): Int64Index([   5,   16,   46,  116,  126,  143,  171,  188,  196,  214,  245,
              260,  278,  280,  301,  364,  388,  411,  421,  428,  459,  468,
              510,  517,  525,  552,  560,  613,  626,  629,  703,  718,  749,
              768,  776,  778,  787,  790,  825,  828,  890,  891,  893,  907,


## Missing Data

In [30]:
# counting NA values
df.agg(lambda x: x.isna().sum())

Cabin       1014
Embarked       2
Fare           1
Pclass         0
Ticket         0
Age          263
Name           0
Parch          0
Sex            0
SibSp          0
Survived     418
dtype: int64

In [31]:
# Dropping rows with NA values in the 'Fare' and 'Embarked' columns
df_valid = df.loc[(~df.Fare.isna()) & (~df.Embarked.isna())]

In [32]:
# fill in the missing age values with the mean age for the dataset
df_valid[['Age']] = df_valid[['Age']].fillna(df_valid.Age.mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [33]:
# Checking the ages for each class
print('Class 1:', round(df_valid.loc[df.Pclass == 1, 'Age'].mean(), 2))
print('Class 2:', round(df_valid.loc[df.Pclass == 2, 'Age'].mean(), 2))
print('Class 3:', round(df_valid.loc[df.Pclass == 3, 'Age'].mean(), 2))

Class 1: 37.96
Class 2: 29.52
Class 3: 26.23


In [34]:
# What if we were to consider the sex of the person as well as ticket class (social status)?
#Do the average ages differ here too?
for name, grp in df_valid.groupby(['Pclass', 'Sex']):
    print('%i' % name[0], name[1], '%0.2f' % grp['Age'].mean())

1 female 36.30
1 male 39.27
2 female 27.56
2 male 30.74
3 female 24.45
3 male 27.02


In [35]:
# This combination of sex and class provides more resolution than simply filling in all missing fields with the mean age
mean_ages = df_valid.groupby(['Pclass', 'Sex'])['Age'].transform(lambda x: x.fillna(x.mean()))
df_valid.loc[:, 'Age'] = mean_ages

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


## Class Imbalande

In [36]:
print(f'People who not survided: {len(df.loc[df.Survived == 0])}')
print(f'People who survided: {len(df.loc[df.Survived == 1])}')

People who not survided: 549
People who survided: 342
