In [2]:
#Assistant: Eda Jovičić, eda.jovicic@fer.com

# Data handling

In this laboratory exercise, we will show some of the basic data manipulations that are often used in practice. For this purpose, we will use the Titanic data set because it is suitable for demonstrating many data transformations. We will use the random forest algorithm to classify the data so that we can track what impact certain data manipulations have on the classification model. 

In [3]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest, mutual_info_classif

In [4]:
# Be very careful when using this setting! Not recommended for beginners!
import warnings
warnings.filterwarnings('ignore')

### Data loading

In [13]:
# loading data from text file
X = pd.read_csv("titanic.csv")

X

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


### Initial data survey

In [6]:
X.shape

(891, 12)

In [14]:
# display of the training set
X.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [8]:
X.columns.values

array(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype=object)

In [9]:
X.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


### Monotonic attributes
Let's check whether there are any monotonic attributes, i.e. attributes whose values only increase (or only decrease) from one record to the next, such as running identifiers.

In [10]:
X.nunique()

PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64

The nunique function counts unique values per column and can serve as a good indicator of monotonic attributes. In this case, the candidates for a monotonic attribute are PassengerId and Name because each record has a unique value. It is easy to conclude that PassengerId is a monotonic attribute (it simply counts the records) and that Name is not. Let's remove the PassengerId attribute from the data set.

In [15]:
# Drop the running identifier. Rebinding X (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second run is simply a no-op
# instead of raising KeyError because the column is already gone.
X = X.drop(columns=['PassengerId'], errors='ignore')
X.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Classification #1

If we try to make a classification using the current version of the data, an error will occur because the algorithms from the scikit-learn (sklearn) module work exclusively with numeric values.

In [12]:
# function definition that we'll use for classification throughout the whole notebook
def klasificiraj(df, target='Survived', cv=20, test_size=0.33, random_state=42):
    """Train a random forest on ``df`` and print its cross-validated accuracy.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set containing the feature columns and the class-label column.
        All feature columns must be numeric and free of missing values.
    target : str, default 'Survived'
        Name of the class-label column.
    cv : int, default 20
        Number of cross-validation folds.
    test_size : float, default 0.33
        Fraction of the data held out in the illustrative single split.
    random_state : int, default 42
        Seed used for the single split and for the random forest.

    Returns
    -------
    None
        The mean and standard deviation of the fold accuracies are printed.
    """
    # separate class label
    X = df.loc[:, df.columns != target]
    y = df.loc[:, target]

    ######## This is what model training looks like on a single train/test split ########
    # NOTE: this part is illustrative only - its predictions are not used for the
    # reported score, which comes from the cross-validation below.

    # split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # instantiate and train model
    model = RandomForestClassifier(random_state=random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)  # shown for completeness; not evaluated further

    ######## The recommended way to validate a model is using cross-validation ########
    # cross_val_score refits a fresh clone of the model on every fold, so the fit
    # above does not influence these scores
    scores = cross_val_score(model, X, y, cv=cv)

    # evaluate model
    print('Accuracy: ', scores.mean(), ' +- ', scores.std())
   
# call the defined function
klasificiraj(X)

ValueError: could not convert string to float: 'McCarthy, Mr. Timothy J'

The error occurred because the algorithms from the scikit-learn (sklearn) module work exclusively with numeric values. Let's see which of the features are numerical.

In [None]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB


In [None]:
# let's make the classification that works exclusively with numerical features
X_tmp = X.loc[:,['Survived','Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]
klasificiraj(X_tmp)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

### Missing data

Now we have a new data problem - missing data. Let's check how many such data samples there are in the set.

In [None]:
X.isna().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In general, the options for troubleshooting missing values are:
 - replace with a mean value
 - completely remove those records from the dataset
 - completely eliminate these features from the data set
 - interpolation
 - find accurate values from other data sources
 - ...
 
In the case of Age feature: 
 - we will replace the missing years with a mean value
 - removing these records is not an option because we lose too much data (almost 20%)
 - removing this feature is not an option because the feature is too important - children (and women) are often saved first
 - interpolation in this case makes no sense
 - finding accurate values from other data sources is the best option, but we will not do it here for the sake of simplicity of the exercise
 
In the case of Cabin feature: 
 - it is not a numeric value so the mean value is not an option
 - deleting these records is not an option because we lose too much data (77%)
 - removing that feature is what we're going to do here
 - interpolation does not make sense because it is not a numerical value
 - finding accurate values from other data sources is the best option, but we will not do it here for the sake of simplicity of the exercise
 
In the case of Embarked feature (the harbor of embarkment):
 - it is not a numeric value so the mean value is not an option
 - removing these records is what we're going to do here
 - removing this feature makes no sense because we lose the whole feature due to only two examples with missing values
 - interpolation does not make sense because it is not a numerical value
 - finding accurate values from other data sources is the best option, but we will not do it here for the sake of simplicity of the exercise

In [None]:
X_tmp = X.copy()

# insert mean value of age where this information is missing
# (computed before any rows are removed, i.e. over all known ages)
X_tmp.loc[X_tmp.Age.isna(), 'Age'] = X_tmp.loc[:, 'Age'].mean()

# remove the Cabin column
X_tmp = X_tmp.drop(columns=['Cabin'])

# remove records with missing values of Embarked feature
# The result must be bound to X_tmp: the following cells classify and describe
# X_tmp, so assigning it to another name would leave those two records in place.
X_tmp = X_tmp.loc[X_tmp.Embarked.notnull(), :].copy()

In [None]:
# we classify exlusively with numeric features
X_tmp = X_tmp.loc[:,['Survived','Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]
klasificiraj(X_tmp)

Točnost:  0.6886363636363637  +-  0.058314964968449595


We managed to make the first classification with an accuracy ("Točnost") of 68.8%. Let’s look further at the data to see if we can improve that result.

### Outliers

Let's check whether there are outliers in the data set.

In [None]:
X_tmp.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,889.0,889.0,889.0,889.0,889.0,889.0
mean,0.382452,2.311586,29.653446,0.524184,0.382452,32.096681
std,0.48626,0.8347,12.968366,1.103705,0.806761,49.697504
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,22.0,0.0,0.0,7.8958
50%,0.0,3.0,29.699118,0.0,0.0,14.4542
75%,1.0,3.0,35.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


Based on this printout, the candidates for the features with outliers are SibSp, Parch, and Fare. If we look at SibSp and Parch, the maximum values are far above the typical values, but they are not impossible. Let’s take a closer look at the Fare feature.

In [None]:
# count how many passengers paid more than each fare threshold
for fare_limit in (100, 200, 300, 400, 500):
    n_above = (X_tmp.Fare > fare_limit).sum()
    print('Fare > %d: ' % fare_limit, n_above)

Fare > 100:  53
Fare > 200:  20
Fare > 300:  3
Fare > 400:  3
Fare > 500:  3


In [None]:
X_tmp.loc[X_tmp.Fare > 500, 'Fare']

258    512.3292
679    512.3292
737    512.3292
Name: Fare, dtype: float64

These three records are very different from the other records and can be considered as outliers. For now, we will not do anything about it, but you should be aware of these facts when modeling.
NOTE: Visualizations are a good way to detect outliers, and they will be discussed more in the next lab.

### Inconsistent data
Free-text entry often leads to inconsistencies in the data. For example, the title Ms is synonymous with the title Miss. Let’s check which titles exist in our data set (extracting them with a regular expression) and to which sex they belong.

In [None]:
# let's create a new column called Title by using a regular expression
# (raw string: "\." is a regex escape for a literal dot, not a Python string escape;
#  the pattern captures the word that directly precedes the first such dot)
X['Title'] = X.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
# let's print out the frequencies in the new column data by sex
pd.crosstab(X['Title'], X['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


There does not appear to be much inconsistency in this data set when it comes to entering passenger titles. We will transfer the titles Mlle and Ms to the group Miss (3 records in total) and the title Mme to the group Mrs (1 record).
Rarely appearing titles can all be grouped into one group that we will call Rare.

In [None]:
# titles that occur only a handful of times are pooled into a single group 'Rare'
rare_titles = ['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_map = {title: 'Rare' for title in rare_titles}
# synonymous spellings are folded into their canonical title
title_map.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

# one pass over the column applies the whole mapping
X['Title'] = X['Title'].replace(title_map)

Let’s show the survival rate depending on the passenger title.

In [None]:
X[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,Survived
0,Master,0.575
1,Miss,0.702703
2,Mr,0.156673
3,Mrs,0.793651
4,Rare,0.347826


We see that the title has a big impact on the chance of survival, which makes the title an essential feature for prediction. The title is currently a text-shaped feature. Let’s transform the title into a numeric value so we can use it when classifying.

### Sparse data - OneHotEncoder
We will transform the title as a feature using OneHotEncoder, which will result in five new columns (features). For the new columns, each record will have a value of 1 in only one of these five columns, and will have a value of 0 in the remaining four columns. See the example below.

In [None]:
# we define the encoder
encoder = OneHotEncoder(handle_unknown="ignore")
encoder.fit(X[['Title']])
# we transform the data
X_tmp = encoder.transform(X[['Title']])
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement. Fall back only on older installations.
if hasattr(encoder, 'get_feature_names_out'):
    title_columns = encoder.get_feature_names_out(['Title'])
else:
    title_columns = encoder.get_feature_names(['Title'])
# reuse X's index so the encoded rows line up with X when the tables are concatenated
X_tmp = pd.DataFrame(X_tmp.toarray(), columns=title_columns, index=X.index)
X_tmp

Unnamed: 0,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare
0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...
886,0.0,0.0,0.0,0.0,1.0
887,0.0,1.0,0.0,0.0,0.0
888,0.0,1.0,0.0,0.0,0.0
889,0.0,0.0,1.0,0.0,0.0


Let us now merge the resulting sparse table with our original table and see if the new features improve the accuracy of the prediction.

In [None]:
# merge (concatenate) data
# NOTE: run this cell exactly once, directly after the one-hot encoding cell.
# X is rebound to the widened table and X_tmp is rebound further down, so a
# second run would concatenate a different X_tmp onto an already widened X.
X = pd.concat((X, X_tmp), axis=1)

# remove nonnumeric columns - ~ in this case demarks "not", i.e. the condition would be read as "columns not in ['Name', ...]"
X_tmp = X.loc[:, ~X.columns.isin(['Name','Sex', 'Ticket', 'Cabin', 'Embarked', 'Title'])].copy()

# let us reinject the mean value of Age because we have previously calculated it only in a temporary DataFrame
X_tmp.loc[X_tmp.Age.isna(),'Age'] = X_tmp.loc[:,'Age'].mean()

# classify
klasificiraj(X_tmp)

Točnost:  0.8182828282828283  +-  0.0684515055449198


We see that adding the title as a sparse (one-hot encoded) feature resulted in an increase in the accuracy of the model from 68.8% to 81.8%.

### LabelEncoder
So far, we haven’t used the sex feature because it’s in text format. Let's change the feature format using LabelEncoder and repeat the classification.

In [17]:
# initialization of the encoder
le = LabelEncoder()
le.fit(X.loc[:,'Sex'])

# Replace the whole column rather than writing through X.loc[:, 'Sex'] = ...:
# the .loc form sets values in place and, on recent pandas versions, can leave the
# column with its original object dtype instead of the encoder's integer dtype.
X['Sex'] = le.transform(X['Sex'])

In [18]:
X.info()
X.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    int32  
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int32(1), int64(4), object(4)
memory usage: 73.2+ KB


Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


In [None]:
# remove nonnumeric columns - ~ in this case demarks "not", i.e. the condition would be read as "columns not in ['Name', ...]"
X_tmp = X.loc[:, ~X.columns.isin(['Name', 'Ticket', 'Cabin', 'Embarked', 'Title'])].copy()

# let us reinject the mean value of Age because we have previously calculated it only in a temporary DataFrame
X_tmp.loc[X_tmp.Age.isna(),'Age'] = X_tmp.loc[:,'Age'].mean()
    
# classify
klasificiraj(X_tmp)

Točnost:  0.8261363636363637  +-  0.06595268789138127


Adding sex as a feature increased the accuracy of the model from 81.8% to 82.6%. This was also expected because it is known that women were the first to board lifeboats. Let’s check the sex-dependent survival rate.

In [None]:
X[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean()

Unnamed: 0,Sex,Survived
0,0,0.742038
1,1,0.188908


We see that sex separates survivors from non-survivors very well.
  
Let’s do the same thing with the Embarked feature.

In [None]:
# remove records with the missing Embarked value
# (.copy() gives X its own data, so the assignment below does not write into a
#  slice of the previous frame and does not trigger SettingWithCopyWarning)
X = X.loc[X.Embarked.notnull(), :].copy()

# initialize the encoder
le = LabelEncoder()
le.fit(X['Embarked'])

# transform - replace the whole column so it takes the encoder's integer dtype
# instead of being written in place into the existing object column
X['Embarked'] = le.transform(X['Embarked'])

In [None]:
X[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean()

Unnamed: 0,Embarked,Survived
0,0,0.553571
1,1,0.38961
2,2,0.336957


In [None]:
# remove nonnumeric columns - ~ in this case demarks "not", i.e. the condition would be read as "columns not in ['Name', ...]"
X_tmp = X.loc[:, ~X.columns.isin(['Name', 'Ticket', 'Cabin', 'Title'])].copy()

# let us reinject the mean value of Age because we have previously calculated it only in a temporary DataFrame
X_tmp.loc[X_tmp.Age.isna(),'Age'] = X_tmp.loc[:,'Age'].mean()
    
# classify
klasificiraj(X_tmp)

Točnost:  0.8133585858585859  +-  0.05677191719127784


We now have a decrease in accuracy although the probability of survival depends on the Embarked feature. Nevertheless, we will leave the Embarked feature in the data set. For now, our goal is to create as many good features as possible, and in the later stages, a selection of features can be carried out if the need arises.

Let’s rethink the way we filled in the missing values for the Age feature. Now that we have readily available information about a person’s title, we can try to fill in the missing Age values smarter. For example, it is expected that the average Age of a person with the title of Miss will be less than the average Age of a person with the title of Mrs. Let's check.

In [None]:
title_age = X[['Title', 'Age']].groupby(['Title'], as_index=False).mean()
title_age

Unnamed: 0,Title,Age
0,Master,4.574167
1,Miss,21.736486
2,Mr,32.36809
3,Mrs,35.546296
4,Rare,45.545455


We fill in the missing age values in relation to the passenger title and check whether this leads to a further improvement in the accuracy of the model.

In [None]:
# remove nonnumeric columns - ~ in this case demarks "not", i.e. the condition would be read as "columns not in ['Name', ...]"
X_tmp = X.loc[:, ~X.columns.isin(['PassengerId', 'Name','Sex', 'Ticket', 'Cabin', 'Embarked', 'Title'])].copy()

# fill in the missing values
def popuni_nedostajucu_dob(X_tmp, title_means=None):
    """Fill missing ``Age`` values with the mean age of the passenger's title.

    Parameters
    ----------
    X_tmp : pandas.DataFrame
        Table with an ``Age`` column and one indicator column ``Title_<title>``
        (value 1 for passengers holding that title) per title in ``title_means``.
        The table is modified in place.
    title_means : pandas.DataFrame, optional
        Table with the columns ``Title`` and ``Age`` (mean age per title).
        When omitted, the notebook-level ``title_age`` table is used.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in as ``X_tmp``.
    """
    if title_means is None:
        # fall back to the table computed in an earlier cell of the notebook
        title_means = title_age

    for title, mean_age in zip(title_means['Title'], title_means['Age']):
        # only rows of this title whose age is still unknown are touched
        mask = (X_tmp['Title_' + title] == 1) & (X_tmp.Age.isna())
        X_tmp.loc[mask, 'Age'] = mean_age
    return X_tmp

X_tmp = popuni_nedostajucu_dob(X_tmp)
    
# classify
klasificiraj(X_tmp)

Točnost:  0.814570707070707  +-  0.06399176749003999


### Feature engineering
Feature engineering is a process by which, using knowledge about a domain, one tries to select or transform the most important variables (features) from the prepared data set with the aim of successful modeling.
The process we conducted earlier to obtain the passenger title as a feature could be considered feature engineering. Below we show a few more examples.
  
Let’s create a new feature that will discretize age into five categories - AgeBand

In [None]:
# fill in the missing age
# The helper fills Age directly in the frame it is given and returns that same
# frame, so we work on X explicitly instead of through a second name for it.
X = popuni_nedostajucu_dob(X)

# let's create AgeBand (five equal-width age intervals); it is stored on X because
# the following cells read X['AgeBand']
X['AgeBand'] = pd.cut(X['Age'], 5)
# let's show its dependency with Survived
X[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)

Unnamed: 0,AgeBand,Survived
0,"(0.34, 16.336]",0.548077
1,"(16.336, 32.252]",0.39267
2,"(32.252, 48.168]",0.317901
3,"(48.168, 64.084]",0.426471
4,"(64.084, 80.0]",0.090909


In [None]:
# let's apply LabelEncoder to AgeBand
le = LabelEncoder()

# transform - fit and encode in one step and replace the whole column, so the
# result is a plain integer column; writing the codes through
# X.loc[:, 'AgeBand'] = ... would try to store them inside the existing
# categorical (interval) column instead
X['AgeBand'] = le.fit_transform(X['AgeBand'])

Using the AgeBand feature, we will create a new artificial feature called Age * Pclass 

In [None]:
X['Age*Pclass'] = X.AgeBand * X.Pclass

X[['Age*Pclass', 'Survived']].groupby(['Age*Pclass'], as_index=False).mean().sort_values(by='Age*Pclass', ascending=True)

Unnamed: 0,Age*Pclass,Survived
0,0,0.548077
1,1,0.733333
2,2,0.530055
3,3,0.323741
4,4,0.390625
5,6,0.139785
6,8,0.0
7,9,0.111111
8,12,0.0


We will now create a new FamilySize feature that will be the sum of SibSp and Parch increased by 1 (including that person)

In [None]:
# let's create the new feature
X['FamilySize'] = X['SibSp'] + X['Parch'] + 1

# dependency between Survived and the new feature
X[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean().sort_values(by='FamilySize', ascending=True)

Unnamed: 0,FamilySize,Survived
0,1,0.300935
1,2,0.552795
2,3,0.578431
3,4,0.724138
4,5,0.2
5,6,0.136364
6,7,0.333333
7,8,0.0
8,11,0.0


Using the new FamilySize feature, we can create another new IsAlone feature.

In [None]:
# let's create the new feature: 1 for passengers travelling alone, 0 otherwise
travels_alone = X.loc[:, 'FamilySize'] == 1
X['IsAlone'] = travels_alone.astype('int64')

# dependency between Survived and the new feature
survival_by_alone = X[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()
survival_by_alone.sort_values(by='IsAlone', ascending=True)

Unnamed: 0,IsAlone,Survived
0,0,0.50565
1,1,0.300935


In [None]:
# remove nonnumeric columns - ~ in this case demarks "not", i.e. the condition would be read as "columns not in ['Name', ...]"
X_tmp = X.loc[:, ~X.columns.isin(['PassengerId', 'Name','Sex', 'Ticket', 'Cabin', 'Embarked', 'Title'])].copy()

# classify
klasificiraj(X_tmp)

Točnost:  0.8156818181818182  +-  0.06872742322532945


Let us now check the correlations between the features. Note: the correlation matrix display is much better using heatmap functions, but we will not use it here because it is the topic of the next exercise.

In [None]:
# X still holds text columns (Name, Ticket, Cabin, Title); recent pandas versions
# no longer drop them silently in corr(), so restrict to numeric columns explicitly
X.select_dtypes(include='number').corr()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare,AgeBand,Age*Pclass,FamilySize,IsAlone
Survived,1.0,-0.335549,-0.541585,-0.094724,-0.03404,0.083151,0.25529,-0.169718,0.085998,0.334953,-0.547689,0.34087,-0.011611,-0.114969,-0.322618,0.018277,-0.206207
Pclass,-0.335549,1.0,0.127741,-0.340679,0.081656,0.016824,-0.548193,0.164681,0.081547,-0.007761,0.139156,-0.151078,-0.188273,-0.303354,0.378862,0.064221,0.138553
Sex,-0.541585,0.127741,1.0,0.123741,-0.116348,-0.247508,-0.179958,0.11032,0.159612,-0.694744,0.866888,-0.550071,0.0753,0.155596,0.232564,-0.203191,0.306985
Age,-0.094724,-0.340679,0.123741,1.0,-0.267239,-0.195976,0.088094,-0.026133,-0.412006,-0.307473,0.237141,0.178464,0.194984,0.930825,0.607464,-0.280584,0.194706
SibSp,-0.03404,0.081656,-0.116348,-0.267239,1.0,0.414542,0.160887,0.0689,0.349434,0.084446,-0.252201,0.063003,-0.026055,-0.254518,-0.23951,0.890654,-0.584186
Parch,0.083151,0.016824,-0.247508,-0.195976,0.414542,1.0,0.217532,0.040449,0.267194,0.102026,-0.335765,0.225519,-0.059725,-0.200341,-0.180098,0.782988,-0.583112
Fare,0.25529,-0.548193,-0.179958,0.088094,0.160887,0.217532,1.0,-0.226311,0.01139,0.118352,-0.181692,0.105511,0.016645,0.080627,-0.276558,0.218658,-0.274079
Embarked,-0.169718,0.164681,0.11032,-0.026133,0.0689,0.040449,-0.226311,1.0,0.031413,-0.096519,0.101336,-0.036499,-0.029671,-0.049476,0.078341,0.067305,0.062532
Title_Master,0.085998,0.081547,0.159612,-0.412006,0.349434,0.267194,0.01139,0.031413,1.0,-0.11089,-0.255888,-0.087798,-0.035374,-0.371027,-0.330774,0.37235,-0.26684
Title_Miss,0.334953,-0.007761,-0.694744,-0.307473,0.084446,0.102026,0.118352,-0.096519,-0.11089,1.0,-0.602266,-0.206644,-0.083257,-0.315325,-0.30375,0.108698,-0.049521


We see that the FamilySize feature is highly correlated with the SibSp and Parch features (it is their sum plus one), which makes these two features candidates for removal in order to reduce the dimensionality of the data set.
  
We won't remove them for now. We will show more filter methods of feature selection.
  
The first method of feature selection is the use of mutual information. The mutual_info_classif function assigns its "importance" to each feature, and the SelectKBest class selects k = 7 of the most important features. An example is shown below.

In [None]:
# mutual_info_classif adds a small amount of random noise to continuous features,
# so the seed is fixed to make the selected feature set reproducible between runs
kbest = SelectKBest(
    lambda features, target: mutual_info_classif(features, target, random_state=42),
    k=7,
)

y_ = X_tmp.loc[:, 'Survived']
X_ = X_tmp.loc[:, X_tmp.columns != 'Survived']

kbest.fit(X_, y_)
X_.columns[kbest.get_support()].values

array(['Pclass', 'Age', 'Fare', 'Title_Mr', 'Title_Mrs', 'Age*Pclass',
       'FamilySize'], dtype=object)

In [None]:
columns = np.concatenate((X_.columns[kbest.get_support()].values, np.asarray(['Survived'])))
klasificiraj(X_tmp[columns])

Točnost:  0.8077020202020202  +-  0.05387444425988824


The features 'Pclass', 'Age', 'Fare', 'Title_Mr', 'Title_Mrs', 'Age\*Pclass', 'FamilySize' are selected. Accuracy using only these features is slightly lower than before, but the model is simpler and the training time is shorter. In this example, this is not so significant because the number of features is reduced from 15 to 7. On real projects, there are often several thousand features and several million records and a slight decrease in accuracy is considered a reasonable price to pay to significantly reduce training time.

The SelectKBest class receives as an argument a method that ranks features in a data set. Any self-defined method can be used for this purpose. The following example shows the use of the internal feature ranking of the ExtraTreesClassifier algorithm for feature selection.

In [None]:
def calc_extr_scores(X, y, random_state=42):
    """Score features by the importances of a fitted ExtraTreesClassifier.

    Usable as the ``score_func`` of ``SelectKBest``, which calls it as
    ``score_func(X, y)``.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Class labels.
    random_state : int, default 42
        Seed for the randomized trees; fixed so that the feature ranking is the
        same on every run.

    Returns
    -------
    numpy.ndarray
        One importance value per feature column of ``X``.
    """
    cls = ExtraTreesClassifier(random_state=random_state)
    cls.fit(X, y)
    return cls.feature_importances_

kbest = SelectKBest(calc_extr_scores, k=7)
kbest.fit(X_, y_)
X_.columns[kbest.get_support()].values

array(['Pclass', 'Age', 'Fare', 'Title_Miss', 'Title_Mr', 'Title_Mrs',
       'FamilySize'], dtype=object)

In [None]:
columns = np.concatenate((X_.columns[kbest.get_support()].values, np.asarray(['Survived'])))
klasificiraj(X_tmp[columns])

Točnost:  0.8202272727272726  +-  0.06397366814795333


In this case, the reduction in dimensionality led to a small increase in accuracy. There is only one difference in the selected features - instead of 'Age\*Pclass', we now have 'Title_Miss'.