***

#### Downloading the Titanic Competition dataset using the opendatasets python module

In [1]:
!pip install opendatasets --quiet

In [2]:
URL = 'https://www.kaggle.com/c/titanic'

In [3]:
import opendatasets as od

In [5]:
od.download(URL)

Skipping, found downloaded files in "./titanic" (use force=True to force download)


In [6]:
import os

In [7]:
os.listdir('titanic')

['gender_submission.csv', 'test.csv', 'train.csv']

In [8]:
import pandas as pd


In [9]:
train_df = pd.read_csv('titanic/train.csv')
test_df = pd.read_csv('titanic/test.csv')

In [10]:
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [11]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [12]:
train_df.describe(include=['int64', 'object', 'float64'])

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Braund, Mr. Owen Harris",male,,,,347082.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [13]:
train_df.isna().sum().sort_values(ascending=False)

Cabin          687
Age            177
Embarked         2
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
SibSp            0
Parch            0
Ticket           0
Fare             0
dtype: int64

In [14]:
train_df.duplicated().sum()

0

### Feature Engineering

In [15]:
# We drop these features since they have little to no significance in predicting the target variable (survived)

train_df.drop('PassengerId', axis=1, inplace=True)
train_df.drop('Name', axis=1, inplace=True)
train_df.drop('Ticket', axis=1, inplace=True)
train_df.drop('Cabin', axis=1, inplace=True)

In [16]:
# We replace observations (strings) of these features with integers, since machine learning algorithms work with numbers

train_df.Sex = train_df.Sex.replace(['male', 'female'],[1, 0])
train_df.Embarked = train_df.Embarked.replace(['C', 'Q', 'S'],[0, 1, 2])

In [17]:
# We create a new column of the size of each family onboarding the Titanic.

train_df['Family_size'] = train_df['SibSp']+train_df['Parch'] + 1

In [18]:
# We drop this features to avoid the problem of autocorrelation.

train_df.drop('SibSp', axis=1, inplace=True)
train_df.drop('Parch', axis=1, inplace=True)

In [19]:
train_df.corr()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Family_size
Survived,1.0,-0.338481,-0.543351,-0.077221,0.257307,-0.169718,0.016639
Pclass,-0.338481,1.0,0.1319,-0.369226,-0.5495,0.164681,0.065997
Sex,-0.543351,0.1319,1.0,0.093254,-0.182333,0.11032,-0.200988
Age,-0.077221,-0.369226,0.093254,1.0,0.096067,-0.032565,-0.301914
Fare,0.257307,-0.5495,-0.182333,0.096067,1.0,-0.226311,0.217138
Embarked,-0.169718,0.164681,0.11032,-0.032565,-0.226311,1.0,0.067305
Family_size,0.016639,0.065997,-0.200988,-0.301914,0.217138,0.067305,1.0


In [20]:
train_df['AgeBand'] = pd.cut(train_df['Age'], 5)

In [21]:
train_df.AgeBand.unique()

[(16.336, 32.252], (32.252, 48.168], NaN, (48.168, 64.084], (0.34, 16.336], (64.084, 80.0]]
Categories (5, interval[float64, right]): [(0.34, 16.336] < (16.336, 32.252] < (32.252, 48.168] < (48.168, 64.084] < (64.084, 80.0]]

In [22]:
train_df.loc[train_df['Age'] <= 16, 'Age'] = 0
train_df.loc[(train_df['Age'] > 16) & (train_df['Age'] <= 32), 'Age'] = 1
train_df.loc[(train_df['Age'] > 32) & (train_df['Age'] <= 48), 'Age'] = 2
train_df.loc[(train_df['Age'] > 48) & (train_df['Age'] <= 64), 'Age'] = 3
train_df.loc[train_df['Age'] > 64, 'Age']

33     66.0
54     65.0
96     71.0
116    70.5
280    65.0
456    65.0
493    71.0
630    80.0
672    70.0
745    70.0
851    74.0
Name: Age, dtype: float64

In [23]:
train_df = train_df.drop(['AgeBand'], axis=1)

In [24]:
train_df.corr()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Family_size
Survived,1.0,-0.338481,-0.543351,-0.080312,0.257307,-0.169718,0.016639
Pclass,-0.338481,1.0,0.1319,-0.112283,-0.5495,0.164681,0.065997
Sex,-0.543351,0.1319,1.0,0.100093,-0.182333,0.11032,-0.200988
Age,-0.080312,-0.112283,0.100093,1.0,-0.001315,-0.051638,-0.083842
Fare,0.257307,-0.5495,-0.182333,-0.001315,1.0,-0.226311,0.217138
Embarked,-0.169718,0.164681,0.11032,-0.051638,-0.226311,1.0,0.067305
Family_size,0.016639,0.065997,-0.200988,-0.083842,0.217138,0.067305,1.0


In [25]:
# Since the correlation between the Family_size and Survived features is minimal, we will create a new feature IsAlone to categorize passengers into those who travelled as a family and those who travelled alone.

train_df['IsAlone'] = 0
train_df.loc[train_df['Family_size'] == 1, 'IsAlone'] = 1

In [26]:
# We drop this feature to avoid the problem of autocorrelation.

train_df = train_df.drop(['Family_size'], axis=1)

In [27]:
train_df['FareBand'] = pd.cut(train_df['Fare'], 4)

In [28]:
train_df.loc[train_df['Fare'] <= 7.91, 'Fare'] = 0
train_df.loc[(train_df['Fare'] > 7.91) & (train_df['Fare'] <= 14.454), 'Fare'] = 1
train_df.loc[(train_df['Fare'] > 14.454) & (train_df['Fare'] <= 31), 'Fare']   = 2
train_df.loc[train_df['Fare'] > 31, 'Fare'] = 3
train_df['Fare'] = train_df['Fare'].astype(int)

In [29]:
train_df = train_df.drop(['FareBand'], axis=1)

In [30]:
train_df.corr()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,IsAlone
Survived,1.0,-0.338481,-0.543351,-0.080312,0.295875,-0.169718,-0.203367
Pclass,-0.338481,1.0,0.1319,-0.112283,-0.628459,0.164681,0.135207
Sex,-0.543351,0.1319,1.0,0.100093,-0.24894,0.11032,0.303646
Age,-0.080312,-0.112283,0.100093,1.0,0.012365,-0.051638,0.080084
Fare,0.295875,-0.628459,-0.24894,0.012365,1.0,-0.114286,-0.568942
Embarked,-0.169718,0.164681,0.11032,-0.051638,-0.114286,1.0,0.062532
IsAlone,-0.203367,0.135207,0.303646,0.080084,-0.568942,0.062532,1.0


#### We separate our target variable from the input variables...

In [31]:
train_inputs = train_df.columns[1:]

In [32]:
train_inputs = train_df[train_inputs].copy()

In [33]:
train_inputs

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,IsAlone
0,3,1,1.0,0,2.0,0
1,1,0,2.0,3,0.0,0
2,3,0,1.0,1,2.0,1
3,1,0,2.0,3,2.0,0
4,3,1,2.0,1,2.0,1
...,...,...,...,...,...,...
886,2,1,1.0,1,2.0,1
887,1,0,1.0,2,2.0,1
888,3,0,,2,2.0,0
889,1,1,1.0,2,0.0,1


In [34]:
target = train_df.columns[0]

In [35]:
target = train_df[target]

In [36]:
target

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [37]:
numeric_cols = train_inputs.select_dtypes(include=['int64', 'float64']).columns.to_list()

## Imputing Numerical Data

In [38]:
!pip install scikit-learn --upgrade --quiet

In [39]:
from sklearn.impute import SimpleImputer

In [40]:
imputer = SimpleImputer(strategy='median')

In [41]:
imputer.fit(train_inputs[numeric_cols])

In [42]:
train_inputs[numeric_cols] = imputer.transform(train_inputs[numeric_cols])

In [43]:
train_inputs[numeric_cols].isna().sum()

Pclass      0
Sex         0
Age         0
Fare        0
Embarked    0
IsAlone     0
dtype: int64

## Feature Scaling

- Feature scaling is a method used to normalize the range of inputs
variables or features of data.

In [44]:
from sklearn.preprocessing import MinMaxScaler

In [45]:
scaler = MinMaxScaler()

In [46]:
scaler.fit(train_inputs[numeric_cols])

In [47]:
train_inputs[numeric_cols] = scaler.transform(train_inputs[numeric_cols])

In [48]:
train_inputs[numeric_cols].describe().loc[['min', 'max']]

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,IsAlone
min,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [49]:
train_inputs[numeric_cols]

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,IsAlone
0,1.0,1.0,0.0125,0.000000,1.0,0.0
1,0.0,0.0,0.0250,1.000000,0.0,0.0
2,1.0,0.0,0.0125,0.333333,1.0,1.0
3,0.0,0.0,0.0250,1.000000,1.0,0.0
4,1.0,1.0,0.0250,0.333333,1.0,1.0
...,...,...,...,...,...,...
886,0.5,1.0,0.0125,0.333333,1.0,1.0
887,0.0,0.0,0.0125,0.666667,1.0,1.0
888,1.0,0.0,0.0125,0.666667,1.0,0.0
889,0.0,1.0,0.0125,0.666667,0.0,1.0


***