# 0 - Import operations

In [320]:
import pandas as pd

In [321]:
# Visualisation
pd.set_option('display.max_columns', None) # Show all columns
pd.set_option('display.max_rows', None) # Show all rows

# 1 - Data import 

In [322]:
dataTrain = pd.read_csv('../input/titanic/train.csv')
dataTest = pd.read_csv('../input/titanic/test.csv')

dataTrain.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [323]:
X_train = pd.concat([dataTrain, dataTest])

# 2 - Data transformation

## Null values

In [324]:
# Column with null values in X_train
X_train_sum = X_train.isnull().sum()
#Percentage of null values in X_train
X_train_percent = X_train.isnull().sum()/X_train.isnull().count()*100
# Datatype of X_train
X_train_dtypes = X_train.dtypes

#Concatenating X_train_sum, X_train_percent and X_train_dtypes
missing_data = pd.concat([X_train_sum, X_train_percent, X_train_dtypes], axis=1, keys=['Null values', '%', 'Types'])
missing_data

Unnamed: 0,Null values,%,Types
PassengerId,0,0.0,int64
Survived,418,31.932773,float64
Pclass,0,0.0,int64
Name,0,0.0,object
Sex,0,0.0,object
Age,263,20.091673,float64
SibSp,0,0.0,int64
Parch,0,0.0,int64
Ticket,0,0.0,object
Fare,1,0.076394,float64


## Unique values

In [325]:
# Unique values in X_train
X_train_unique = X_train.nunique()
X_train_unique

PassengerId    1309
Survived          2
Pclass            3
Name           1307
Sex               2
Age              98
SibSp             7
Parch             8
Ticket          929
Fare            281
Cabin           186
Embarked          3
dtype: int64

In [326]:
#Unique values in Survived, Pclass, Sex, SibSp, Parch, Embarked
print(X_train['Survived'].unique()) # Print unique values of Survived column
print(X_train['Pclass'].unique()) # Print unique values of Pclass column
print(X_train['Sex'].unique()) # Print unique values of Sex column
print(X_train['SibSp'].unique()) # Print unique values of SibSp column
print(X_train['Parch'].unique()) # Print unique values of Parch column
print(X_train['Embarked'].unique()) # Print unique values of Embarked column


[ 0.  1. nan]
[3 1 2]
['male' 'female']
[1 0 3 4 2 5 8]
[0 1 2 5 3 4 6 9]
['S' 'C' 'Q' nan]


## OHE for Sex Feature

In [327]:
#One hot enconding for Sex column
ohe_sex = pd.get_dummies(X_train['Sex'], prefix='Sex').astype(bool).astype(int)
X_train.drop('Sex', axis=1, inplace=True)
# Concatenate ohe_sex with X_train
X_train = pd.concat([X_train, ohe_sex], axis=1)
X_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_female,Sex_male
0,1,0.0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,S,0,1
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C,1,0
2,3,1.0,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,S,1,0
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,S,1,0
4,5,0.0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,S,0,1


# OHE for Embarked feature

In [328]:
#One hot enconding for Embarked column
ohe_embarked = pd.get_dummies(X_train['Embarked'], prefix='Embarked').astype(bool).astype(int)
X_train.drop('Embarked', axis=1, inplace=True)
# Concatenate ohe_embarked with X_train
X_train = pd.concat([X_train, ohe_embarked], axis=1)
X_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0.0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,0,1,0,0,1
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,1,0,1,0,0
2,3,1.0,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,1,0,0,0,1
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,1,0,0,0,1
4,5,0.0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,0,1,0,0,1


# Imputation for Cabin

In [329]:
# Fill null values in Cabin column with 1
X_train['Cabin'].fillna(1, inplace=True)

# Get values of Cabin after first character
CabinNumber = X_train['Cabin'].str[1:]
CabinNumber.fillna(1, inplace=True)
X_train = pd.concat([X_train, CabinNumber], axis=1)

# Change CabinNumber column name
columnas = list(X_train.columns)
columnas[-1] = 'CabinNumber'
X_train.columns = columnas

X_train.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,CabinNumber
0,1,0.0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,1,0,1,0,0,1,1
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,1,0,1,0,0,85
2,3,1.0,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,1,1,0,0,0,1,1
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,1,0,0,0,1,123
4,5,0.0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,1,0,1,0,0,1,1


In [330]:
from sklearn.impute import SimpleImputer

NanImputer = SimpleImputer(strategy = 'constant', fill_value=1)
mean_imputer= SimpleImputer(strategy='mean')


#Training Dataset
X_train = X_train.replace({'Cabin': r'A\w*'}, {'Cabin': 'A'}, regex=True) 
X_train = X_train.replace({'Cabin': r'B\w*'}, {'Cabin': 'B'}, regex=True)
X_train = X_train.replace({'Cabin': r'C\w*'}, {'Cabin': 'C'}, regex=True)
X_train = X_train.replace({'Cabin': r'D\w*'}, {'Cabin': 'D'}, regex=True)
X_train = X_train.replace({'Cabin': r'E\w*'}, {'Cabin': 'E'}, regex=True)
X_train = X_train.replace({'Cabin': r'F\w*'}, {'Cabin': 'F'}, regex=True)
X_train = X_train.replace({'Cabin': r'G\w*'}, {'Cabin': 'G'}, regex=True)
X_train = X_train.replace({'Cabin': r'T\w*'}, {'Cabin': 'T'}, regex=True)

X_train[['Cabin']]=NanImputer.fit_transform(X_train[['Cabin']])
X_train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,CabinNumber
0,1,0.0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,1,0,1,0,0,1,1
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C,1,0,1,0,0,85
2,3,1.0,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,1,1,0,0,0,1,1
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C,1,0,0,0,1,123
4,5,0.0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,1,0,1,0,0,1,1
5,6,0.0,3,"Moran, Mr. James",,0,0,330877,8.4583,1,0,1,0,1,0,1
6,7,0.0,1,"McCarthy, Mr. Timothy J",54.0,0,0,17463,51.8625,E,0,1,0,0,1,46
7,8,0.0,3,"Palsson, Master. Gosta Leonard",2.0,3,1,349909,21.075,1,0,1,0,0,1,1
8,9,1.0,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",27.0,0,2,347742,11.1333,1,1,0,0,0,1,1
9,10,1.0,2,"Nasser, Mrs. Nicholas (Adele Achem)",14.0,1,0,237736,30.0708,1,1,0,1,0,0,1


In [331]:
#Training DATA

ohe_Cabin = pd.get_dummies(X_train['Cabin'], prefix='ohe_Cabin')
X_train.drop('Cabin', axis=1, inplace=True)
X_train = pd.concat([X_train, ohe_embarked, ohe_Cabin], axis=1)


## Age Feature

In [332]:
# Mean imputation for Age feature - DATA
X_train[['Age']]=mean_imputer.fit_transform(X_train[['Age']])

## Fair Feature

In [333]:
# Mean imputation for Fair feature - DATA
X_train[['Fare']]=mean_imputer.fit_transform(X_train[['Fare']])

In [334]:
print(" \nCount total NaN at each column in a DataFrame : \n\n",
      X_train.isnull().sum())
X_train.shape

 
Count total NaN at each column in a DataFrame : 

 PassengerId            0
Survived             418
Pclass                 0
Name                   0
Age                    0
SibSp                  0
Parch                  0
Ticket                 0
Fare                   0
Sex_female             0
Sex_male               0
Embarked_C             0
Embarked_Q             0
Embarked_S             0
CabinNumber            0
Embarked_C             0
Embarked_Q             0
Embarked_S             0
ohe_Cabin_1            0
ohe_Cabin_A            0
ohe_Cabin_B            0
ohe_Cabin_B B          0
ohe_Cabin_B B B        0
ohe_Cabin_B B B B      0
ohe_Cabin_C            0
ohe_Cabin_C C          0
ohe_Cabin_C C C        0
ohe_Cabin_D            0
ohe_Cabin_D D          0
ohe_Cabin_E            0
ohe_Cabin_E E          0
ohe_Cabin_F            0
ohe_Cabin_F E          0
ohe_Cabin_F G          0
ohe_Cabin_G            0
ohe_Cabin_T            0
dtype: int64


(1309, 36)

In [336]:
X_train_dtypes = X_train.dtypes
X_train_dtypes


# Selecciona todas las columnas de X_train tipo bool
bool_columns = X_train.select_dtypes(include=['bool']).columns

# Tranform all columns from X_train with boolean data type to int
X_train[bool_columns] = X_train[bool_columns].astype(int)



X_train.head()



Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,CabinNumber,Embarked_C.1,Embarked_Q.1,Embarked_S.1,ohe_Cabin_1,ohe_Cabin_A,ohe_Cabin_B,ohe_Cabin_B B,ohe_Cabin_B B B,ohe_Cabin_B B B B,ohe_Cabin_C,ohe_Cabin_C C,ohe_Cabin_C C C,ohe_Cabin_D,ohe_Cabin_D D,ohe_Cabin_E,ohe_Cabin_E E,ohe_Cabin_F,ohe_Cabin_F E,ohe_Cabin_F G,ohe_Cabin_G,ohe_Cabin_T
0,1,0.0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,1,0,1,0,0,85,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,3,1.0,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,1,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,1,0,0,0,1,123,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,5,0.0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
