In [1]:
import pandas as pd

# Read the two Kaggle Titanic CSV files into DataFrames (train first, then test).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
    )
)

# Preview the first rows of the training split.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Preprocessing
## Fill missing data

In [2]:
# Count the missing (null) entries per column in each split.
missing_values_train = train_data.isnull().sum()
missing_values_test = test_data.isnull().sum()

# Emit both reports in one call; the leading '\n' in the second label
# produces the blank line that separates the two sections.
print(
    'Train data',
    missing_values_train,
    '\nTest data',
    missing_values_test,
    sep='\n',
)

Train data
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Test data
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [3]:
def fill(df):
    """Return a new DataFrame with missing values replaced by placeholders.

    Missing 'Age' and 'Fare' entries become 0, missing 'Cabin' entries become
    'Z0' and missing 'Embarked' entries become 'Z'. The input frame itself is
    not modified.
    """
    placeholders = {
        'Age': 0,
        'Fare': 0,
        'Cabin': 'Z0',
        'Embarked': 'Z',
    }
    return df.fillna(value=placeholders)

# Build the working copies of both splits with their missing values filled in.
train_data_processed, test_data_processed = (
    fill(raw_split) for raw_split in (train_data, test_data)
)

# Preview the filled training split.
train_data_processed.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,Z0,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,Z0,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,Z0,S


## Process Name column
The `Name` column is replaced by two features: the length of the name in characters and the passenger's title (Mr., Mrs. or Miss.; any other title is recorded as the string 'None').

In [4]:
def extract_title(name):
    """Return the first of 'Mr.', 'Mrs.' or 'Miss.' that occurs in ``name``.

    The titles are tried in that order and matched as plain substrings
    anywhere in the string. When none of them occurs, the literal string
    'None' (not the ``None`` object) is returned.
    """
    for title in ('Mr.', 'Mrs.', 'Miss.'):
        if title in name:
            return title
    return 'None'

# Apply the same feature extraction to both splits. The frames are modified
# in place, so 'Name' no longer exists once this cell has run.
for frame in (train_data_processed, test_data_processed):
    frame['Name Length'] = frame['Name'].str.len()
    frame['Title'] = frame['Name'].apply(extract_title)
    frame.drop(columns=['Name'], inplace=True)

train_data_processed.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name Length,Title
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,Z0,S,23,Mr.
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,51,Mrs.
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,Z0,S,22,Miss.
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S,44,Mrs.
4,5,0,3,male,35.0,0,0,373450,8.05,Z0,S,24,Mr.


## Process Ticket column
The Ticket column is split into one column containing the "prefix" (made up of letters and numbers) and one containing the "number" (made up only of numbers). If the original value had no prefix, a 'Z' prefix is added; if it had no number, a 0 is added.

In [5]:
def edit_ticket_value(val):
    """Normalise a ticket string to the form '<prefix> <number>'.

    The result is meant to be split on its last space into a prefix and a
    numeric part that ``int()`` can parse:

    * a value whose last space-separated token is numeric is returned
      unchanged when it contains a space (e.g. 'A/5 21171');
    * a bare number gets the placeholder prefix 'Z' (e.g. '113803' ->
      'Z 113803');
    * any value whose last token is not numeric gets ' 0' appended
      (e.g. 'LINE' -> 'LINE 0'), including values that contain spaces.

    Parameters
    ----------
    val : str
        Raw ticket value.

    Returns
    -------
    str
        Ticket string whose last space-separated token is a decimal number.
    """
    # Only the text after the last space ends up in the number column.
    last_token = val.rsplit(' ', 1)[-1]
    # isdecimal() accepts only characters that int() can parse; isdigit()
    # would also accept e.g. superscript digits, which int() rejects.
    has_number = last_token.isdecimal()

    if not has_number:
        # No numeric part (with or without spaces): add the placeholder 0.
        return val + ' 0'
    if ' ' in val:
        # Already '<prefix> <number>'.
        return val
    # Bare number: add the placeholder prefix.
    return 'Z ' + val
    
# Split every ticket into a text prefix and an integer number, for both
# splits. The frames are modified in place and 'Ticket' is removed.
for frame in (train_data_processed, test_data_processed):
    frame['Ticket'] = frame['Ticket'].apply(edit_ticket_value)
    # Split on the last space only, so prefixes that themselves contain
    # spaces stay in one piece.
    ticket_parts = frame['Ticket'].str.rsplit(' ', n=1, expand=True)
    frame['Ticket Prefix'] = ticket_parts[0]
    frame['Ticket Number'] = ticket_parts[1].astype(int)
    frame.drop(columns=['Ticket'], inplace=True)

train_data_processed.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Name Length,Title,Ticket Prefix,Ticket Number
0,1,0,3,male,22.0,1,0,7.25,Z0,S,23,Mr.,A/5,21171
1,2,1,1,female,38.0,1,0,71.2833,C85,C,51,Mrs.,PC,17599
2,3,1,3,female,26.0,0,0,7.925,Z0,S,22,Miss.,STON/O2.,3101282
3,4,1,1,female,35.0,1,0,53.1,C123,S,44,Mrs.,Z,113803
4,5,0,3,male,35.0,0,0,8.05,Z0,S,24,Mr.,Z,373450


## Process Cabin column
If multiple cabins are indicated, only the first one is considered. This cabin's code is then split into the letter prefix and the number. If no number is present, a 0 is added.

In [6]:
def edit_cabin_value(val):
    """Append '0' to cabin codes shorter than two characters.

    Codes of length two or more are returned unchanged; shorter ones
    (e.g. a lone deck letter such as 'T') get a '0' appended so that the
    text after the first character is never empty.
    """
    return val if len(val) >= 2 else val + '0'
    
# Split the cabin code into a one-letter prefix and an integer number, for
# both splits. The frames are modified in place and 'Cabin' is removed.
for frame in (train_data_processed, test_data_processed):
    # Keep only the first cabin when several are listed, separated by spaces.
    first_cabin = frame['Cabin'].str.split(' ', n=1, expand=True)[0]
    frame['Cabin'] = first_cabin.apply(edit_cabin_value)
    frame['Cabin Prefix'] = frame['Cabin'].str[0]
    frame['Cabin Number'] = frame['Cabin'].str[1:].astype(int)
    frame.drop(columns=['Cabin'], inplace=True)

train_data_processed.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Name Length,Title,Ticket Prefix,Ticket Number,Cabin Prefix,Cabin Number
0,1,0,3,male,22.0,1,0,7.25,S,23,Mr.,A/5,21171,Z,0
1,2,1,1,female,38.0,1,0,71.2833,C,51,Mrs.,PC,17599,C,85
2,3,1,3,female,26.0,0,0,7.925,S,22,Miss.,STON/O2.,3101282,Z,0
3,4,1,1,female,35.0,1,0,53.1,S,44,Mrs.,Z,113803,C,123
4,5,0,3,male,35.0,0,0,8.05,S,24,Mr.,Z,373450,Z,0


# Machine learning
## Create the arrays for machine learning

In [7]:
from sklearn.model_selection import train_test_split

# Target vector for the training split.
y_train = train_data_processed['Survived']

# Feature frames: drop the identifier everywhere and the target from training.
X_train_unencoded = train_data_processed.drop(columns=['PassengerId', 'Survived'])
X_test_unencoded = test_data_processed.drop(columns=['PassengerId'])

# One-hot encode both splits together so they end up with the same dummy
# columns, then cut the stacked frame back apart at the training row count.
X_concat_unencoded = pd.concat(
    [X_train_unencoded, X_test_unencoded],
    axis=0,
    ignore_index=True,
)
X_concat = pd.get_dummies(X_concat_unencoded)

n_train_rows = len(X_train_unencoded)
X_train, X_test = X_concat.iloc[:n_train_rows], X_concat.iloc[n_train_rows:]



In [8]:
# Preview the first five rows of the encoded training features.
X_train.head(n=5)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Name Length,Ticket Number,Cabin Number,Sex_female,Sex_male,...,Ticket Prefix_Z,Cabin Prefix_A,Cabin Prefix_B,Cabin Prefix_C,Cabin Prefix_D,Cabin Prefix_E,Cabin Prefix_F,Cabin Prefix_G,Cabin Prefix_T,Cabin Prefix_Z
0,3,22.0,1,0,7.25,23,21171,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,1,38.0,1,0,71.2833,51,17599,85,1,0,...,0,0,0,1,0,0,0,0,0,0
2,3,26.0,0,0,7.925,22,3101282,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,1,35.0,1,0,53.1,44,113803,123,1,0,...,1,0,0,1,0,0,0,0,0,0
4,3,35.0,0,0,8.05,24,373450,0,0,1,...,1,0,0,0,0,0,0,0,0,1


In [9]:
# Preview the first five rows of the encoded test features.
X_test.head(n=5)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Name Length,Ticket Number,Cabin Number,Sex_female,Sex_male,...,Ticket Prefix_Z,Cabin Prefix_A,Cabin Prefix_B,Cabin Prefix_C,Cabin Prefix_D,Cabin Prefix_E,Cabin Prefix_F,Cabin Prefix_G,Cabin Prefix_T,Cabin Prefix_Z
891,3,34.5,0,0,7.8292,16,330911,0,0,1,...,1,0,0,0,0,0,0,0,0,1
892,3,47.0,1,0,7.0,32,363272,0,1,0,...,1,0,0,0,0,0,0,0,0,1
893,2,62.0,0,0,9.6875,25,240276,0,0,1,...,1,0,0,0,0,0,0,0,0,1
894,3,27.0,0,0,8.6625,16,315154,0,0,1,...,1,0,0,0,0,0,0,0,0,1
895,3,22.0,1,1,12.2875,44,3101298,0,1,0,...,1,0,0,0,0,0,0,0,0,1


## Evaluate random forest performance with cross validation

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Hyper-parameters of the forest evaluated by cross-validation.
forest_params = {'n_estimators': 100, 'max_depth': 10, 'random_state': 0}
forest = RandomForestClassifier(**forest_params)

# 5-fold cross-validation on the training data; one score per fold.
scores = cross_val_score(estimator=forest, X=X_train, y=y_train, cv=5)
mean_score = scores.mean()

print('Cross-validation scores:', scores)
print('Average:', mean_score)

Cross-validation scores: [0.83240223 0.81460674 0.83707865 0.80337079 0.83707865]
Average: 0.8249074132195092


## Build random forest and make predictions

In [11]:
# Fit the final forest on the whole training set; fit() returns the estimator
# itself, so the fitted model can be bound directly.
full_forest = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=1,
).fit(X_train, y_train)

predictions = full_forest.predict(X_test)

# Pair each test passenger id with its predicted label and write the
# submission file without the index column.
output = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': predictions,
})
output.to_csv('submission.csv', index=False)