In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the available dataset files are visible before loading anything.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))


/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# Load the labelled training split (it includes the Survived column).
train_data = pd.read_csv(filepath_or_buffer='/kaggle/input/titanic/train.csv')

In [3]:
# Preview the first rows of the training split to see the available columns.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Load the test split that the final predictions are made for.
test_data = pd.read_csv(filepath_or_buffer='/kaggle/input/titanic/test.csv')

## Data Analysis and Preprocessing:

In [5]:
# Column dtypes and non-null counts of the training split; the counts reveal
# which columns contain missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
# Same summary for the test split, to compare its missing values with the
# training split.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


### Handling NaN fields:

In [7]:
# Sentinel replacements for missing entries, keyed by column name: the numeric
# columns (Age, Fare) get -1 and the text columns (Cabin, Embarked) get 'NA'.
values = dict(Age=-1, Cabin='NA', Embarked='NA', Fare=-1)

In [8]:
# Fill the missing training entries column by column with the sentinels in
# `values`; the result is bound to a new name, `train`.
train = train_data.fillna(values)

In [9]:
# Apply the same per-column sentinels to the test split so both splits are
# filled consistently.
test = test_data.fillna(values)

In [10]:
# Confirm that no missing values remain after filling.
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would emit a stray "None" line after each summary.
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        891 non-null    object 
 11  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pcl

### Analysing attribute values to identify categorical variables (and their type) and numerical variables:

In [11]:
# Count the distinct values of the identifier-like columns in both splits.
# Each count equals the row count of its split, i.e. every Name and PassengerId
# is unique, so neither column would help the classification.
for column in ("Name", "PassengerId"):
    for frame in (train, test):
        print(len(pd.unique(frame[column])))

891
418
891
418


In [12]:
# Number of distinct ticket identifiers in the training and test splits.
for frame in (train, test):
    print(len(pd.unique(frame["Ticket"])))

681
363


In [13]:
# Number of distinct fare values in the training and test splits.
for frame in (train, test):
    print(len(pd.unique(frame["Fare"])))

248
170


In [14]:
# Compare the number of distinct values of Pclass, Fare and Cabin in the
# training split. The original rationale: the counts differ so much that there
# is little room for a positive correlation among these columns.
# NOTE(review): cardinality alone does not measure correlation — confirm
# before relying on this argument.
for column in ("Pclass", "Fare", "Cabin"):
    print(len(pd.unique(train[column])))

3
248
148


In [15]:
# Distinct Cabin and Ticket values, train split first and then test split for
# each column.
for column in ("Cabin", "Ticket"):
    for frame in (train, test):
        print(len(pd.unique(frame[column])))

148
77
681
363


In [16]:
# Categorical variables to be considered: Sex, Ticket, Cabin, Embarked.
# All are nominal categorical variables. Cabin could be considered ordinal, but
# some passengers have multiple cabin values, so dummy (one-hot) variables are
# used instead.

### Preparing input data:

In [17]:
# Target (dependent) variable: the Survived column of the filled training split.
y = train.Survived

# Feature columns fed to the model. The full candidate list would also include
# "Ticket", "Fare" and "Embarked"; those three are left out here.
features = [
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Cabin",
]


In [18]:
# Restrict both splits to the selected feature columns, working on copies of
# the filled frames.
temp, temp_test = (frame.copy()[features] for frame in (train, test))

In [19]:
from sklearn.preprocessing import OneHotEncoder

# Fit a one-hot encoder on the training features only. handle_unknown='ignore'
# lets transform() accept test-split values that were not seen during fitting
# instead of raising an error.
# NOTE(review): the encoder is fitted on every column of `temp`, so the numeric
# columns (Pclass, Age, SibSp, Parch) are one-hot encoded as categories as well
# — confirm this is intended.
onehot_encoder = OneHotEncoder(handle_unknown='ignore').fit(temp)
onehot_encoder

OneHotEncoder(handle_unknown='ignore')

In [20]:

# Encode both splits with the already-fitted encoder so they share one column
# layout.
X_train, X_test = map(onehot_encoder.transform, (temp, temp_test))

## Training:

In [21]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score

# A random forest is a stochastic estimator: without a fixed seed the
# cross-validation scores and the submitted predictions change on every run.
# Pinning random_state makes Restart & Run All reproduce the same results.
model = RandomForestClassifier(max_depth=50, random_state=42)



In [22]:
# 5-fold cross-validation of the (not yet fitted) model on the encoded training
# data; yields one score per fold.
scores = cross_val_score(estimator=model, X=X_train, y=y, cv=5)

In [23]:
# Display the per-fold cross-validation scores.
scores

array([0.81005587, 0.80898876, 0.81460674, 0.80337079, 0.8258427 ])

In [24]:
# Fit on the full training set, then score on that same data. This is the
# training score, not an estimate of generalisation — compare it with the
# cross-validation scores above.
model.fit(X_train, y).score(X_train, y)

0.9371492704826038

## Predictions:

In [25]:
# Predict the Survived label for every row of the encoded test split.
predictions = model.predict(X=X_test)

In [26]:
# Assemble the two-column submission table (PassengerId, Survived) and write it
# to submission.csv in the working directory, without the index column.
submission_columns = {'PassengerId': test.PassengerId, 'Survived': predictions}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
