# **Titanic Passenger Survival Prediction**
### **Predicting Passenger Survival on the Titanic Using a Decision Tree Algorithm**
## **Mercy Consolate Akello**
## **2024-01-11**

## **Load data**

In [None]:
library(readr)
# Read the Titanic CSV into `titanic`; the path is specific to the
# author's machine and must exist for the rest of the notebook to run.
titanic <- read_csv(
  file = "~/Active Documents/My Documents/CodSoft/Data Science Internship/datasets/titanic.csv"
)

### **Check for missing data**

In [None]:
# Number of NA entries in each column of `titanic`.
colSums(x = is.na(titanic))

### **Check for duplicates**

In [None]:
# duplicated() yields one TRUE/FALSE per row, which is unreadable when
# printed in full; summing the flags reports how many rows are exact
# repeats of an earlier row (0 means there are no duplicates).
sum(duplicated(titanic))

### **Remove unnecessary variables**

In [None]:
library(dplyr)
# Drop the four columns that are not used in the rest of the analysis.
titanic <- select(titanic, -c("PassengerId", "Name", "Ticket", "Cabin"))


### **Convert data types**

In [None]:
# Treat the outcome and the categorical predictors as factors so that
# plotting and modelling code handles them as discrete classes.
titanic[["Survived"]] <- as.factor(titanic[["Survived"]])
titanic[["Pclass"]] <- as.factor(titanic[["Pclass"]])
titanic[["Sex"]] <- as.factor(titanic[["Sex"]])
titanic[["Embarked"]] <- as.factor(titanic[["Embarked"]])

## **Exploratory Data Analysis**

### **Numeric variables**

In [None]:
library(ggplot2)

# Histogram of passenger Age using 25 bins.
ggplot(data = titanic, mapping = aes(x = Age)) +
  geom_histogram(bins = 25, fill = "#03E2ED") +
  labs(title = "Distribution of Age", y = "Count of Passengers") +
  theme_classic() +
  theme(
    axis.text.x = element_text(size = 8, face = "bold", color = "black"),
    axis.text.y = element_text(size = 8, face = "bold", color = "black")
  )

In [None]:
# Histogram of ticket Fare using 25 bins.
ggplot(data = titanic, mapping = aes(x = Fare)) +
  geom_histogram(bins = 25, fill = "#03E2ED") +
  labs(title = "Distribution of Fare", y = "Count of Passengers") +
  theme_classic() +
  theme(
    axis.text.x = element_text(size = 8, face = "bold", color = "black"),
    axis.text.y = element_text(size = 8, face = "bold", color = "black")
  )

In [None]:
# Scatter plot of Fare against Age with a fitted linear trend line
# (no confidence band). The title now describes the two variables that
# are actually plotted; Survived does not appear in this chart.
ggplot(data = titanic, mapping = aes(x = Age, y = Fare)) +
  geom_point(color = "#03E2ED") +
  geom_smooth(method = "lm", se = FALSE, color = "maroon") +
  theme_classic() +
  theme(
    axis.text.x = element_text(size = 8, face = "bold", color = "black"),
    axis.text.y = element_text(size = 8, face = "bold", color = "black")
  ) +
  labs(title = "Relationship between Age and Fare")

### **Categorical variables**

In [None]:
# Bar chart counting passengers in each Survived class.
ggplot(data = titanic, mapping = aes(x = Survived)) +
  geom_bar(fill = "#03E2ED") +
  labs(title = "Passenger Survival", y = "Count of Passengers") +
  theme_classic() +
  theme(
    axis.text.x = element_text(size = 8, face = "bold", color = "black"),
    axis.text.y = element_text(size = 8, face = "bold", color = "black")
  )

In [None]:
# Box plot comparing the Age distribution across Survived classes.
ggplot(data = titanic, mapping = aes(x = Survived, y = Age)) +
  geom_boxplot(fill = "#03E2ED") +
  labs(title = "Distribution of Age by Survived") +
  theme_classic() +
  theme(
    axis.text.x = element_text(size = 8, face = "bold", color = "black"),
    axis.text.y = element_text(size = 8, face = "bold", color = "black")
  )

In [None]:
# Box plot comparing the Fare distribution across Survived classes.
ggplot(data = titanic, mapping = aes(x = Survived, y = Fare)) +
  geom_boxplot(fill = "#03E2ED") +
  labs(title = "Distribution of Fare by Survived") +
  theme_classic() +
  theme(
    axis.text.x = element_text(size = 8, face = "bold", color = "black"),
    axis.text.y = element_text(size = 8, face = "bold", color = "black")
  )

In [None]:
# Box plot comparing the Fare distribution across passenger classes.
ggplot(data = titanic, mapping = aes(x = Pclass, y = Fare)) +
  geom_boxplot(fill = "#03E2ED") +
  labs(title = "Distribution of Fare by Pclass") +
  theme_classic() +
  theme(
    axis.text.x = element_text(size = 8, face = "bold", color = "black"),
    axis.text.y = element_text(size = 8, face = "bold", color = "black")
  )

In [None]:
# Box plot comparing the Fare distribution across Embarked values.
ggplot(data = titanic, mapping = aes(x = Embarked, y = Fare)) +
  geom_boxplot(fill = "#03E2ED") +
  labs(title = "Distribution of Fare by Embarked") +
  theme_classic() +
  theme(
    axis.text.x = element_text(size = 8, face = "bold", color = "black"),
    axis.text.y = element_text(size = 8, face = "bold", color = "black")
  )

## **Split the dataset into training and test sets**

In [None]:
library(caTools)
# Fix the RNG seed so the split is reproducible between runs.
set.seed(13579)
# Logical mask with one entry per row: TRUE marks the rows assigned to
# the training set (SplitRatio = 0.70), FALSE the rows held out for
# testing. Named `is_train` rather than `sample` so it does not shadow
# base::sample().
is_train <- sample.split(titanic$Survived, SplitRatio = 0.70)
train <- subset(titanic, is_train)
test <- subset(titanic, !is_train)

## **Build a Decision Tree model**

In [None]:
library(rpart)
# Fit a tree on the training rows with Survived as the response and
# every other column of `train` as a candidate predictor.
dt.model <- rpart(formula = Survived ~ ., data = train)

## **Make a prediction using test data**

In [None]:
# Predicted class label for every row of the held-out test set.
dt.model.test <- predict(object = dt.model, newdata = test, type = "class")

## **Model Performance Evaluation**

In [None]:
library(caret)
# caret's confusionMatrix() table method reads rows as predictions and
# columns as the reference (true) classes. The predictions therefore go
# first; with the arguments the other way round the table is transposed
# and sensitivity/specificity/PPV/NPV are reported with the roles of
# prediction and truth swapped (overall accuracy is unaffected).
# NOTE(review): the "positive" class defaults to the first factor level
# of Survived -- confirm that is the class you want to report on.
tad <- table(Prediction = dt.model.test, Reference = test$Survived)
confusionMatrix(tad)