# Problem Statement:
Develop a machine learning model to predict the survival of passengers on the Titanic based on demographic and other relevant features.

# Import libraries

In [127]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB

# Data preprocesing

In [24]:
df=pd.read_csv('Titanic-Dataset.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Droping insignificant columns

In [25]:
df.drop(['PassengerId','Name','SibSp','Parch','Ticket','Cabin','Embarked'],axis=1,inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [26]:
x=df.drop('Survived',axis=1)
y=df['Survived']

Encoding categorical data

In [27]:
dummy=pd.get_dummies(df.Sex)
full_x=pd.concat([x,dummy],axis=1)

In [28]:
full_x.head()

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3,male,22.0,7.25,0,1
1,1,female,38.0,71.2833,1,0
2,3,female,26.0,7.925,1,0
3,1,female,35.0,53.1,1,0
4,3,male,35.0,8.05,0,1


In [29]:
full_x.drop('Sex',axis=1,inplace=True)

In [30]:
full_x.head()

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0
3,1,35.0,53.1,1,0
4,3,35.0,8.05,0,1


Check for null values

In [32]:
full_x.isnull().sum()

Pclass      0
Age       177
Fare        0
female      0
male        0
dtype: int64

In [34]:
df.Age[:10]

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64

In data set Age column has null values, here we will treating null values with it's mean.

In [35]:
full_x.Age=full_x.Age.fillna(full_x.Age.mean())

In [37]:
full_x.Age[:10]

0    22.000000
1    38.000000
2    26.000000
3    35.000000
4    35.000000
5    29.699118
6    54.000000
7     2.000000
8    27.000000
9    14.000000
Name: Age, dtype: float64

# Data spliting

In [118]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(full_x,y,test_size=0.2)

In [119]:
x_train.shape

(712, 5)

In [120]:
x_test.shape

(179, 5)

# Model fit 

In [121]:
from sklearn.naive_bayes import GaussianNB
model=GaussianNB()

In [122]:
model.fit(x_train,y_train)

In [123]:
y_test[:10]

608    1
468    0
403    0
710    1
577    1
679    1
687    0
620    0
578    0
610    0
Name: Survived, dtype: int64

predicting dependent data

In [124]:
y_test_pred=model.predict(x_test[:10])

In [125]:
y_test_pred[:10]

array([1, 0, 0, 1, 1, 1, 0, 0, 1, 1], dtype=int64)

accurecy score

In [126]:
model.score(x_train,y_train)

0.7808988764044944

# Project Description:
In this project, a predictive model was developed using the Naive Bayes algorithm to determine the likelihood of survival for passengers aboard the Titanic. The project involved preprocessing the Titanic dataset, which included tasks such as removing insignificant features, handling categorical variables, and splitting the data into features and labels.

The model was trained and evaluated using the Gaussian Naive Bayes classifier, providing insights into the key factors influencing passenger survival. This project demonstrates the application of machine learning techniques for binary classification problems.