In [11]:
#import necessary modules
#find out whether a person survived the titanic or not depending on the different features in the dataset
#use a machine learning model 
import pandas as pd

In [12]:
#read your csv
df = pd.read_csv("/Users/leahsumajit/Downloads/py-master/ML/9_decision_tree/Exercise/titanic.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [13]:
#start preprocessing
#drop variables that are unnecessary
df.drop(['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked'], axis = 'columns', inplace = True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [14]:
#Define target variable and drop it from the dataset to have dependent and independent variables separate
target = df.Survived
inputs = df.drop('Survived', axis = 'columns')

In [15]:
#convert Sex column into 2 different columns. 1 Represents Yes and 0 represents no
#machine learning models don't accept strings
dummies = pd.get_dummies(inputs.Sex)
dummies.head(3)

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0


In [16]:
#append dummies columns and input into pandas dataframe
inputs = pd.concat([inputs, dummies], axis = 'columns')
inputs.head(3)

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3,male,22.0,7.25,0,1
1,1,female,38.0,71.2833,1,0
2,3,female,26.0,7.925,1,0


In [20]:
#drop the Sex column since we already have two columns that represents whether a person is a male or female according to their number
inputs.drop('Sex', axis = 'columns', inplace = True)
inputs.head(3)

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0


In [21]:
#find out any missing values/NaNs 
inputs.columns[inputs.isna().any()]

Index([], dtype='object')

In [22]:
#Age column appears to have NaN
#check the first 10 values of the column
inputs.Age[:10]

0    22.000000
1    38.000000
2    26.000000
3    35.000000
4    35.000000
5    29.699118
6    54.000000
7     2.000000
8    27.000000
9    14.000000
Name: Age, dtype: float64

In [24]:
#fill missing values with mean 
inputs.Age = inputs.Age.fillna(inputs.Age.mean())
inputs.head(6)

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0
3,1,35.0,53.1,1,0
4,3,35.0,8.05,0,1
5,3,29.699118,8.4583,0,1


In [34]:
#use train_test_split to separate training and testing datset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(inputs, target, test_size = 0.2)

In [35]:
#find the length of your training dataset 
len(X_train)

712

In [36]:
#find the length of your testing dataset 
len(X_test)

179

In [37]:
#find the length of your dataset with training and testing dataset 
len(inputs)

891

In [38]:
#import your desired model
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()

In [39]:
#fit your model
model.fit(X_train, y_train)

GaussianNB()

In [40]:
#find the accuracy score of your model
model.score(X_test, y_test)

0.8212290502793296

In [41]:
#show the first 10 rows of X_test(dependent variables/features)
X_test[:10]

Unnamed: 0,Pclass,Age,Fare,female,male
431,3,29.699118,16.1,1,0
271,3,25.0,0.0,0,1
618,2,4.0,39.0,1,0
370,1,25.0,55.4417,0,1
453,1,49.0,89.1042,0,1
283,3,19.0,8.05,0,1
293,3,24.0,8.85,1,0
142,3,24.0,15.85,1,0
413,2,29.699118,0.0,0,1
609,1,40.0,153.4625,1,0


In [42]:
#show the first 10 rows of y_test(target variable)
y_test[:10]

431    1
271    1
618    1
370    1
453    1
283    1
293    0
142    1
413    0
609    1
Name: Survived, dtype: int64

In [44]:
#make a prediction
#output will be survived or not 
#Survived = 1, Did not Survive = 0
#show the first 10 rows of our prediction
model.predict(X_test[:10])

array([1, 0, 1, 0, 0, 0, 1, 1, 0, 1])

In [45]:
#show the first 10 rows of the probability of your prediction
model.predict_proba(X_test[:10])

array([[7.96122655e-02, 9.20387735e-01],
       [9.86793479e-01, 1.32065205e-02],
       [1.78926620e-02, 9.82107338e-01],
       [8.57281316e-01, 1.42718684e-01],
       [6.14334127e-01, 3.85665873e-01],
       [9.86357947e-01, 1.36420531e-02],
       [7.29696687e-02, 9.27030331e-01],
       [7.42399142e-02, 9.25760086e-01],
       [9.72957847e-01, 2.70421526e-02],
       [9.54370182e-06, 9.99990456e-01]])