In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Load the Dataset

In [None]:
# Load seaborn's example Titanic dataset; every later cell works on `titanic`
titanic = sns.load_dataset('titanic')

Print the first 5 rows of the dataset

In [None]:
# Preview the first rows (head() defaults to 5) to check that the load worked
titanic.head()

Count the number of rows and columns in the data set

In [None]:
# (number of rows, number of columns) of the raw dataset
titanic.shape

Get statistics on numerical columns

In [None]:
# With no arguments describe() summarises only the numeric columns
titanic.describe()

Get statistics on categorical columns

In [None]:
# Summarise the string-valued (object dtype) columns.
# The builtin `object` is used because the `np.object` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
titanic.describe(include=[object])

Get statistics on all columns

In [None]:
# include='all' reports every column; statistics that do not apply to a column's dtype show as NaN
titanic.describe(include='all')

Get count of number of survivors

In [None]:
# Tally how many passengers fall under each value of `survived`
titanic['survived'].value_counts()

Visualize the count of Survivors

In [None]:
# Bar chart of the number of passengers per `survived` value.
# The column is passed by keyword: since seaborn 0.12 the only positional
# parameter of countplot is `data`, so a positional Series is no longer read as `x`.
sns.countplot(x='survived', data=titanic)

In [None]:
# List the column names available for the plots below
titanic.columns

Visualize the count of survivors for the columns 'who', 'sex', 'pclass', 'sibsp', 'parch' and 'embarked'

In [None]:
cols = ['who','sex','pclass','sibsp','parch','embarked']

n_rows = 2
n_cols = 3

# Subplot grid; each panel is allotted 3.2 x 3.2 inches
fig, axis = plt.subplots(n_rows, n_cols, figsize=(n_cols*3.2,n_rows*3.2))

# axis.flat walks the grid row by row, so the k-th panel shows cols[k].
# zip() also stops cleanly if there are fewer columns than panels.
for ax, col in zip(axis.flat, cols):
    # Keyword arguments are required: since seaborn 0.12 the only positional
    # parameter of countplot is `data`, so a positional Series is not read as `x`.
    sns.countplot(x=col, hue='survived', data=titanic, ax=ax)
    ax.set_title(col)
    ax.legend(title='Survived', loc='upper right')

plt.tight_layout()
        

Survival rate by sex

In [None]:
# Mean of `survived` within each sex; the double brackets keep the result a DataFrame
titanic.groupby('sex')[['survived']].mean()

Survival rate by sex and class

In [None]:
# Mean of `survived` (pivot_table's default aggregation) for each sex/class pair
titanic.pivot_table(values='survived', index='sex', columns=['class'])

Visualize Survival rate by sex and class

In [None]:
# Build the sex x class table of mean `survived`, then draw one line per class
survival_by_sex_class = titanic.pivot_table(values='survived', index='sex', columns=['class'])
survival_by_sex_class.plot()

Visualize survival rate of each class

In [None]:
# Bar height is the mean of `survived` per class (seaborn's default estimator)
sns.barplot(x='class',y='survived',data=titanic)

Survival rate by age, sex and class

In [None]:
# Two age bands: (0, 18] and (18, 80]
age_bins = [0, 18, 80]
age = pd.cut(titanic['age'], bins=age_bins)

# Mean of `survived` for every sex / age-band row and class column
titanic.pivot_table(values='survived', index=['sex', age], columns='class')

Visualize the fares paid in each class

In [None]:
# One point per passenger: fare on the x-axis, ticket class on the y-axis
fare_fig, fare_ax = plt.subplots()
fare_ax.scatter(titanic['fare'], titanic['class'], color='brown', label='Passenger Paid')

# Label the axes and draw the legend; the `label` above is not shown without legend()
fare_ax.set_xlabel('fare')
fare_ax.set_ylabel('class')
fare_ax.set_title('Fare paid by passengers in each class')
fare_ax.legend()
plt.show()

Check empty values count in each column

In [None]:
# Number of missing (NaN) entries in each column
titanic.isna().sum()

In [None]:
# Frequency table for every column, each followed by a blank line
for column_name in titanic.columns:
    print(titanic[column_name].value_counts(), end='\n\n')

Drop columns

In [None]:
# Columns removed before modelling
columns_to_drop = ['deck','alive','class','embark_town','who','alone','adult_male']

# Reassign instead of mutating in place; errors='ignore' lets this cell be
# re-run without a KeyError once the columns are already gone.
titanic = titanic.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Check which columns remain after the drop
titanic.head()

In [None]:
# Discard every passenger whose `embarked` or `age` value is missing
required_columns = ['embarked', 'age']
titanic.dropna(subset=required_columns, inplace=True)

In [None]:
# Re-count missing values per column after the row removal
titanic.isna().sum()

In [None]:
# (rows, columns) left after dropping columns and incomplete rows
titanic.shape

Look at the data types

In [None]:
# Column dtypes; `sex` and `embarked` are label-encoded further down
titanic.dtypes

In [None]:
# Look at the cleaned rows before any encoding is applied
titanic.head()

In [None]:
# Show the distinct values currently stored in the sex and embarked columns
for column in ('sex', 'embarked'):
    print(titanic[column].unique())

In [None]:
from sklearn.preprocessing import LabelEncoder

# One encoder per column, kept in a dict so each mapping stays available
# afterwards (e.g. label_encoders['sex'].classes_).
label_encoders = {}

# Columns are addressed by name rather than by position, and assigned as whole
# columns so that each takes the encoder's integer dtype. A positional
# `iloc[:, k] = ...` writes into the existing column and can leave it as `object`.
for column in ['sex', 'embarked']:
    encoder = LabelEncoder()
    titanic[column] = encoder.fit_transform(titanic[column])
    label_encoders[column] = encoder

In [None]:
# Show the distinct values of the sex and embarked columns once pre-processing is done
for column in ('sex', 'embarked'):
    print(titanic[column].unique())

In [None]:
# Re-check the dtypes after label encoding
titanic.dtypes

Split the data into independent 'X' and dependent 'Y' variables

In [None]:
# Column 0 is the target; columns 1 through 7 are the features
features = titanic.iloc[:, 1:8]
target = titanic.iloc[:, 0]

# The models below are fed plain NumPy arrays
X = features.values
Y = target.values

Split data into 80% training and 20% testing

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

Scale the data

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# Learn the per-feature mean and standard deviation from the training rows only
X_train = sc.fit_transform(X_train)

# Apply those same training statistics to the test rows. Refitting here
# (fit_transform) would leak test-set information and scale the two sets differently.
X_test = sc.transform(X_test)

Create a function that trains several machine learning models

In [None]:
def models(X_train,Y_train):
    """Fit seven scikit-learn classifiers on the same data and report each one's training accuracy.

    Parameters
    ----------
    X_train : feature matrix passed unchanged to every estimator's ``fit`` and ``score``.
    Y_train : target labels passed alongside ``X_train``.

    Returns
    -------
    tuple
        The fitted estimators in the order logistic regression, k-nearest
        neighbours, linear SVC, RBF SVC, Gaussian naive Bayes, decision tree,
        random forest. This is the same order as the printed ``[0]``..``[6]`` indices.
    """
    # Imports stay local to the function, as in the original cell
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier

    # (report label, unfitted estimator) pairs; the list order fixes both the
    # printed index and the position in the returned tuple.
    candidates = [
        ("[0]Logistic Regression Accuracy: ",
         LogisticRegression(random_state=0)),
        ("[1]KNeighbors Accuracy: ",
         KNeighborsClassifier(n_neighbors=5,metric='minkowski', p=2)),
        ("[2]Support Vector Classifier (linear kernel) Accuracy: ",
         SVC(kernel='linear',random_state=0)),
        ("[3]Support Vector Classifier (RBF kernel) Accuracy: ",
         SVC(kernel='rbf',random_state=0)),
        ("[4]Gaussian Naive Bayes Accuracy: ",
         GaussianNB()),
        ("[5]Decision Tree Classifier Accuracy: ",
         DecisionTreeClassifier(criterion='entropy',random_state=0)),
        ("[6]Random forest Classifier Accuracy: ",
         RandomForestClassifier(n_estimators=10, criterion='entropy',random_state=0)),
    ]

    # Train everything first, then report, so the printout is one contiguous block
    for _, estimator in candidates:
        estimator.fit(X_train,Y_train)

    # Accuracy is measured on the training data itself, not on held-out rows
    for label, estimator in candidates:
        print(label, estimator.score(X_train,Y_train))

    return tuple(estimator for _, estimator in candidates)
    

Get and Train all of the models

In [None]:
# Fit every classifier; `model` is the tuple returned by models(), indexed in the printed [0]..[6] order
model = models(X_train,Y_train)

Show the Confusion Matrix and accuracy for all the models on the test data

In [None]:
from sklearn.metrics import confusion_matrix

for i, fitted_model in enumerate(model):
    # Predict and build the confusion matrix once per model, then reuse the result
    cm = confusion_matrix(Y_test, fitted_model.predict(X_test))

    # Extract TN, FP, FN, TP; for binary labels the matrix is laid out as [[TN, FP], [FN, TP]]
    TN, FP, FN, TP = cm.ravel()

    # Accuracy = correctly classified rows / all rows
    test_score = (TP + TN) / (TP+TN+FP+FN)

    print(cm)
    print(f"model[{i}] Testing Accuracy = {test_score}")
    print()

Get Feature Importance

In [None]:
# Index 6 is the random forest in the tuple returned by models()
forest = model[6]

# Pair each feature column name with the forest's importance score (rounded to 3 decimals)
feature_names = titanic.iloc[:, 1:8].columns
importances = (
    pd.DataFrame({'feature': feature_names,
                  'importance': np.round(forest.feature_importances_, 3)})
    .sort_values('importance', ascending=False)  # most important first
    .set_index('feature')
)
importances

Visualize the importance

In [None]:
# One bar per feature, in the descending-importance order set when `importances` was built
importances.plot.bar()

Print the prediction of the Random Forest Classifier

In [None]:
# Random forest predictions for the test rows, followed by a blank line
pred = model[6].predict(X_test)
print(pred, end='\n\n')

# Actual labels of the same rows, for comparison
print(Y_test)

In [None]:
# My Survival
# One row with the seven features in the same column order as X
# (presumably pclass, sex, age, sibsp, parch, fare, embarked;
# confirm against titanic.iloc[:, 1:8].columns).
my_survival = [[1,0,29,1,1,500,0]]

# Scale with `sc`, the scaler already fitted earlier in the notebook.
# A new StandardScaler fitted on this single row would subtract the row from
# itself and turn every feature into 0, so the prediction would ignore the input.
my_survival_scaled = sc.transform(my_survival)

# Print Prediction of My Survival using Random Forest classifier
pred = model[6].predict(my_survival_scaled)
print(pred)

# `pred` is a one-element array; test the element itself
if pred[0] == 0:
    print("Oh No!! Did not Survive")
else:
    print("Yes!! Survived")