# Importing Libraries

In [16]:
from flask import Flask, render_template, request
import joblib
import numpy as np;
import pandas as pd;
from sklearn.model_selection import train_test_split;
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer;
from sklearn.compose import ColumnTransformer;
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor;
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import cross_val_score;


# Defining our dataset

In [17]:
Dataset = pd.read_csv('titanic.csv')

# Defining Feature x and Target y 

In [18]:
Dataset['Sex'] = Dataset['Sex'].map({'male':0 , 'female':1})
x = Dataset[['Pclass','Sex','Age','Fare','Cabin']]
y = Dataset['Survived']

In [19]:
numerical_features = ['Pclass', 'Age', 'Fare']
categorical_features = ['Sex', 'Cabin']

# Creating Transformer by Pipeline and SimpleImputer to convert non Int or Nan to mean or most frequent way
# Combine transformers using ColumnTransformer

In [20]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

 

# Creating pipeline for the Classifier

In [21]:
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(criterion='gini', max_depth=None, random_state=42))
])

In [22]:
x.dtypes

Pclass      int64
Sex         int64
Age       float64
Fare      float64
Cabin      object
dtype: object

In [23]:
y

0      0
1      1
2      0
3      0
4      1
      ..
413    0
414    1
415    0
416    0
417    0
Name: Survived, Length: 418, dtype: int64

# Splitting the dataset

In [24]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)

# Applying Classifier and testing the accuracy

In [25]:
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

1.0

In [26]:
cv_scores = cross_val_score(clf, x, y, cv=5, scoring='accuracy')
recall = cross_val_score(clf, x, y, cv=5, scoring='recall')
f1 = cross_val_score(clf, x, y, cv=5, scoring='f1')
roc_auc = cross_val_score(clf, x, y, cv=5, scoring='roc_auc')

In [27]:
cv_scores
recall
f1
roc_auc

array([1., 1., 1., 1., 1.])

In [28]:
f1

array([1., 1., 1., 1., 1.])

In [29]:
joblib.dump(clf, 'titanic_model.pkl')

['titanic_model.pkl']