In [1]:
import pandas as pd
import numpy as np
columns = ['Survived', 'Sex', 'Age', 'Fare', 'SibSp']

In [2]:
df = pd.read_csv('titanic.csv', usecols=columns)

In [3]:
df.head(5)

Unnamed: 0,Survived,Sex,Age,SibSp,Fare
0,0,male,22.0,1,7.25
1,1,female,38.0,1,71.2833
2,1,female,26.0,0,7.925
3,1,female,35.0,1,53.1
4,0,male,35.0,0,8.05


In [4]:
df.isnull().sum()

Survived      0
Sex           0
Age         177
SibSp         0
Fare          0
dtype: int64

In [5]:
len(df)

891

In [6]:
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [7]:
df.isnull().sum()

Survived    0
Sex         0
Age         0
SibSp       0
Fare        0
dtype: int64

In [8]:
df.head(5)

Unnamed: 0,Survived,Sex,Age,SibSp,Fare
0,0,male,22.0,1,7.25
1,1,female,38.0,1,71.2833
2,1,female,26.0,0,7.925
3,1,female,35.0,1,53.1
4,0,male,35.0,0,8.05


In [9]:
y = df['Survived']
features = df.columns[ df.columns != 'Survived']
features

Index(['Sex', 'Age', 'SibSp', 'Fare'], dtype='object')

In [10]:
gender = {'male': 1, 'female': 0}
df['Sex'] = df['Sex'].map(gender)

In [11]:
df.head(5)

Unnamed: 0,Survived,Sex,Age,SibSp,Fare
0,0,1,22.0,1,7.25
1,1,0,38.0,1,71.2833
2,1,0,26.0,0,7.925
3,1,0,35.0,1,53.1
4,0,1,35.0,0,8.05


In [12]:
X = df[features]

In [13]:
X.head(5)

Unnamed: 0,Sex,Age,SibSp,Fare
0,1,22.0,1,7.25
1,0,38.0,1,71.2833
2,0,26.0,0,7.925
3,0,35.0,1,53.1
4,1,35.0,0,8.05


In [16]:
from sklearn.model_selection import train_test_split

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, random_state=123, shuffle=True, test_size=0.3)

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier

In [27]:
from sklearn.metrics import accuracy_score, precision_score, log_loss, f1_score

models =  {
    "Logistic Regression": LogisticRegression(C=100.0, penalty='l2', solver='liblinear', multi_class='ovr'),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(criterion='entropy', n_estimators=100),
    "Ada Boost Classifier": AdaBoostClassifier(),
    "XGB BOOST CLASSIFIER": XGBClassifier()
}


for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Name: {name}")
    print(f"Score for Trainig {model.score(X_train, y_train)}")
    print(f"Score for Testing {model.score(X_test, y_test)}")
    print(f"Accuracy score {accuracy_score(y_test, y_pred)}")
    print(f"F1 score {f1_score(y_test, y_pred)}")
    print(f"Precision score {precision_score(y_test, y_pred)}")
    print(f"Log loss score {log_loss(y_test, y_pred)}")
    print("#######\n")

Name: Logistic Regression
Score for Trainig 0.7897271268057785
Score for Testing 0.7910447761194029
Accuracy score 0.7910447761194029
F1 score 0.702127659574468
Precision score 0.7333333333333333
Log loss score 7.531509663397614
#######

Name: Decision Tree Classifier
Score for Trainig 0.985553772070626
Score for Testing 0.753731343283582
Accuracy score 0.753731343283582
F1 score 0.6826923076923077
Precision score 0.6454545454545455
Log loss score 8.876422103290045
#######

Name: Random Forest Classifier
Score for Trainig 0.985553772070626
Score for Testing 0.8059701492537313
Accuracy score 0.8059701492537313
F1 score 0.7346938775510203
Precision score 0.7346938775510204
Log loss score 6.993544687440641
#######

Name: Ada Boost Classifier
Score for Trainig 0.8282504012841091
Score for Testing 0.8171641791044776
Accuracy score 0.8171641791044776
F1 score 0.7487179487179487
Precision score 0.7525773195876289
Log loss score 6.590070955472913
#######

Name: XGB BOOST CLASSIFIER
Score for T

In [28]:
lr = models["Random Forest Classifier"].fit(X_train, y_train)

te = np.array([[1, 23.0, 2, 20.52]])
p_te = lr.predict(te)
p_te

array([0])

In [29]:
import pickle
data = {"model": lr}
with open('survivor.pkl', 'wb') as file:
    pickle.dump(data, file)

In [30]:
with open('survivor.pkl', 'rb') as file:
    n_model = pickle.load(file)
loaded = n_model["model"]

In [31]:
te = np.array([[0, 38.0, 1, 71.2833]])
p_t = loaded.predict(te)
p_t

array([1])