In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.plot import plot_missing_value
from jcopml.feature_importance import mean_score_decrease

In [25]:
df = pd.read_csv("titanic.csv")
df=df.drop(['Cabin','Name','Ticket','PassengerId'],axis=1)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [26]:
df['Fmembers'] = df.iloc[:, 4:6].sum(axis=1)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Fmembers
0,0,3,male,22.0,1,0,7.25,S,1
1,1,1,female,38.0,1,0,71.2833,C,1
2,1,3,female,26.0,0,0,7.925,S,0
3,1,1,female,35.0,1,0,53.1,S,1
4,0,3,male,35.0,0,0,8.05,S,0


In [27]:
df =df.drop(["SibSp", "Parch"], axis=1)

In [28]:
df['Embarked'] = df.Embarked.fillna(df['Embarked'].mode()[0])
df['Embarked'] = df.Embarked.replace({'S':0, 'Q': 1, 'C': 2})
df['Age'] = df.Age.fillna(df['Age'].mean())

  df['Embarked'] = df.Embarked.replace({'S':0, 'Q': 1, 'C': 2})


# nurmalizing dataset

In [29]:
data = pd.get_dummies(df, columns=['Sex'])
data=data.drop(['Sex_female'], axis=1)
data.head()

Unnamed: 0,Survived,Pclass,Age,Fare,Embarked,Fmembers,Sex_male
0,0,3,22.0,7.25,0,1,True
1,1,1,38.0,71.2833,2,1,False
2,1,3,26.0,7.925,0,0,False
3,1,1,35.0,53.1,0,1,False
4,0,3,35.0,8.05,0,0,True


# splitting dataset

In [30]:
X = data.iloc[:,1:]
y = data.iloc[:,0]

In [31]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((712, 6), (179, 6), (712,), (179,))

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from jcopml.tuning import grid_search_params as gsp

In [33]:

pipeline = Pipeline([

    ('algo', RandomForestClassifier(n_jobs=-1, random_state=42))
])

model = GridSearchCV(pipeline, gsp.rf_params, cv=3, n_jobs=-1, verbose=1)
model.fit(X_train, y_train)
print(model.best_params_)
print(model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test))

Fitting 3 folds for each of 81 candidates, totalling 243 fits
{'algo__max_depth': 20, 'algo__max_features': 0.8, 'algo__min_samples_leaf': 5, 'algo__n_estimators': 150}
0.8904494382022472 0.8286647992530346 0.7821229050279329


In [34]:
X_train.head(1)

Unnamed: 0,Pclass,Age,Fare,Embarked,Fmembers,Sex_male
692,3,29.699118,56.4958,0,0,True


In [35]:
X_t = [[1, 30, 50, 0, 0, "1"]]
X_pred = pd.DataFrame(X_t, columns=X.columns)
X_pred

Unnamed: 0,Pclass,Age,Fare,Embarked,Fmembers,Sex_male
0,1,30,50,0,0,1


In [36]:
model.predict(X_pred)

array([0], dtype=int64)

In [39]:
survived = []
if model.predict(X_pred) == 0:
    print('survived')
else:
    print('unsurvived')

survived
