In [131]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

rand = 0  # random state

import os

# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Load files

In [132]:
# Read both Titanic CSVs; the training frame is indexed by PassengerId.
train_df = pd.read_csv('/kaggle/input/titanic/train.csv').set_index('PassengerId')
test_df = pd.read_csv('/kaggle/input/titanic/test.csv')

# Preview the first ten training rows.
train_df.head(10)

Cabin, Name and Ticket attributes are going to be deleted.

In [133]:
# Remove the three attributes that are not used as features.
train_df = train_df.drop(columns=['Cabin', 'Name', 'Ticket'])

In [134]:
# Print each column's dtype and non-null count to spot missing values.
train_df.info()

Some attributes have missing values.

In [135]:
# Display summary statistics (count, mean, std, quartiles, ...) for the training frame.
train_df.describe()

<h4>Splitting data</h4>
In this case, we are creating a validation set from the train set.

In [136]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows as a validation set; `rand` fixes the shuffle.
train_set, val_set = train_test_split(
    train_df,
    test_size=0.3,
    random_state=rand,
)


<h3>EDA</h3>

In [137]:
# One histogram per numeric column of the training split.
# The trailing semicolon keeps the array-of-Axes repr out of the cell output.
train_set.hist(figsize=(10,10),grid=False);

In [138]:
# Bar chart of passenger counts per sex in the training split.
# Title and axis labels let the figure stand alone; the trailing semicolon
# suppresses the Axes repr in the cell output.
train_set['Sex'].value_counts().plot(
    kind='bar',
    title='Passengers by sex (training split)',
    xlabel='Sex',
    ylabel='Number of passengers',
);

In [139]:
# Bar chart of passenger counts per port of embarkation in the training split.
# value_counts() leaves out missing values by default, so rows without an
# Embarked value are not shown. Title and labels let the figure stand alone;
# the trailing semicolon suppresses the Axes repr in the cell output.
train_set['Embarked'].value_counts().plot(
    kind='bar',
    title='Passengers by port of embarkation (training split)',
    xlabel='Embarked',
    ylabel='Number of passengers',
);

<h3>Prepare Data</h3>

In [140]:
# Separate the 'Survived' target from the predictors in each split.
train_y = train_set['Survived']
train_X = train_set.drop(columns='Survived')

val_y = val_set['Survived']
val_X = val_set.drop(columns='Survived')

In [141]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# OneHotEncoder is imported in this cell because cat_pipeline below is its
# first use; without it the cell raises NameError on a fresh kernel.
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric branch: fill missing values with the column mean, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')) ,
    ('std_scaler', StandardScaler())
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder())
])

In [142]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Columns routed to each preprocessing branch.
num_attr = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
cat_attr = ['Embarked', 'Sex']

# Run the numeric and categorical pipelines on their respective columns.
full_pipeline = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_attr),
        ('cat', cat_pipeline, cat_attr),
    ]
)

In [143]:
# Fit the preprocessing on the training features only and transform them in one step.
train_X_final = full_pipeline.fit_transform(train_X)

<h3>Training models</h3>

In [144]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Baseline random forest with default hyper-parameters, seeded with `rand`.
forest_model = RandomForestClassifier(random_state=rand).fit(train_X_final, train_y)

# Transform the validation features with the preprocessing already fitted
# on the training split.
val_X_final = full_pipeline.transform(val_X)

# Validation accuracy, printed as a percentage.
predict_val = forest_model.predict(val_X_final)
print(
    accuracy_score(val_y, predict_val) * 100,
    '%',
)

In [145]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron trained on the same preprocessed features.
mlp_model = MLPClassifier(
    learning_rate_init=0.001,
    max_iter=1000,
    random_state=rand,
).fit(train_X_final, train_y)

# Validation accuracy, printed as a percentage.
predict_val = mlp_model.predict(val_X_final)
print(
    accuracy_score(val_y, predict_val) * 100,
    '%',
)

We are going to use RandomForestClassifier for now.

<h3>Hyper-Parameter Tuning</h3>
Using GridSearchCV

In [146]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter combinations to evaluate (3 * 2 * 4 = 24 candidates).
param_grid = [
    {'n_estimators':[10,100,1000],'max_features':['sqrt',1.0], 'max_depth':[None,10,100,1000]}
]

# Seed the forest so the cross-validation scores and the selected parameters
# are reproducible across runs, like the other models in this notebook.
forest = RandomForestClassifier(random_state=rand)
# 5-fold cross-validation, scored by accuracy.
grid = GridSearchCV(forest,param_grid,cv=5,scoring='accuracy',return_train_score=True)
grid.fit(train_X_final,train_y)

In [147]:
# Print the mean cross-validated score next to each parameter combination.
results = grid.cv_results_
for idx, params in enumerate(results['params']):
    print(results['mean_test_score'][idx], params)

In [148]:
# Parameter combination that the grid search selected as best.
grid.best_params_

Let's see which features are the most important.

In [149]:
# Importances of the tuned forest, one per column produced by full_pipeline.
feature_weight = grid.best_estimator_.feature_importances_
# Category labels learned by the one-hot encoder, one array per categorical column.
cat_attr_pipe = full_pipeline.named_transformers_['cat'].named_steps['encoder'].categories_

# Flatten the per-column label arrays into a single list.
cat_attribs = [label for categories in cat_attr_pipe for label in categories]
# Numeric attribute names first, then the one-hot labels.
attr = num_attr + cat_attribs

# Pair each importance with its feature name, largest first.
sorted(zip(feature_weight, attr), reverse=True)


In [150]:
from sklearn.base import clone

# Use every labelled row (training + validation splits) for the final model.
full_X = train_df.drop('Survived',axis=1)
y = train_df['Survived']
# NOTE(review): full_pipeline was fitted on the training split only, so the
# imputation and scaling statistics exclude the validation rows — confirm
# this is intended.
X= full_pipeline.transform(full_X)
# Select the same feature columns, in the same order, from the test data.
test_X=test_df[full_X.columns]
test_X_final= full_pipeline.transform(test_X)

# Fit a fresh copy with the best hyper-parameters instead of refitting
# grid.best_estimator_ in place; the grid-search object then still holds the
# model it selected, and re-running earlier cells gives the same results.
full_forest_model = clone(grid.best_estimator_)
full_forest_model.fit(X,y)



predict_test = full_forest_model.predict(test_X_final)

In [151]:
# Assemble the submission table: one prediction per test passenger.
submission = pd.DataFrame(
    {
        'PassengerId': test_df['PassengerId'],
        'Survived': predict_test,
    }
)
# Write it without the DataFrame index column.
submission.to_csv('submission.csv', index=False)