In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import re
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every
# file found, one per line, in os.walk order.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the two competition splits. `full_set` holds references to both frames
# so that later loops can apply the same in-place transformation to each.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
    )
)
full_set = [train_data, test_data]
# Not the last expression of the cell, so by default this preview is not rendered.
train_data.head()

# Seeded generator so the random age imputation is reproducible under
# "Restart & Run All"; the previous code drew from NumPy's unseeded global state.
AGE_IMPUTE_SEED = 42
age_rng = np.random.default_rng(AGE_IMPUTE_SEED)

# The fare fill value always comes from the training split, for both frames.
train_fare_median = train_data['Fare'].median()

for dataset in full_set:
    # FamilySize = Parch + SibSp + 1; a FamilySize of 1 flags the row as Alone.
    # 'Alone' is assigned first to keep the original column order.
    family_size = dataset['Parch'] + dataset['SibSp'] + 1
    dataset['Alone'] = (family_size == 1).astype(int)
    dataset['FamilySize'] = family_size

    dataset['Embarked'] = dataset['Embarked'].fillna('S')

    # Fill missing ages with random integers drawn from
    # [mean - std, mean + std) of this frame's observed ages.
    avg_age = dataset['Age'].mean()
    std_age = dataset['Age'].std()
    age_missing = dataset['Age'].isnull()
    null_count_age = int(age_missing.sum())
    if null_count_age:
        age_null_list = age_rng.integers(
            int(avg_age - std_age), int(avg_age + std_age), size=null_count_age
        )
        # Single .loc assignment instead of chained indexing
        # (dataset['Age'][mask] = ...), which may write to a temporary copy.
        dataset.loc[age_missing, 'Age'] = age_null_list

    dataset['Age'] = dataset['Age'].astype(int)
    dataset['Fare'] = dataset['Fare'].fillna(train_fare_median)

# Split the training ages into five equal-width intervals for inspection.
train_data['Age_Categories'] = pd.cut(train_data['Age'], bins=5)

# Missing values: print, for each frame, the columns that still contain nulls.
for frame in full_set:
    has_missing = frame.isnull().any()
    print(frame.columns[has_missing])

# Mean survival per age interval.
age_survival = train_data[['Age_Categories', 'Survived']]
print(age_survival.groupby(['Age_Categories'], as_index=False).mean())

train_data.head()



In [None]:
def get_title(name):
    """Extract the honorific from a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. "Mr" in
    "Braund, Mr. Owen Harris".

    Args:
        name: The passenger name. Non-string values (for example NaN for a
            missing name) are treated as having no title.

    Returns:
        The matched title without the trailing period, or "" when no title
        is found.
    """
    if not isinstance(name, str):
        return ""
    # Raw string: "\." inside a normal literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    title = re.search(r' ([A-Za-z]+)\.', name)
    return title.group(1) if title else ""
    
# Titles collapsed into the single 'VIP' group.
vip_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
              'Rev', 'Sir', 'Jonkheer', 'Dona']
# One lookup table covering the VIP grouping and the spelling variants
# that are folded into Miss / Mrs.
title_aliases = {
    **dict.fromkeys(vip_titles, 'VIP'),
    'Mlle': 'Miss',
    'Ms': 'Miss',
    'Mme': 'Mrs',
}

for dataset in full_set:
    extracted_titles = dataset['Name'].apply(get_title)
    dataset['Title'] = extracted_titles.replace(title_aliases)

# Title counts per sex, then mean survival per title, on the training split.
title_by_sex = pd.crosstab(train_data['Title'], train_data['Sex'])
print(title_by_sex)
title_survival = train_data[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()
print(title_survival)
# Not the last expression of the cell, so by default this preview is not rendered.
train_data.head()
print(train_data['Sex'])

In [None]:
# Clean data and map every model feature to a small integer code.
# NOTE: each step overwrites its column in place, so this cell is meant to run
# exactly once after the feature-engineering cells; re-running it on already
# encoded columns would corrupt them.
sex_mapping = {'female': 0, 'male': 1}
# The title cell groups uncommon titles under 'VIP', but this mapping
# previously only listed "Rare", which sent every grouped passenger to the
# fallback code 0. Both labels are accepted and share code 5.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "VIP": 5, "Rare": 5}
embarked_mapping = {'S': 0, 'C': 1, 'Q': 2}

# Right-closed bins, matching the former "<=" thresholds:
#   Fare: (-inf, 7.91] -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2, (31, inf) -> 3
#   Age:  (-inf, 16] -> 0, (16, 32] -> 1, (32, 48] -> 2, (48, 64] -> 3, (64, inf) -> 4
fare_bin_edges = [-np.inf, 7.91, 14.454, 31, np.inf]
age_bin_edges = [-np.inf, 16, 32, 48, 64, np.inf]

for dataset in full_set:
    dataset['Sex'] = dataset['Sex'].map(sex_mapping)

    # Titles that are not in the mapping get code 0.
    dataset['Title'] = dataset['Title'].map(title_mapping).fillna(0)
    dataset['Embarked'] = dataset['Embarked'].map(embarked_mapping)

    # labels=False returns the integer bin index directly.
    dataset['Fare'] = pd.cut(dataset['Fare'], bins=fare_bin_edges, labels=False)
    dataset['Age'] = pd.cut(dataset['Age'], bins=age_bin_edges, labels=False)
    

In [None]:
# Cast the encoded feature columns to plain integers in both frames.
integer_columns = ('Sex', 'Age', 'Title', 'Fare')
for dataset in full_set:
    for column in integer_columns:
        dataset[column] = dataset[column].astype(int)

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.model_selection import RandomizedSearchCV
# Set up the model and the randomized hyperparameter search.
# A fixed seed makes the sampled candidates (and therefore the reported best
# parameters and score) reproducible between runs.
SEARCH_SEED = 42

# Candidate values sampled by the search.
param_grid = {
    'n_estimators': range(8, 20),
    'max_depth': range(6, 10),
    'learning_rate': [.4, .45, .5, .55, .6],
    'colsample_bytree': [.6, .7, .8, .9, 1]
}

X_train = train_data[['Pclass',  'Sex',  'Age',  'Fare',  'Embarked',  'Alone',  'Title']]
y_train = train_data[['Survived']]

# NOTE: despite its name this is a classifier; the variable name is kept so
# any existing references keep working. n_estimators here is only the starting
# value and is overridden by the values sampled from param_grid.
regressor = XGBClassifier(n_estimators = 10, random_state = SEARCH_SEED)

# 50 sampled parameter combinations, each scored by 4-fold cross-validated accuracy.
xgb_random = RandomizedSearchCV(param_distributions=param_grid,
                                    estimator = regressor, scoring = "accuracy",
                                    verbose = 1, n_iter = 50, cv = 4,
                                    random_state = SEARCH_SEED)
xgb_random.fit(X_train,y_train)
print("Best parameters found: ", xgb_random.best_params_)
print("Best accuracy found: ", xgb_random.best_score_)
# Results noted from an earlier run made without a fixed seed (presumably);
# a seeded run may select different parameters:
#Best parameters found:  {'n_estimators': 17, 'max_depth': 7, 'learning_rate': 0.55, 'colsample_bytree': 1}
#Best accuracy found:  0.8238344847089242


In [None]:
from sklearn.base import clone
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# `my_model`, `X_valid` and `y_valid` were never defined anywhere in the
# notebook, so this cell raised NameError on a fresh kernel. Build them here:
# hold out a stratified 20% of the training rows and refit the tuned
# estimator on the remaining 80%.
X_fit, X_valid, y_fit, y_valid = train_test_split(
    X_train,
    y_train['Survived'],
    test_size=0.2,
    random_state=42,
    stratify=y_train['Survived'],
)
my_model = clone(xgb_random.best_estimator_).fit(X_fit, y_fit)

# NOTE(review): the hyperparameters were selected using these rows as well, so
# this hold-out estimate is likely optimistic.
predictions = my_model.predict(X_valid)
# For 0/1 labels the mean absolute error equals the misclassification rate.
print("Mean Absolute Error: " + str(mean_absolute_error(y_valid, predictions)))

In [None]:
# Mean survival rate for each value of the listed features, one table per feature.
for feature in ('Pclass', 'Sex', 'Embarked', 'Alone', 'Fare'):
    feature_vs_survival = train_data[[feature, 'Survived']]
    print(feature_vs_survival.groupby([feature], as_index=False).mean())


In [None]:

# Display the training rows whose Cabin value is missing.
cabin_is_missing = train_data['Cabin'].isna()
train_data.loc[cabin_is_missing]