# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import os
from xgboost import XGBClassifier
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [2]:
train_df = pd.read_csv('../input/titanic/train.csv')
test_df = pd.read_csv('../input/titanic/test.csv')
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Train Data Preprocessing

In [3]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
train_df.shape

(891, 12)

In [5]:
train_df.corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [6]:
del train_df['PassengerId']
del train_df['Name']
del train_df['Ticket']
del train_df['Cabin']
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [7]:
def sex_to_categorical(row):
    if row.Sex == 'male':
        return 0
    else:
        return 1

def embark_to_categorical(row):
    if row.Embarked == 'C':
        return 0
    elif row.Embarked == 'Q':
        return 1
    else:
        return 2

In [8]:
train_df['sex'] = [sex_to_categorical(train_df.loc[i]) for i in tqdm(range(train_df.shape[0]))]
train_df['embarked'] = [embark_to_categorical(train_df.loc[i]) for i in tqdm(range(train_df.shape[0]))]

100%|██████████| 891/891 [00:00<00:00, 8273.61it/s]
100%|██████████| 891/891 [00:00<00:00, 5892.09it/s]


In [9]:
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,sex,embarked
0,0,3,male,22.0,1,0,7.25,S,0,2
1,1,1,female,38.0,1,0,71.2833,C,1,0
2,1,3,female,26.0,0,0,7.925,S,1,2
3,1,1,female,35.0,1,0,53.1,S,1,2
4,0,3,male,35.0,0,0,8.05,S,0,2


In [10]:
del train_df['Sex']
del train_df['Embarked']
train_df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,sex,embarked
0,0,3,22.0,1,0,7.25,0,2
1,1,1,38.0,1,0,71.2833,1,0
2,1,3,26.0,0,0,7.925,1,2
3,1,1,35.0,1,0,53.1,1,2
4,0,3,35.0,0,0,8.05,0,2


# Generating Data

In [11]:
X = train_df.drop('Survived',axis=1)
Y = train_df['Survived']
print('Train data shape: {}'.format(X.shape))
print('Train labels shape: {}'.format(Y.shape))

Train data shape: (891, 7)
Train labels shape: (891,)


# Train-Validation Split

In [12]:
x_train, x_val, y_train, y_val = train_test_split(X, Y, test_size = 0.1, random_state=1)
print(f'x train shape: {x_train.shape}')
print(f'y train shape: {y_train.shape}')
print(f'x val shape: {x_val.shape}')
print(f'y val shape: {y_val.shape}')

x train shape: (801, 7)
y train shape: (801,)
x val shape: (90, 7)
y val shape: (90,)


# Model Training

In [13]:
model_xgb = XGBClassifier(n_estimators = 150000,
                          learning_rate = 0.001,
                          tree_method = 'gpu_hist',
                          gpu_id= 0
                         )
model_xgb.fit(X,Y,
              eval_set = [(x_val,y_val)],
              early_stopping_rounds = 5,
              verbose = True
             )



[0]	validation_0-logloss:0.69264
[1]	validation_0-logloss:0.69214
[2]	validation_0-logloss:0.69164
[3]	validation_0-logloss:0.69114
[4]	validation_0-logloss:0.69064
[5]	validation_0-logloss:0.69014
[6]	validation_0-logloss:0.68964
[7]	validation_0-logloss:0.68914
[8]	validation_0-logloss:0.68865
[9]	validation_0-logloss:0.68815
[10]	validation_0-logloss:0.68766
[11]	validation_0-logloss:0.68716
[12]	validation_0-logloss:0.68667
[13]	validation_0-logloss:0.68618
[14]	validation_0-logloss:0.68569
[15]	validation_0-logloss:0.68522
[16]	validation_0-logloss:0.68476
[17]	validation_0-logloss:0.68427
[18]	validation_0-logloss:0.68380
[19]	validation_0-logloss:0.68334
[20]	validation_0-logloss:0.68287
[21]	validation_0-logloss:0.68241
[22]	validation_0-logloss:0.68193
[23]	validation_0-logloss:0.68146
[24]	validation_0-logloss:0.68100
[25]	validation_0-logloss:0.68054
[26]	validation_0-logloss:0.68005
[27]	validation_0-logloss:0.67959
[28]	validation_0-logloss:0.67914
[29]	validation_0-loglos

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=0, importance_type=None,
              interaction_constraints='', learning_rate=0.001, max_delta_step=0,
              max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=150000, n_jobs=2,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='gpu_hist', validate_parameters=1, verbosity=None)

# Test Data Preprocessing

In [14]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [15]:
passenger_id = pd.DataFrame()
passenger_id['PassengerId'] = test_df['PassengerId']

In [16]:
test_df['sex'] = [sex_to_categorical(test_df.loc[i]) for i in tqdm(range(test_df.shape[0]))]
test_df['embarked'] = [embark_to_categorical(test_df.loc[i]) for i in tqdm(range(test_df.shape[0]))]
del test_df['Sex']
del test_df['Embarked']
del test_df['Cabin']
del test_df['Ticket']
del test_df['Name']
del test_df['PassengerId']
test_df.head()

100%|██████████| 418/418 [00:00<00:00, 7760.42it/s]
100%|██████████| 418/418 [00:00<00:00, 7035.33it/s]


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,sex,embarked
0,3,34.5,0,0,7.8292,0,1
1,3,47.0,1,0,7.0,1,2
2,2,62.0,0,0,9.6875,0,1
3,3,27.0,0,0,8.6625,0,2
4,3,22.0,1,1,12.2875,1,2


# Preditions on Test set

In [17]:
y_pred = model_xgb.predict(test_df)

In [18]:
sample_submission = pd.read_csv('../input/titanic/gender_submission.csv')
sample_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [19]:
result = pd.DataFrame()
result['PassengerId'] = passenger_id['PassengerId']
result['Survived'] = y_pred
result.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [20]:
result.shape

(418, 2)

# Submission File 

In [21]:
result.to_csv('/kaggle/working/submission.csv', index=False)
print('Done!!!!')

Done!!!!
