In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config Completer.use_jedi = False

In [2]:
# !unzip ./titanic.zip

In [3]:
train_df = pd.read_csv('./train.csv')
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
pd.set_option('display.max_rows', train_df.shape[0]+1)

In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


### Problem Defination:
It's a **Binary Classification Problem**.

### Data
Data set is acquired from Kaggle itself
<a href='https://www.kaggle.com/c/titanic/data'>From Here</a>

* survival	Survival	      0 = No, 1 = Yes

* pclass      Ticket class	  1 = 1st, 2 = 2nd, 3 = 3rd

* sex	        Sex	

* Age	        Age in years	

* sibsp	    # of siblings / spouses aboard the Titanic	

* parch	    # of parents / children aboard the Titanic	

* ticket	    Ticket number	

* fare	    Passenger fare	

* cabin	    Cabin number	

* embarked	Port of Embarkation

Variable Notes

* pclass: A proxy for socio-economic status (SES)
    * 1st = Upper
    * 2nd = Middle
    * 3rd = Lower

* age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5

* sibsp: The dataset defines family relations in this way...
* Sibling = brother, sister, stepbrother, stepsister
* Spouse = husband, wife (mistresses and fiancÃ©s were ignored)

* parch: The dataset defines family relations in this way...
* Parent = mother, father
* Child = daughter, son, stepdaughter, stepson
* Some children travelled only with a nanny, therefore parch=0 for them.

## Evaluation Metric
> Accuracy

# Framework to be followed
1. Check for NaN values
2. Fill the NaN values
3. Discard the unnecessary cols(if any)
4. Make All data numerical/categorical
5. Split into X and y
6. Train and evaluate the model
7. Test the model

## Check for NaN values

In [6]:
train_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

## Fill the NaN values

**Filling age according to Sex and PClass**

In [7]:
# Median age of Males
round(train_df[train_df['Sex']=='male']['Age'].median(), 0)

29.0

In [8]:
# Median age of Females
round(train_df[train_df['Sex']=='female']['Age'].median(),
     0)

27.0

In [9]:
# Grouping the Age by Sex and Pclass
train_df.groupby(['Sex', 'Pclass']).median()['Age']

Sex     Pclass
female  1         35.0
        2         28.0
        3         21.5
male    1         40.0
        2         30.0
        3         25.0
Name: Age, dtype: float64

In [10]:
train_df['Age']=train_df.groupby(['Sex', 'Pclass'])['Age'].apply(lambda x:x.fillna(x.median()))

In [11]:
train_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [12]:
train_df[train_df.Embarked.isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [13]:
train_df.Embarked.mode()

0    S
dtype: object

In [14]:
train_df.Embarked.fillna('S', inplace=True)

In [15]:
train_df.Embarked.isna().sum()

0

In [16]:
# train_df.groupby(['Survived', 'Cabin']).head()

**Note** : We'll be leaving `Cabin` column

In [17]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


### Discarding the Unnecessarry columns:
* Name
* Cabin
* Ticket

In [18]:
X = train_df.drop(['Name', 'Cabin', 'Ticket', 'Survived'], axis = 1)

In [19]:
y = train_df.Survived

### Converting the Columns into categories

In [20]:
X['Sex'] = X.Sex.map({'male':1, 'female': 0})

In [21]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Sex          891 non-null    int64  
 3   Age          891 non-null    float64
 4   SibSp        891 non-null    int64  
 5   Parch        891 non-null    int64  
 6   Fare         891 non-null    float64
 7   Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(1)
memory usage: 55.8+ KB


In [22]:
X.Embarked.unique()

array(['S', 'C', 'Q'], dtype=object)

In [24]:
X.Embarked = X.Embarked.map({'S':1,
               'C':2,
               'Q':3})

In [25]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Sex          891 non-null    int64  
 3   Age          891 non-null    float64
 4   SibSp        891 non-null    int64  
 5   Parch        891 non-null    int64  
 6   Fare         891 non-null    float64
 7   Embarked     891 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 55.8 KB


### Training and Evaluating the 3 models :
1. RandomForestClassifier
2. LogisticRegression
3. CatboostClassifier

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
X_train, X_val, y_train, y_val = train_test_split(X,y,
                                                 test_size = 0.2)

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

In [30]:
from sklearn.metrics import accuracy_score

In [31]:
models = {'RFC':RandomForestClassifier(),
         'LR':LogisticRegression(),
         'CatboostClass':CatBoostClassifier()}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    def_score = model.score(X_val, y_val)
    print(f'Model : {model_name} scored : {def_score}')

Model : RFC scored : 0.8324022346368715
Model : LR scored : 0.8044692737430168


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Learning rate set to 0.008911
0:	learn: 0.6896860	total: 183ms	remaining: 3m 2s
1:	learn: 0.6851192	total: 184ms	remaining: 1m 31s
2:	learn: 0.6802603	total: 185ms	remaining: 1m 1s
3:	learn: 0.6747858	total: 187ms	remaining: 46.6s
4:	learn: 0.6709686	total: 188ms	remaining: 37.5s
5:	learn: 0.6665014	total: 190ms	remaining: 31.4s
6:	learn: 0.6611687	total: 191ms	remaining: 27.1s
7:	learn: 0.6563444	total: 192ms	remaining: 23.9s
8:	learn: 0.6513759	total: 194ms	remaining: 21.3s
9:	learn: 0.6463226	total: 195ms	remaining: 19.3s
10:	learn: 0.6417577	total: 196ms	remaining: 17.7s
11:	learn: 0.6380413	total: 197ms	remaining: 16.2s
12:	learn: 0.6344312	total: 198ms	remaining: 15s
13:	learn: 0.6302857	total: 199ms	remaining: 14s
14:	learn: 0.6259775	total: 200ms	remaining: 13.2s
15:	learn: 0.6222320	total: 202ms	remaining: 12.4s
16:	learn: 0.6174718	total: 203ms	remaining: 11.7s
17:	learn: 0.6135865	total: 204ms	remaining: 11.1s
18:	learn: 0.6093063	total: 205ms	remaining: 10.6s
19:	learn: 0.6

272:	learn: 0.3674193	total: 501ms	remaining: 1.33s
273:	learn: 0.3672096	total: 502ms	remaining: 1.33s
274:	learn: 0.3670326	total: 503ms	remaining: 1.33s
275:	learn: 0.3669176	total: 504ms	remaining: 1.32s
276:	learn: 0.3668362	total: 505ms	remaining: 1.32s
277:	learn: 0.3665595	total: 506ms	remaining: 1.31s
278:	learn: 0.3663399	total: 508ms	remaining: 1.31s
279:	learn: 0.3661474	total: 509ms	remaining: 1.31s
280:	learn: 0.3658625	total: 510ms	remaining: 1.3s
281:	learn: 0.3656644	total: 512ms	remaining: 1.3s
282:	learn: 0.3654394	total: 513ms	remaining: 1.3s
283:	learn: 0.3651509	total: 514ms	remaining: 1.3s
284:	learn: 0.3648678	total: 516ms	remaining: 1.29s
285:	learn: 0.3644469	total: 517ms	remaining: 1.29s
286:	learn: 0.3644310	total: 518ms	remaining: 1.29s
287:	learn: 0.3639100	total: 519ms	remaining: 1.28s
288:	learn: 0.3636086	total: 520ms	remaining: 1.28s
289:	learn: 0.3632593	total: 522ms	remaining: 1.28s
290:	learn: 0.3629997	total: 524ms	remaining: 1.27s
291:	learn: 0.36

535:	learn: 0.3163731	total: 846ms	remaining: 732ms
536:	learn: 0.3162076	total: 848ms	remaining: 731ms
537:	learn: 0.3161404	total: 849ms	remaining: 729ms
538:	learn: 0.3160563	total: 851ms	remaining: 728ms
539:	learn: 0.3159495	total: 852ms	remaining: 726ms
540:	learn: 0.3158528	total: 853ms	remaining: 724ms
541:	learn: 0.3157211	total: 854ms	remaining: 722ms
542:	learn: 0.3156973	total: 856ms	remaining: 720ms
543:	learn: 0.3155842	total: 857ms	remaining: 718ms
544:	learn: 0.3155174	total: 858ms	remaining: 716ms
545:	learn: 0.3154489	total: 859ms	remaining: 714ms
546:	learn: 0.3154318	total: 860ms	remaining: 712ms
547:	learn: 0.3152514	total: 862ms	remaining: 711ms
548:	learn: 0.3149387	total: 863ms	remaining: 709ms
549:	learn: 0.3148251	total: 864ms	remaining: 707ms
550:	learn: 0.3146811	total: 866ms	remaining: 705ms
551:	learn: 0.3144731	total: 867ms	remaining: 703ms
552:	learn: 0.3143129	total: 868ms	remaining: 702ms
553:	learn: 0.3142834	total: 869ms	remaining: 700ms
554:	learn: 

792:	learn: 0.2814087	total: 1.19s	remaining: 312ms
793:	learn: 0.2811406	total: 1.2s	remaining: 310ms
794:	learn: 0.2810645	total: 1.2s	remaining: 309ms
795:	learn: 0.2809665	total: 1.2s	remaining: 307ms
796:	learn: 0.2808494	total: 1.2s	remaining: 306ms
797:	learn: 0.2807257	total: 1.2s	remaining: 304ms
798:	learn: 0.2804502	total: 1.2s	remaining: 303ms
799:	learn: 0.2803972	total: 1.2s	remaining: 301ms
800:	learn: 0.2803145	total: 1.21s	remaining: 300ms
801:	learn: 0.2800728	total: 1.21s	remaining: 298ms
802:	learn: 0.2799414	total: 1.21s	remaining: 297ms
803:	learn: 0.2797559	total: 1.21s	remaining: 295ms
804:	learn: 0.2796558	total: 1.21s	remaining: 294ms
805:	learn: 0.2795847	total: 1.21s	remaining: 292ms
806:	learn: 0.2793800	total: 1.22s	remaining: 291ms
807:	learn: 0.2793200	total: 1.22s	remaining: 289ms
808:	learn: 0.2790901	total: 1.22s	remaining: 288ms
809:	learn: 0.2790522	total: 1.22s	remaining: 286ms
810:	learn: 0.2789637	total: 1.22s	remaining: 284ms
811:	learn: 0.27876

Model : CatboostClass scored : 0.8324022346368715


In [34]:
model.get_all_params()

{'nan_mode': 'Min',
 'eval_metric': 'Logloss',
 'iterations': 1000,
 'sampling_frequency': 'PerTree',
 'leaf_estimation_method': 'Newton',
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Plain',
 'model_shrink_mode': 'Constant',
 'feature_border_type': 'GreedyLogSum',
 'bayesian_matrix_reg': 0.10000000149011612,
 'l2_leaf_reg': 3,
 'random_strength': 1,
 'rsm': 1,
 'boost_from_average': False,
 'model_size_reg': 0.5,
 'pool_metainfo_options': {'tags': {}},
 'subsample': 0.800000011920929,
 'use_best_model': False,
 'class_names': [0, 1],
 'random_seed': 0,
 'depth': 6,
 'posterior_sampling': False,
 'border_count': 254,
 'classes_count': 0,
 'auto_class_weights': 'None',
 'sparse_features_conflict_fraction': 0,
 'leaf_estimation_backtracking': 'AnyImprovement',
 'best_model_min_trees': 1,
 'model_shrink_rate': 0,
 'min_data_in_leaf': 1,
 'loss_function': 'Logloss',
 'learning_rate': 0.00891099963337183,
 'score_function': 'Cosine',
 'task_type': 'CPU',


In [36]:
cat_model = CatBoostClassifier(iterations=1000, 
                              eval_metric = 'Accuracy')
cat_model.fit(X_train, y_train)
cat_model.score(X_val, y_val)

Learning rate set to 0.008911
0:	learn: 0.7893258	total: 19.6ms	remaining: 19.6s
1:	learn: 0.7893258	total: 20.4ms	remaining: 10.2s
2:	learn: 0.7893258	total: 21.2ms	remaining: 7.04s
3:	learn: 0.8075843	total: 22.5ms	remaining: 5.6s
4:	learn: 0.7921348	total: 23.4ms	remaining: 4.66s
5:	learn: 0.7977528	total: 24.6ms	remaining: 4.07s
6:	learn: 0.8033708	total: 25.7ms	remaining: 3.65s
7:	learn: 0.8146067	total: 26.8ms	remaining: 3.33s
8:	learn: 0.8230337	total: 28ms	remaining: 3.08s
9:	learn: 0.8216292	total: 29.5ms	remaining: 2.92s
10:	learn: 0.8202247	total: 30.8ms	remaining: 2.77s
11:	learn: 0.8146067	total: 31.6ms	remaining: 2.6s
12:	learn: 0.8075843	total: 32.3ms	remaining: 2.45s
13:	learn: 0.8117978	total: 33.4ms	remaining: 2.35s
14:	learn: 0.8117978	total: 34.6ms	remaining: 2.27s
15:	learn: 0.8160112	total: 35.8ms	remaining: 2.2s
16:	learn: 0.8174157	total: 36.9ms	remaining: 2.13s
17:	learn: 0.8202247	total: 38.1ms	remaining: 2.08s
18:	learn: 0.8202247	total: 39.3ms	remaining: 2.0

284:	learn: 0.8539326	total: 344ms	remaining: 863ms
285:	learn: 0.8525281	total: 345ms	remaining: 863ms
286:	learn: 0.8525281	total: 346ms	remaining: 860ms
287:	learn: 0.8525281	total: 347ms	remaining: 859ms
288:	learn: 0.8525281	total: 349ms	remaining: 858ms
289:	learn: 0.8525281	total: 350ms	remaining: 858ms
290:	learn: 0.8539326	total: 352ms	remaining: 857ms
291:	learn: 0.8539326	total: 353ms	remaining: 855ms
292:	learn: 0.8539326	total: 354ms	remaining: 854ms
293:	learn: 0.8539326	total: 355ms	remaining: 853ms
294:	learn: 0.8539326	total: 357ms	remaining: 852ms
295:	learn: 0.8539326	total: 358ms	remaining: 852ms
296:	learn: 0.8539326	total: 359ms	remaining: 851ms
297:	learn: 0.8539326	total: 361ms	remaining: 850ms
298:	learn: 0.8539326	total: 362ms	remaining: 849ms
299:	learn: 0.8539326	total: 363ms	remaining: 847ms
300:	learn: 0.8539326	total: 364ms	remaining: 846ms
301:	learn: 0.8539326	total: 366ms	remaining: 845ms
302:	learn: 0.8539326	total: 367ms	remaining: 844ms
303:	learn: 

528:	learn: 0.8806180	total: 681ms	remaining: 607ms
529:	learn: 0.8806180	total: 683ms	remaining: 605ms
530:	learn: 0.8820225	total: 684ms	remaining: 604ms
531:	learn: 0.8820225	total: 685ms	remaining: 603ms
532:	learn: 0.8820225	total: 686ms	remaining: 601ms
533:	learn: 0.8820225	total: 688ms	remaining: 600ms
534:	learn: 0.8806180	total: 689ms	remaining: 599ms
535:	learn: 0.8806180	total: 690ms	remaining: 598ms
536:	learn: 0.8806180	total: 692ms	remaining: 596ms
537:	learn: 0.8806180	total: 693ms	remaining: 595ms
538:	learn: 0.8806180	total: 694ms	remaining: 594ms
539:	learn: 0.8806180	total: 696ms	remaining: 593ms
540:	learn: 0.8806180	total: 697ms	remaining: 591ms
541:	learn: 0.8792135	total: 698ms	remaining: 590ms
542:	learn: 0.8806180	total: 699ms	remaining: 589ms
543:	learn: 0.8792135	total: 701ms	remaining: 587ms
544:	learn: 0.8792135	total: 702ms	remaining: 586ms
545:	learn: 0.8792135	total: 703ms	remaining: 585ms
546:	learn: 0.8792135	total: 704ms	remaining: 583ms
547:	learn: 

777:	learn: 0.9002809	total: 1.02s	remaining: 293ms
778:	learn: 0.9016854	total: 1.03s	remaining: 291ms
779:	learn: 0.9016854	total: 1.03s	remaining: 290ms
780:	learn: 0.9016854	total: 1.03s	remaining: 289ms
781:	learn: 0.9016854	total: 1.03s	remaining: 287ms
782:	learn: 0.9016854	total: 1.03s	remaining: 286ms
783:	learn: 0.9016854	total: 1.03s	remaining: 285ms
784:	learn: 0.9030899	total: 1.03s	remaining: 283ms
785:	learn: 0.9030899	total: 1.03s	remaining: 282ms
786:	learn: 0.9030899	total: 1.04s	remaining: 281ms
787:	learn: 0.9030899	total: 1.04s	remaining: 279ms
788:	learn: 0.9030899	total: 1.04s	remaining: 278ms
789:	learn: 0.9016854	total: 1.04s	remaining: 277ms
790:	learn: 0.9030899	total: 1.04s	remaining: 275ms
791:	learn: 0.9030899	total: 1.04s	remaining: 274ms
792:	learn: 0.9030899	total: 1.04s	remaining: 273ms
793:	learn: 0.9030899	total: 1.05s	remaining: 271ms
794:	learn: 0.9030899	total: 1.05s	remaining: 270ms
795:	learn: 0.9030899	total: 1.05s	remaining: 269ms
796:	learn: 

0.8324022346368715

## Making Predictions on TEST data

In [37]:
test_df = pd.read_csv('./test.csv')
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [38]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [39]:
test_df.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [40]:
test_df['Age']=test_df.groupby(['Sex', 'Pclass'])['Age'].apply(lambda x:x.fillna(x.median()))

In [41]:
test_df.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [48]:
test_df.Fare.fillna(test_df.Fare.mean(), inplace = True)

In [49]:
test_df.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64

In [50]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          418 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         418 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [52]:
X_test = test_df.drop(['Name', 'Ticket', 'Cabin'], axis = 1)

In [53]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Sex          418 non-null    object 
 3   Age          418 non-null    float64
 4   SibSp        418 non-null    int64  
 5   Parch        418 non-null    int64  
 6   Fare         418 non-null    float64
 7   Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 26.2+ KB


In [54]:
X_test['Sex'] = X_test.Sex.map({'male':1, 'female': 0})
X_test.Embarked = X_test.Embarked.map({'S':1,
               'C':2,
               'Q':3})

In [55]:
cat_model.predict(X_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [56]:
submission = pd.DataFrame()

In [57]:
submission['PassengerId'] = X_test.PassengerId

In [58]:
submission['Survived'] = cat_model.predict(X_test)

In [62]:
submission.to_csv('./submission-1.csv', index = False)