In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/train.csv


**1. Input our data and than DecisionTreeClassifier:**

In [2]:
from sklearn.tree import DecisionTreeClassifier


**2. Load our data from .csv files (first 10 rows), and inspect it:**

In [3]:
my_data = pd.read_csv('/kaggle/input/titanic/train.csv')
my_data.head(10)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


**3. Check our data for data-gaps:**

In [4]:
my_data.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

**4. We have 2 different arrays (train and test):**

In [5]:
train_df = pd.read_csv('/kaggle/input/titanic/train.csv')
test_df = pd.read_csv('/kaggle/input/titanic/test.csv')

 5. Let's choose the most significant parameters for the classifier that can affect the passenger's survival. We can discard the rest. For example, it is important:
* Accommodation of a passenger (more chances to survive)
* Gender (affects physical strength - more likely to survive)
* Couples on board (affects survival)
* Having children on board (also affects survival)
 
 The rest of the factors can have less or no influence, so we discard them when substituting independent variables into our function X.

In [6]:
X = train_df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']].values
X[0:5]

array([[3, 'male', 22.0, 1, 0],
       [1, 'female', 38.0, 1, 0],
       [3, 'female', 26.0, 0, 0],
       [1, 'female', 35.0, 1, 0],
       [3, 'male', 35.0, 0, 0]], dtype=object)

In [7]:
X_test = test_df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']].values
X[0:5]

array([[3, 'male', 22.0, 1, 0],
       [1, 'female', 38.0, 1, 0],
       [3, 'female', 26.0, 0, 0],
       [1, 'female', 35.0, 1, 0],
       [3, 'male', 35.0, 0, 0]], dtype=object)

**6. We have categorical axes (Gender), change its value to binary (1 or 0), turn male in "1", female in "0"**

In [8]:
from sklearn import preprocessing
sex = preprocessing.LabelEncoder()
sex.fit(['female','male'])
X[:,1] = sex.transform(X[:,1]) 
X[0:5]


array([[3, 1, 22.0, 1, 0],
       [1, 0, 38.0, 1, 0],
       [3, 0, 26.0, 0, 0],
       [1, 0, 35.0, 1, 0],
       [3, 1, 35.0, 0, 0]], dtype=object)

* **6.1. And the same for test dataset:**

In [9]:
X_test[:,1] = sex.transform(X_test[:,1]) 
X_test[0:5]

array([[3, 1, 34.5, 0, 0],
       [3, 0, 47.0, 1, 0],
       [2, 1, 62.0, 0, 0],
       [3, 1, 27.0, 0, 0],
       [3, 0, 22.0, 1, 1]], dtype=object)

**7. Now fill the target variable Y:**

In [10]:
y_train = train_df["Survived"]
y_train [0:5]

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

**8. Then count and fill missing values:**

In [11]:
missing_val_count_by_column = (train_df.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

Age         177
Cabin       687
Embarked      2
dtype: int64


**9. We have troubles with "Age" (missing values, NaNs). So use inmuter:**

In [12]:
from sklearn.impute import SimpleImputer
my_imputer = SimpleImputer()
X_clean = my_imputer.fit_transform(X)
X_test_clean = my_imputer.fit_transform(X_test)

**10. At last we make train our model with "clean" data and than make predictions on the testing dataset and store it into a variable called "Y_pred"**

In [13]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_clean, y_train)
Y_pred = decision_tree.predict(X_test_clean)

**11. We calculate the accurancy of our model:**

In [14]:
acc_decision_tree = decision_tree.score(X_clean, y_train)
acc_decision_tree

0.920314253647587

**12. Final, generate the output of our data:**

In [15]:
test_df ["Survived"] = Y_pred
new = test_df [['PassengerId','Survived']]
new.to_csv('pred.csv',index=False)