# Preparation:
Import packages and Load data

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

data = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Cleaning and Exploration:
Check some insights about data

In [22]:
print(data.shape)
print(data.dtypes)
data.head()

(891, 12)
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [23]:
# Checking values of each column
print(data['Survived'].unique())
print(data['Pclass'].unique())
print(data['Sex'].unique())
print(data['Age'].unique())
print(data['SibSp'].unique())
print(data['Parch'].unique())
print(data['Fare'].unique())
print(data['Cabin'].unique())
print(data['Embarked'].unique())

[0 1]
[3 1 2]
['male' 'female']
[22.   38.   26.   35.     nan 54.    2.   27.   14.    4.   58.   20.
 39.   55.   31.   34.   15.   28.    8.   19.   40.   66.   42.   21.
 18.    3.    7.   49.   29.   65.   28.5   5.   11.   45.   17.   32.
 16.   25.    0.83 30.   33.   23.   24.   46.   59.   71.   37.   47.
 14.5  70.5  32.5  12.    9.   36.5  51.   55.5  40.5  44.    1.   61.
 56.   50.   36.   45.5  20.5  62.   41.   52.   63.   23.5   0.92 43.
 60.   10.   64.   13.   48.    0.75 53.   57.   80.   70.   24.5   6.
  0.67 30.5   0.42 34.5  74.  ]
[1 0 3 4 2 5 8]
[0 1 2 5 3 4 6]
[  7.25    71.2833   7.925   53.1      8.05     8.4583  51.8625  21.075
  11.1333  30.0708  16.7     26.55    31.275    7.8542  16.      29.125
  13.      18.       7.225   26.       8.0292  35.5     31.3875 263.
   7.8792   7.8958  27.7208 146.5208   7.75    10.5     82.1708  52.
   7.2292  11.2417   9.475   21.      41.5792  15.5     21.6792  17.8
  39.6875   7.8     76.7292  61.9792  27.75    46.9    

In [52]:
# Creating a function to clean data (convert categorial in numerical and give some gestion to nan values)
sex = {'male' : 0, 'female' : 1}
embark = dict(zip(data['Embarked'].unique(), range(len(data['Embarked'].unique()))))

def clean_data(row):
    
        row['sex_ label'] = sex[row['Sex']]
        row['emb_label'] = embark[row['Embarked']]
        
        return row

features = ['Pclass', 'SibSp', 'Parch', 'Fare', 'sex_ label', 'emb_label']
X = data.apply(clean_data, axis = 1)[features]
X_test = test.apply(clean_data, axis = 1)[features]
X_test['Fare'] = X_test['Fare'].fillna(value = np.mean(X_test['Fare']))
# After see missing data in age variable, I decided ignore it. I think that is a good aproximation for this first iteratiton

# Modeling
In this step I try to find a good model to predict who survived

In [26]:
X_train, x_test, y_train, y_test = train_test_split(X, data['Survived'])
sc = StandardScaler() # Create a standardscaler object
x_train = sc.fit_transform(X_train) # I normalize the data to train
x_test = sc.transform(x_test)

In [27]:
# K Neighbors Classifier
knn = KNeighborsClassifier(n_neighbors = 10).fit(x_train, y_train)
print("The accuracy of K Neighbors Classifier is {}".format(knn.score(x_test, y_test)))

#  Decision Tree Classifier
dtc = DecisionTreeClassifier(max_depth=3).fit(x_train, y_train)
print("The accuracy of Decision Tree Classifier is {}".format(dtc.score(x_test, y_test)))

# Logistic Regression
logreg = LogisticRegression().fit(x_train, y_train)
print("The accuracy of Logistic Regression is {}".format(logreg.score(x_test, y_test)))

# Support Vector Classifier, poly kernel
svcpoly = SVC(kernel='poly', degree = 3).fit(x_train,y_train)
print("The accuracy of Support Vector Classifier, poly kernel is {}".format(svcpoly.score(x_test, y_test))) 

# Support Vector Classifier, rbf kernel
svcrbf = SVC(kernel = 'rbf', gamma=0.1).fit(x_train, y_train)
print("The accuracy of Support Vector Classifier, rbf kernel is {}".format(svcrbf.score(x_test, y_test))) 

# Random Forest Classifier
rfc = RandomForestClassifier(random_state = 0).fit(x_train, y_train)
print("The accuracy of Random Forest Classifier is {}".format(rfc.score(x_test, y_test))) 

# Gradient Boosting Classifier
gbc = GradientBoostingClassifier(random_state = 0, learning_rate=0.2).fit(x_train, y_train)
print("The accuracy of Gradient Boosting Classifier is {}".format(gbc.score(x_test, y_test))) 

The accuracy of K Neighbors Classifier is 0.7713004484304933
The accuracy of Decision Tree Classifier is 0.7757847533632287
The accuracy of Logistic Regression is 0.7757847533632287
The accuracy of Support Vector Classifier, poly kernel is 0.7892376681614349
The accuracy of Support Vector Classifier, rbf kernel is 0.7982062780269058
The accuracy of Random Forest Classifier is 0.7892376681614349
The accuracy of Gradient Boosting Classifier is 0.7982062780269058


# Result
I selected Gradient boosting Classifier

In [53]:
y_result = gbc.predict(sc.transform(X_test))

In [54]:
y_result

array([0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,

In [74]:
ans = pd.DataFrame(np.concatenate((test['PassengerId'].values.reshape(-1, 1), y_result.reshape(-1, 1)), axis = 1), columns = ['PassengerId', 'Survived'])

In [78]:
ans.set_index('PassengerId', inplace = True)
ans.to_csv('data/result.csv')