In [20]:
import pandas as pd
import numpy as np

train_titanic = pd.read_csv('titanic_train.csv')
test_titanic = pd.read_csv('test-2.csv')

data = [train_titanic, test_titanic]

In [21]:
train_titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [22]:
# Let us first create a new variable: the total number of relatives of a passenger
train_titanic['Relatives'] = train_titanic['SibSp']+train_titanic['Parch']
test_titanic['Relatives'] = test_titanic['SibSp']+test_titanic['Parch']
    
# Then we assign to the ages that are null the median age of the group of passengers with 
# same number of relatives and class of the train set, which is more representative
median = train_titanic.groupby(['Relatives','Pclass'])['Age'].median()
train_titanic['Age'] = train_titanic.groupby(['Relatives','Pclass'])['Age'].apply(lambda x: x.fillna(x.median()))

# We assign the same value also to the test set
for k in range(len(test_titanic)):
    if test_titanic['Age'].isnull()[k]:
        test_titanic['Age'][k]= median[test_titanic['Relatives'][k]][test_titanic['Pclass'][k]]


# train_titanic[train_titanic['Age'].isnull()==True]

# We see that there are still NaNs in the "Age" column.
# They have all Relatives == 10: sibsp==8 and Parch==2. 
# They are probably brothers of the same family. 
# I drop it from the train 
rel10 = train_titanic[train_titanic['Relatives']==10].index
train_titanic.drop(rel10, inplace=True)

# In the test set, there remain three NaN values.
# One has 8 SibSp. I suppose he is young.
# The other two have only one SibSp and 9 Parch. I suppose they are the parents.

test_titanic['Age'][188] = 14.5 # same value of the other one with rel==10 in the test and sibsp==8, presumibly a brother.
test_titanic['Age'][342] = 40
test_titanic['Age'][365] = 40

# REMARK: the exact number will not count too much, we'll bin the "Age" column

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_titanic['Age'][k]= median[test_titanic['Relatives'][k]][test_titanic['Pclass'][k]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_titanic['Age'][188] = 14.5 # same value of the other one with rel==10 in the test and sibsp==8, presumibly a brother.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_titanic['Age'][342] = 40
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pand

In [23]:
train_titanic[train_titanic['Age'].isnull()==True]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Relatives


In [24]:
# Fill the missing values of the "Fare" column on the test set with the average fare 

M = test_titanic['Fare'][test_titanic['Fare'].isnull()==False].mean()
test_titanic['Fare'] = test_titanic['Fare'].fillna(M)

# We also reduce skewness of Fare by applying log

train_titanic['Fare'] = np.log(train_titanic['Fare']+0.01)
test_titanic['Fare'] = np.log(test_titanic['Fare']+0.01)

In [25]:
# We now create new features.
# First, we extract the title from the name and turn it into dummy variables

train_titanic['Title']= train_titanic['Name'].str.extract(pat= ' ([A-Za-z]+)\.')
train_titanic = pd.get_dummies(train_titanic,columns=['Title'])

test_titanic['Title']= test_titanic['Name'].str.extract(pat= ' ([A-Za-z]+)\.')
test_titanic = pd.get_dummies(test_titanic,columns=['Title'])

In [26]:
# We also extract the surnames

train_titanic['Surname']= train_titanic['Name'].str.extract(pat= '([A-Za-z]+)\, ')
test_titanic['Surname']= test_titanic['Name'].str.extract(pat= '([A-Za-z]+)\, ')

# set surname value = 3 to the ones that do not have relatives

train_titanic['Surname'] = np.where(train_titanic['Relatives']==0, 3, train_titanic['Surname']) 
test_titanic['Surname'] = np.where(test_titanic['Relatives']==0, 3, test_titanic['Surname'])

# We create a function that for a surname 'sur' returns a survival rate of the people with surname == sur

def survival_rate(sur):
    surv_rate = train_titanic[train_titanic['Surname'] == sur ]['Survived'].mean()
    if surv_rate <= 0.3: m = 0
    elif surv_rate <= 0.6: m = 1
    else: m = 2 
    return m

# Create a new column using this information

train_titanic['survival_rate'] = train_titanic['Surname']

for sur in train_titanic['Surname'].unique():
    if sur == 3: continue # If it has no relatives with same surname, continue
    else:                 # Otherwise assign the survival rate of that surname
        m = survival_rate(sur)
        train_titanic['survival_rate'] = np.where(train_titanic['Surname'] == sur, m , train_titanic['survival_rate'])
        
# We use this on the test set: 

test_titanic['survival_rate'] = test_titanic['Surname']

for sur in test_titanic['Surname'].unique():
    if sur == 3: continue
    elif sur in train_titanic['Surname'].unique(): # If the surname appeared in the test set
        m = survival_rate(sur)
        # Assign the survival rate of that surname
        test_titanic['survival_rate'] = np.where(test_titanic['Surname'] == sur, m , test_titanic['survival_rate']) 
    else:
        # Otherwise, assign a new value, say = 4
        test_titanic['survival_rate'] = np.where(test_titanic['Surname'] == sur, 4 , test_titanic['survival_rate']) 
    
# Finally, turn survival_rate into a dummy variable

train_titanic = pd.get_dummies(train_titanic,columns=['survival_rate'])
test_titanic = pd.get_dummies(test_titanic,columns=['survival_rate'])

In [27]:
# Extract some other new variables that might help

train_titanic['Age_Class']= train_titanic['Age']* train_titanic['Pclass']
test_titanic['Age_Class']= test_titanic['Age']* test_titanic['Pclass']

train_titanic['Fare_Per_Person'] = train_titanic['Fare']/(train_titanic['Relatives']+1)
test_titanic['Fare_Per_Person']= test_titanic['Fare']/(test_titanic['Relatives']+1)

In [33]:
# Finally, turn "Sex" into a numerical variable

gender = {"male": 0, "female": 1}
train_titanic['Sex'] = train_titanic['Sex'].map(gender)
test_titanic['Sex'] = test_titanic['Sex'].map(gender)

# and bin "Fare" to reduce noise

train_titanic['Fare']=pd.qcut(train_titanic['Fare'],5,labels=False)
test_titanic['Fare']=pd.qcut(test_titanic['Fare'],5,labels=False)

ValueError: Bin edges must be unique: array([0. , 0. , 1. , 2.8, 3.4, 4. ]).
You can drop duplicate edges by setting the 'duplicates' kwarg

In [32]:
# Here are all the columns of our train set:

train_titanic[finals].head()

Unnamed: 0_level_0,Sex,Title_Mr,survival_rate_2,Title_Mrs,survival_rate_0,Fare_Per_Person,Age_Class,Fare,Pclass
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,1,0,0,1,0.99119,66.0,0,3
2,1,0,1,1,0,2.133401,38.0,4,1
3,1,0,0,0,0,2.071283,78.0,1,3
4,1,0,0,1,0,1.986183,35.0,4,1
5,0,1,0,0,0,2.086914,105.0,1,3


Now we're ready for the ML!

In [31]:
# We keep only the columns that after some trials revealed to be the most important

finals = ['Sex', 'Title_Mr', 'survival_rate_2', 'Title_Mrs',
         'survival_rate_0', 'Fare_Per_Person', 'Age_Class',
         'Fare', 'Pclass']

# Set 'PassengerId' as index

train_titanic.set_index('PassengerId',inplace=True)
test_titanic.set_index('PassengerId',inplace=True)

# and extract our final dataframes

X = train_titanic[finals]
y = train_titanic['Survived']

In [18]:
# We implement a random forest with max_depth=5, n_estimators=100

from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(max_depth=5,n_estimators=100)
forest.fit(X, y)
ypred = forest.predict(test_titanic[finals])


In [19]:
ext = pd.DataFrame({'Survived': ypred},test_titanic.index)
ext.to_csv(r'Desktopu')

In [17]:
from sklearn.ensemble import GradientBoostingClassifier

GB = GradientBoostingClassifier().fit(X,y)
GB.fit(X, y)
ypred = GB.predict(test_titanic[finals])

In [None]:
# score of this code: 0.76076. Quite sad, considering that the gender submission scores 0.76.

# max score obtained: 0.78468