In [0]:
import pandas as pd
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas' own diagnostics; consider narrowing or removing it.
warnings.filterwarnings('ignore')

# Load the test and train splits of the kaggle-titanic data straight from
# GitHub (needs network access on every fresh run).
test = pd.read_csv("https://raw.githubusercontent.com/agconti/kaggle-titanic/master/data/test.csv")
train = pd.read_csv("https://raw.githubusercontent.com/agconti/kaggle-titanic/master/data/train.csv")

In [35]:
# Inspect which columns are available as candidate features.

train.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'Family'],
      dtype='object')

In [0]:
# Build a coarsened copy of the training data for the exploratory group-bys;
# `train` itself is left untouched.

train1 = train.copy()
# round(-1) snaps to the nearest 10, turning Age and Fare into 10-wide buckets.
train1['Age'] = train1['Age'].round(-1)
train1['Fare'] = train1['Fare'].round(-1)
# Coarser fare bucket: nearest 100. The previous round(-10) rounded to the
# nearest 10**10, which collapses every fare to 0.0.
# NOTE(review): a width of 100 is an assumption -- confirm the intended granularity.
train1['Fare1'] = train1['Fare'].round(-2)
# Keep only the first character of the cabin string. astype(str) stringifies
# missing cabins first, so they form their own single-letter group.
train1['Cabin'] = train1['Cabin'].astype(str).str[0]
# Family size = siblings/spouses + parents/children.
train1['Family'] = train1['SibSp'] + train1['Parch']

In [4]:
# Determine the % of people that survived based on class, sex, age, SibSp, parch, ticket, fare, cabin, embarked 

def group_by_mean_count(df, column):
  """Print per-group mean and count of ``Survived`` for each value of ``column``.

  Rows of ``df`` are grouped by ``df[column]``. The mean of ``Survived`` per
  group is printed first, followed by the number of ``Survived`` values per
  group. Nothing is returned.
  """
  survived_by_group = df['Survived'].groupby(df[column])
  for summary in (survived_by_group.mean(), survived_by_group.count()):
    print(summary)

# Pclass: survival differs markedly between class 1 and class 3, so use it for classification.
group_by_mean_count(train1, 'Pclass')

# Age (rounded to the nearest 10): the youngest bucket stands out from the 20s, so use it for classification.
group_by_mean_count(train1, 'Age')

# Sex: large gap between female and male, so use it for classification.
group_by_mean_count(train1, 'Sex')

# Fare (rounded to the nearest 10): judged not to have a useful range, would not use.
# NOTE(review): the printed bucket means do vary widely -- confirm this conclusion.
group_by_mean_count(train1, 'Fare')

Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64
Pclass
1    216
2    184
3    491
Name: Survived, dtype: int64
Age
0.0     0.704545
10.0    0.411765
20.0    0.354260
30.0    0.404494
40.0    0.424242
50.0    0.409836
60.0    0.352941
70.0    0.000000
80.0    1.000000
Name: Survived, dtype: float64
Age
0.0      44
10.0     34
20.0    223
30.0    178
40.0    132
50.0     61
60.0     34
70.0      7
80.0      1
Name: Survived, dtype: int64
Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64
Sex
female    314
male      577
Name: Survived, dtype: int64
Fare
0.0      0.058824
10.0     0.256818
20.0     0.460000
30.0     0.437037
40.0     0.428571
50.0     0.433333
60.0     0.760000
70.0     0.263158
80.0     0.741935
90.0     0.923077
110.0    0.636364
120.0    1.000000
130.0    1.000000
140.0    0.666667
150.0    0.666667
160.0    1.000000
210.0    0.750000
220.0    0.000000
230.0    0.750000
250.0    0.500000
260.0    0.666667
510.0    1.

In [5]:
# Parch: survival swings widely, but mainly in the rarely-seen high values.
group_by_mean_count(train1, 'Parch')

# SibSp: same pattern -- the extremes differ most.
group_by_mean_count(train1, 'SibSp')

# Family (SibSp + Parch): same pattern -- the extremes differ most.
group_by_mean_count(train1, 'Family')

# Cabin: wide range across first letters, so keep only the first character.
group_by_mean_count(train1, 'Cabin')

# Embarked: limited difference between groups, not useful.
group_by_mean_count(train1, 'Embarked')

Parch
0    0.343658
1    0.550847
2    0.500000
3    0.600000
4    0.000000
5    0.200000
6    0.000000
Name: Survived, dtype: float64
Parch
0    678
1    118
2     80
3      5
4      4
5      5
6      1
Name: Survived, dtype: int64
SibSp
0    0.345395
1    0.535885
2    0.464286
3    0.250000
4    0.166667
5    0.000000
8    0.000000
Name: Survived, dtype: float64
SibSp
0    608
1    209
2     28
3     16
4     18
5      5
8      7
Name: Survived, dtype: int64
Family
0     0.303538
1     0.552795
2     0.578431
3     0.724138
4     0.200000
5     0.136364
6     0.333333
7     0.000000
10    0.000000
Name: Survived, dtype: float64
Family
0     537
1     161
2     102
3      29
4      15
5      22
6      12
7       6
10      7
Name: Survived, dtype: int64
Cabin
A    0.466667
B    0.744681
C    0.593220
D    0.757576
E    0.750000
F    0.615385
G    0.500000
T    0.000000
n    0.299854
Name: Survived, dtype: float64
Cabin
A     15
B     47
C     59
D     33
E     32
F     13
G      4
T  

In [0]:
# Clean the data, keeping the variables that appear to have significant implications for survival

#use first character of Cabin Variable
def first_char(df, column):
  """Replace ``df[column]`` with the first character of each value, in place.

  Every value is converted to a string first, so non-string entries are
  reduced to the first character of their string form. The same ``df``
  object is returned to allow ``df = first_char(df, ...)``.
  """
  as_text = df[column].astype(str)
  df[column] = as_text.str.get(0)
  return df

# Apply the same reduction to both splits so Cabin is encoded alike in each.
train = first_char(train, 'Cabin')
test = first_char(test, 'Cabin')

def string_to_float(df, column, val1, val2):
  """Recode two labels of ``df[column]`` as 1 and 0, in place.

  Args:
    df: DataFrame to modify.
    column: Name of the column to recode.
    val1: Label that becomes ``1``.
    val2: Label that becomes ``0``.

  Returns:
    The same ``df`` object. Values equal to neither label are left as they
    are, and the column is stored with object dtype (despite the function
    name, no float conversion is performed).
  """
  # Recode a standalone object-dtype copy and assign it back. The former
  # chained form ``df[column][mask] = value`` writes through an intermediate
  # Series, which pandas warns about and which has no effect at all when
  # Copy-on-Write is active.
  recoded = df[column].astype(object)
  recoded.loc[recoded == val1] = 1
  recoded.loc[recoded == val2] = 0
  df[column] = recoded
  return df

# Encode Sex in both splits: 'male' -> 1, 'female' -> 0.
train = string_to_float(train, 'Sex', 'male', 'female')
test = string_to_float(test, 'Sex', 'male', 'female')

def char_to_float(df, column):
  """Replace each single-character value of ``df[column]`` by its code point.

  The conversion uses ``ord``, so every value must be a one-character
  string. The column is overwritten in place and the same ``df`` object is
  returned.
  """
  code_points = df[column].map(ord)
  df[column] = code_points
  return df

# Must run after first_char: ord() needs every Cabin value to be a single character.
# Not re-runnable on the same frames -- a second pass would hand ord() integers.
train = char_to_float(train, 'Cabin')
test = char_to_float(test, 'Cabin')

def family(df):
  """Add a ``Family`` column equal to ``SibSp`` plus ``Parch``.

  The column is written onto ``df`` itself and the same object is returned.
  """
  relatives_aboard = df['SibSp'].add(df['Parch'])
  df['Family'] = relatives_aboard
  return df

# Add the combined family-size feature to both splits.
train = family(train)
test = family(test)

In [7]:
# Which training columns still contain missing values?
train.isna().any()

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin          False
Embarked        True
Family         False
dtype: bool

In [8]:
# Same check for the test split.
test.isna().any()

PassengerId    False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare            True
Cabin          False
Embarked       False
Family         False
dtype: bool

In [0]:
def mean_impute(df, column):
  """Fill missing values of ``df[column]`` with that column's own mean.

  Args:
    df: DataFrame to modify in place.
    column: Name of a numeric column.

  Returns:
    The same ``df`` object. A column without missing values is unchanged.
  """
  # Call fillna directly on the column. The previous
  # ``transform(lambda x: x.fillna(x.mean()))`` only worked through
  # transform's fallback: the lambda was first tried on each scalar, failed,
  # and was then retried on the whole Series.
  values = df[column]
  df[column] = values.fillna(values.mean())
  return df
  
# Fill the gaps in the two numeric columns, each split with its own column mean.
# NOTE(review): the test split is filled from its own mean rather than the
# training mean -- confirm that is intended.
for numeric_column in ('Age', 'Fare'):
  train = mean_impute(train, numeric_column)
  test = mean_impute(test, numeric_column)

In [0]:
#https://www.kaggle.com/zlatankr/titanic-random-forest-82-78#SibSp

def test_model(df, rf):
  result = pd.concat((pd.DataFrame(df.iloc[:, 1:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]
  return result

In [26]:
# Import the model
from sklearn.ensemble import RandomForestClassifier

# Model 1: every engineered feature except identifiers, free text and Embarked.
train1 = train.drop(['Embarked', 'PassengerId', 'Name', 'Ticket'], axis=1)
test1 = test.drop(['Embarked', 'PassengerId', 'Name', 'Ticket'], axis=1)

# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself is
# rejected by current scikit-learn releases.
rf1 = RandomForestClassifier(criterion='gini', 
                             n_estimators=700,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='sqrt',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
# Relies on Survived being the first remaining column: column 0 is the
# target, everything after it is a feature.
rf1.fit(train1.iloc[:, 1:], train1.iloc[:, 0])
# Out-of-bag accuracy as a quick estimate of generalisation.
print("%.4f" % rf1.oob_score_)

0.8249


In [27]:
# Rank the first model's features by importance.
test_model(train1, rf1)

Unnamed: 0,variable,importance
1,Sex,0.34327
5,Fare,0.210804
2,Age,0.176421
0,Pclass,0.0852
6,Cabin,0.073719
7,Family,0.058633
3,SibSp,0.029465
4,Parch,0.022487


In [28]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

# Model 2: additionally drop Fare and the two raw family counts.
train2 = train.drop(['Fare', 'Embarked', 'PassengerId', 'Name', 'Ticket', 'Parch', 'SibSp'], axis=1)
test2 = test.drop(['Fare', 'Embarked', 'PassengerId', 'Name', 'Ticket', 'Parch', 'SibSp'], axis=1)

# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself is
# rejected by current scikit-learn releases.
rf2 = RandomForestClassifier(criterion='gini', 
                             n_estimators=700,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='sqrt',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
# Relies on Survived being the first remaining column: column 0 is the
# target, everything after it is a feature.
rf2.fit(train2.iloc[:, 1:], train2.iloc[:, 0])
# Out-of-bag accuracy as a quick estimate of generalisation.
print("%.4f" % rf2.oob_score_)

0.8182


In [30]:
# Rank the second model's features by importance.
test_model(train2, rf2)

Unnamed: 0,variable,importance
1,Sex,0.405653
2,Age,0.273607
0,Pclass,0.116928
3,Cabin,0.105319
4,Family,0.098493


In [0]:
import pickle
#https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/

# Persist the first model (rf1). The context manager closes the file handle
# even if pickling fails; the bare open() call previously left it open.
with open('model.sav', 'wb') as model_file:
  pickle.dump(rf1, model_file)

In [39]:
# Peek at one row of the reduced feature frame.
train2.head(1)

Unnamed: 0,Survived,Pclass,Sex,Age,Cabin,Family
0,0,3,1,22.0,110,1


In [41]:
# Predict on the test split with the first model and wrap the labels in a
# one-column frame for display.
predictions = pd.DataFrame({'Survived': rf1.predict(test1)})

predictions

Unnamed: 0,Survived
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,1
9,0
