In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import math
import numpy as np
import pandas as pd
from pandas.core.dtypes import common

from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

In [3]:
df_raw = pd.read_csv('docs/train.csv', low_memory=False)

In [4]:
def display_all(df):
    with pd.option_context('display.max_rows', 1000, 'display.max_columns', 1000):
        return display(df)

In [5]:
df_raw.shape

(891, 12)

In [6]:
display_all(df_raw.tail())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


#### *Name will unfortunately not help us much, although good to know! :)*

In [7]:
df_raw.drop('Name',axis=1, inplace=True)

In [8]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 76.7+ KB


#### *Percentage of data missing from each column*

In [9]:
df_raw.isnull().sum()/len(df_raw)

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.002245
dtype: float64

In [10]:
df_raw.items??

#### Turn all strings into categorical type objects and replace all missing data with its average

In [11]:
def train_cats(df):
    for n,c in df.items():
        if common.is_string_dtype(c): df[n] = c.astype('category').cat.as_ordered()
            
def fix_missing(df, col, name, na_dict):
    if common.is_numeric_dtype(col):
        if pd.isnull(col).sum() or (name in na_dict):
            df[name+'_na'] = pd.isnull(col)
            filler = na_dict[name] if name in na_dict else col.median()
            df[name] = col.fillna(filler)
            na_dict[name] = filler
    return na_dict

def numericalize(df, col, name, max_n_cat):
    if not common.is_numeric_dtype(col) and ( max_n_cat is None or len(col.cat.categories)>max_n_cat):
        df[name] = pd.Categorical(col).codes+1

def proc_df(df, y_fld=None, skip_flds=None, ignore_flds=None, do_scale=False, na_dict=None,
            preproc_fn=None, max_n_cat=None, subset=None, mapper=None):
    if not ignore_flds: ignore_flds=[]
    if not skip_flds: skip_flds=[]
    if subset: df = get_sample(df,subset)
    else: df = df.copy()
    ignored_flds = df.loc[:, ignore_flds]
    df.drop(ignore_flds, axis=1, inplace=True)
    if preproc_fn: preproc_fn(df)
    if y_fld is None: y = None
    else:
        if not common.is_numeric_dtype(df[y_fld]): df[y_fld] = pd.Categorical(df[y_fld]).codes
        y = df[y_fld].values
        skip_flds += [y_fld]
    df.drop(skip_flds, axis=1, inplace=True)

    if na_dict is None: na_dict = {}
    else: na_dict = na_dict.copy()
    na_dict_initial = na_dict.copy()
    for n,c in df.items(): na_dict = fix_missing(df, c, n, na_dict)
    if len(na_dict_initial.keys()) > 0:
        df.drop([a + '_na' for a in list(set(na_dict.keys()) - set(na_dict_initial.keys()))], axis=1, inplace=True)
    if do_scale: mapper = scale_vars(df, mapper)
    for n,c in df.items(): numericalize(df, c, n, max_n_cat)
    df = pd.get_dummies(df, dummy_na=True)
    df = pd.concat([ignored_flds, df], axis=1)
    res = [df, y, na_dict]
    if do_scale: res = res + [mapper]
    return res

In [12]:
train_cats(df_raw)

In [13]:
df, y, nas = proc_df(df_raw, 'Survived')

In [14]:
m = RandomForestClassifier(n_jobs=-1)
m.fit(df, y)
m.score(df, y)

1.0

In [15]:
def split_vals(a,n): return a[:n].copy(), a[n:].copy()

n_valid = 260
n_trn = len(df)-n_valid
raw_train, raw_valid = split_vals(df_raw, n_trn)
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)

In [16]:
def rmse(x,y): return math.sqrt(((x-y)**2).mean())

def print_scores(m):
    res = [rmse(m.predict(X_valid), y_valid), m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print(res)

#### With no effort or information here is where we stand

In [17]:
m = RandomForestClassifier(n_jobs=-1)
m.fit(X_train, y_train)
print_scores(m)

[0.38230072737812854, 1.0, 0.8538461538461538]


#### OOB score is worse than validation which is what we expect

In [18]:
m = RandomForestClassifier(n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_scores(m)

[0.3922322702763681, 0.9984152139461173, 0.8461538461538461, 0.8050713153724247]


In [19]:
RandomForestClassifier?

#### Slightly better score with 90 less trees

In [20]:
m = RandomForestClassifier(n_estimators=10, n_jobs=-1)
m.fit(X_train, y_train)
print_scores(m)

[0.4019184762342502, 0.9825673534072901, 0.8384615384615385]


In [21]:
m = RandomForestClassifier(n_estimators=100, n_jobs=-1, min_samples_split=3, min_samples_leaf=3)
m.fit(X_train, y_train)
print_scores(m)

[0.3721042037676254, 0.919175911251981, 0.8615384615384616]


In [22]:
m = RandomForestClassifier(n_estimators=100, n_jobs=-1, min_samples_leaf=3)
m.fit(X_train, y_train)
print_scores(m)

[0.4019184762342502, 0.9223454833597464, 0.8384615384615385]


In [23]:
m = RandomForestClassifier(n_estimators=300, min_samples_leaf=3, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_scores(m)

[0.3721042037676254, 0.9175911251980983, 0.8615384615384616, 0.8129952456418383]


In [24]:
m = RandomForestClassifier(n_estimators=300, min_samples_leaf=3, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_scores(m)

[0.3616202853397895, 0.9175911251980983, 0.8692307692307693, 0.820919175911252]


### Feature Importance

In [25]:
def rf_feat_importance(m, df):
    return pd.DataFrame({'cols':df.columns, 'imp':m.feature_importances_}
                       ).sort_values('imp', ascending=False)

#### *Top 10 features, going to need all of them since we only have so many*

In [26]:
fi = rf_feat_importance(m, X_valid); fi[0:10]

Unnamed: 0,cols,imp
2,Sex,0.332825
6,Ticket,0.128755
7,Fare,0.124962
3,Age,0.099812
0,PassengerId,0.099402
1,Pclass,0.069616
8,Cabin,0.062101
4,SibSp,0.029385
9,Embarked,0.024843
5,Parch,0.019317


In [88]:
def get_oob(df):
    m = RandomForestClassifier(n_estimators=300, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
    x, _ = split_vals(df, n_trn)
    m.fit(x, y_train)
    return m.oob_score_

In [92]:
for c in ('SibSp','Parch'):
    print(get_oob(X_train.drop(c, axis=1)))

0.8145800316957211
0.8225039619651348


In [95]:
df_trn2, y_trn, nas = proc_df(df_raw, 'Survived', max_n_cat=7)

In [97]:
df_trn2.drop(['Parch'], axis=1, inplace=True)

In [98]:
X_train, X_valid = split_vals(df_trn2, n_trn)

In [99]:
m = RandomForestClassifier(n_estimators=300, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_scores(m)

[0.3721042037676254, 0.9239302694136292, 0.8615384615384616, 0.820919175911252]


### Resume here when ready to test

In [30]:
df_test = pd.read_csv('docs/test.csv')
df_test.drop('Name', inplace=True, axis=1)
df_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,male,34.5,0,0,330911,7.8292,,Q
1,893,3,female,47.0,1,0,363272,7.0,,S
2,894,2,male,62.0,0,0,240276,9.6875,,Q
3,895,3,male,27.0,0,0,315154,8.6625,,S
4,896,3,female,22.0,1,1,3101298,12.2875,,S


In [31]:
df_test['Survived'] = 0

In [32]:
train_cats(df_test)
df_keep, y_keep, nas_keep = proc_df(df_test, 'Survived', max_n_cat=7)

In [33]:
df_keep.drop('Fare_na', inplace=True, axis=1)
print(df_keep.shape)

(418, 16)


In [34]:
test_preds = m.predict(df_keep)

In [47]:
df_final = pd.DataFrame(columns={'PassengerId', 'Survived'})

In [62]:
df_final['PassengerId'] = df_keep['PassengerId']
df_final['Survived'] = test_preds

In [72]:
df_final.head()

Unnamed: 0,Survived,PassengerId
0,0,892
1,0,893
2,0,894
3,0,895
4,1,896


In [74]:
df_final.to_csv('submission.csv', index=False)