This notebook took https://www.kaggle.com/hitesh1724/titanic-1-fastai-beginner-tutorial/ as reference.

The notebook uses TabularPandas in fastai2 to help manage the datasets, and use sklearn's RandomForestClassifier to create a classification model.

If you have any questions, ask away!

# Preparation

## Environment Check

### Check datasets

To have a more accurate result, use titanic-extended dataset provided in https://www.kaggle.com/pavlofesenko/titanic-extended.

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('../input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

../input/one_hot_train.csv
../input/test.csv
../input/one_hot_test.csv
../input/train.csv
../input/y.csv
../input/sample_submission.csv


### Check fastai version

In [2]:
import fastai
fastai.__version__ 

'2.2.7'

## Import library

In [3]:
from fastai import *
from fastai.tabular.all import *

pd.options.display.max_rows = 20
pd.options.display.max_columns = None

# Datasets Handling

In [4]:
path = Path('../input')

In [5]:
path.ls()

(#6) [Path('../input/one_hot_train.csv'),Path('../input/test.csv'),Path('../input/one_hot_test.csv'),Path('../input/train.csv'),Path('../input/y.csv'),Path('../input/sample_submission.csv')]

In [6]:
df = pd.read_csv(path/'train.csv', low_memory=False)
df_test = pd.read_csv(path/'test.csv', low_memory=False)

In [7]:
df.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,cat11,cat12,cat13,cat14,cat15,cat16,cat17,cat18,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,target
0,0,A,I,A,B,B,BI,A,S,Q,A,LO,A,A,A,A,B,D,D,B,0.629858,0.855349,0.759439,0.795549,0.681917,0.621672,0.592184,0.791921,0.815254,0.965006,0.665915,0
1,1,A,I,A,A,E,BI,K,W,AD,F,HJ,A,B,A,B,D,B,D,B,0.370727,0.328929,0.386385,0.541366,0.388982,0.357778,0.600044,0.408701,0.399353,0.927406,0.493729,0
2,2,A,K,A,A,E,BI,A,E,BM,L,DJ,A,B,A,A,B,D,D,B,0.502272,0.322749,0.343255,0.616352,0.793687,0.552877,0.352113,0.388835,0.412303,0.292696,0.549452,0
3,3,A,K,A,C,E,BI,A,Y,AD,F,KV,A,A,A,A,B,D,D,B,0.934242,0.707663,0.831147,0.807807,0.800032,0.619147,0.221789,0.897617,0.633669,0.760318,0.934242,0
4,4,A,I,G,B,E,BI,C,G,Q,A,DP,A,A,A,B,B,B,D,B,0.254427,0.274514,0.338818,0.277308,0.610578,0.128291,0.578764,0.279167,0.351103,0.357084,0.32896,1


## Create TabularPandas

In [8]:
procs = [Categorify, FillMissing] # RF does not need `Normalize`

In [9]:
splits = RandomSplitter(valid_pct=0.2)(range_of(df))

In [10]:
dep_var='target'

In [11]:
cont,cat = cont_cat_split(df, 1, dep_var=dep_var)

In [12]:
cont

['id',
 'cont0',
 'cont1',
 'cont2',
 'cont3',
 'cont4',
 'cont5',
 'cont6',
 'cont7',
 'cont8',
 'cont9',
 'cont10']

In [13]:
cat

['cat0',
 'cat1',
 'cat2',
 'cat3',
 'cat4',
 'cat5',
 'cat6',
 'cat7',
 'cat8',
 'cat9',
 'cat10',
 'cat11',
 'cat12',
 'cat13',
 'cat14',
 'cat15',
 'cat16',
 'cat17',
 'cat18']

In [14]:
to = TabularPandas(df, procs, cat, cont, y_names=dep_var, splits=splits, y_block=CategoryBlock())

In [15]:
to.show(3)

Unnamed: 0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,cat11,cat12,cat13,cat14,cat15,cat16,cat17,cat18,id,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,target
208653,A,I,A,A,F,BI,A,E,Q,A,LN,A,A,A,A,B,D,D,B,347883,0.175467,0.28653,0.371006,0.353804,0.798516,0.421779,0.429086,0.391157,0.388384,0.680219,0.404015,1
78000,A,F,Q,B,F,AB,A,AH,K,A,JK,A,A,A,B,D,B,D,B,130359,0.701511,0.528041,0.49425,0.235237,0.235859,0.083723,0.325783,0.607549,0.371303,0.588315,0.590592,0
278428,A,G,G,F,E,BI,G,AS,BM,A,CK,B,A,B,B,D,B,B,D,464131,0.323367,0.216466,0.253257,0.569214,0.74285,0.732721,0.735169,0.179033,0.316893,0.262981,0.312343,1


## Create Training Set and Validation Set

In [16]:
X_train, y_train = to.train.xs,to.train.y
X_valid, y_valid = to.valid.xs,to.valid.y

#  Classifier Model

In [17]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import KFold, cross_val_score, train_test_split,StratifiedKFold # Model evaluation
from sklearn.preprocessing import LabelEncoder, RobustScaler, OneHotEncoder, StandardScaler # Preprocessing
from sklearn.linear_model import Lasso, Ridge, ElasticNet,  LassoLarsIC, RANSACRegressor, SGDRegressor, HuberRegressor, BayesianRidge # Linear models
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor  # Ensemble methods
from xgboost import XGBRegressor, plot_importance # XGBoost
from sklearn.svm import SVR, SVC, LinearSVC  # Support Vector Regression
from sklearn.tree import DecisionTreeRegressor # Decision Tree Regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline, make_pipeline # Streaming pipelines
from sklearn.decomposition import KernelPCA, PCA # Dimensionality reduction
from sklearn.feature_selection import SelectFromModel # Dimensionality reduction
from sklearn.model_selection import learning_curve, validation_curve, GridSearchCV # Model evaluation
from sklearn.base import clone, BaseEstimator, TransformerMixin, RegressorMixin # Clone estimator
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import explained_variance_score, roc_auc_score, median_absolute_error, r2_score, mean_squared_error #To evaluate our model
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, fbeta_score #To evaluate our model
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split 
import pandas_profiling

In [19]:
m = RandomForestClassifier(n_estimators=100, n_jobs=-1)
m.fit(X_train,y_train)

In [33]:
params={'metric': 'auc', 'reg_alpha': 6.010538011450937, 'reg_lambda': 0.031702113663443346, 'colsample_bytree': 0.27,
   'subsample': 0.6, 'learning_rate': 0.05, 'max_depth': 100, 'num_leaves': 100, 'min_child_samples': 216,
   'cat_smooth': 87, 'random_state': 48,'n_estimators': 20000}
m = LGBMClassifier(**params)
m.fit(X_train,y_train)

LGBMClassifier(cat_smooth=87, colsample_bytree=0.27, learning_rate=0.05,
               max_depth=100, metric='auc', min_child_samples=216,
               n_estimators=20000, num_leaves=100, random_state=48,
               reg_alpha=6.010538011450937, reg_lambda=0.031702113663443346,
               subsample=0.6)

In [33]:
params={'metric': 'auc', 'reg_alpha': 6.010538011450937, 'reg_lambda': 0.031702113663443346, 'colsample_bytree': 0.27,
   'subsample': 0.6, 'learning_rate': 0.05, 'max_depth': 100, 'num_leaves': 100, 'min_child_samples': 216,
   'cat_smooth': 87, 'random_state': 48,'n_estimators': 20000}
preds = np.zeros(len(to_test))        
kf = StratifiedKFold(n_splits=5,random_state=48,shuffle=True) #As we can see the data is unbalanced that's why I'll use StratifiedKFold to split data: Don't want all zeros in a split                 
auc=[]   # list contains AUC for each fold  
n=0   
for trn_idx, test_idx in kf.split(X_train,y_train):
    X_tr,X_val=X_train.iloc[trn_idx],X_train.iloc[test_idx]
    y_tr,y_val=y_train.iloc[trn_idx],y_train.iloc[test_idx]
    model = LGBMClassifier(**params) 
    model.fit(X_tr,y_tr,eval_set=[(X_val,y_val)],early_stopping_rounds=100,verbose=False) 
    preds+=model.predict_proba(to_test)[:, 1]/kf.n_splits 
    auc.append(roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])) 
    print(n+1,auc[n])                                                                                       
    n+=1    

1 0.8949034185577046
2 0.8967158234858912
3 0.8932459003830435
4 0.8966598990207184
5 0.8926213669392072


In [31]:
model.predict_proba(to_test[:len(to_test)])

array([[0.61792316, 0.38207684],
       [0.76915491, 0.23084509],
       [0.74210975, 0.25789025],
       ...,
       [0.61555557, 0.38444443],
       [0.64824802, 0.35175198],
       [0.64055191, 0.35944809]])

## Validate

In [21]:
from sklearn.metrics import accuracy_score

In [34]:
y_pred=m.predict(X_valid)

In [37]:
accuracy_score(y_valid, y_pred)

0.8450333333333333

# Test Dataset

In [32]:
to_test = TabularPandas(df_test, procs, cat, cont)
to_test = to_test[:len(to_test)]

In [None]:
predicted_result = model.predict(to_test.xs) # remove reduntant columns (training did not use this col)

In [34]:
output= pd.DataFrame({'id':df_test.id, 'target': preds})
output.to_csv('../output/fastai.csv', index=False)
output.head()

Unnamed: 0,id,target
0,5,0.323795
1,6,0.197997
2,8,0.260533
3,9,0.330279
4,11,0.223772


In [30]:
output

Unnamed: 0,id,target
0,5,0
1,6,0
2,8,0
3,9,0
4,11,0
...,...,...
199995,499983,1
199996,499984,0
199997,499987,1
199998,499994,0


In [31]:
to_test

            id  cat0  cat1  cat2  cat3  cat4  cat5  cat6  cat7  cat8  cat9  \
0            5     1     6     1     1     6    34     1     9    24     1   
1            6     1     8     3     1     5     3     8    32    52     1   
2            8     1    14     3     1     6     3     1     9    29     1   
3            9     2    12     3     1     6    34     1    31    24     1   
4           11     1     6     1     2     6    34     1     9    47     1   
...        ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   
199995  499983     1    14     1     4     6    34     1     7    24     1   
199996  499984     2     9     1     2     5    34     1    20    56     5   
199997  499987     1    12     4     1     8    34     1    15    48     1   
199998  499994     1    11     1     3     6    34     1    44    39     1   
199999  499998     1    11     1     1     5    34     3    16    39     8   

        cat10  cat11  cat12  cat13  cat14  cat15  cat16  cat17 