## Import Library

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, recall_score

## Load dataset

In [2]:
df = sns.load_dataset('titanic')
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


## Data preprocessing

### Label Encoding

In [3]:
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


**Cara 1**

In [4]:
le = preprocessing.LabelEncoder()
le.fit(df['embark_town'])
df['embark_town_label_encoding'] = le.transform(df['embark_town']) 

In [5]:
Embarked_Label_Encoding = df[['embark_town','embark_town_label_encoding',]]
Embarked_Label_Encoding.head()

Unnamed: 0,embark_town,embark_town_label_encoding
0,Southampton,2
1,Cherbourg,0
2,Southampton,2
3,Southampton,2
4,Southampton,2


**Cara 2**

In [6]:
df['embark_town_label_cat'] = df['embark_town'].astype('category').cat.codes

In [7]:
embark_town_label_cat = df[['embark_town','embark_town_label_encoding','embark_town_label_cat']]
embark_town_label_cat.head()

Unnamed: 0,embark_town,embark_town_label_encoding,embark_town_label_cat
0,Southampton,2,2
1,Cherbourg,0,0
2,Southampton,2,2
3,Southampton,2,2
4,Southampton,2,2


In [8]:
embark_town_label_cat.drop_duplicates()

Unnamed: 0,embark_town,embark_town_label_encoding,embark_town_label_cat
0,Southampton,2,2
1,Cherbourg,0,0
5,Queenstown,1,1
61,,3,-1


### Ordinal Encoding

In [9]:
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [10]:
map_class = {'First':3,
             'Second':2,
             'Third':1}
df['class_cat'] = df['class'].map(map_class)
df[['class','class_cat']].head()

Unnamed: 0,class,class_cat
0,Third,1
1,First,3
2,Third,1
3,First,3
4,Third,1


In [11]:
df[['class','class_cat']].drop_duplicates()

Unnamed: 0,class,class_cat
0,Third,1
1,First,3
9,Second,2


### One Hot Encoding

In [12]:
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [13]:
dummies_embark_town = pd.get_dummies(df['embark_town'],prefix='embark_town')
dummies_embark_town.head()

Unnamed: 0,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [14]:
df = pd.concat([df, dummies_embark_town], axis=1)
df[['embark_town','embark_town_Cherbourg','embark_town_Queenstown','embark_town_Southampton']].head()

Unnamed: 0,embark_town,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
0,Southampton,0,0,1
1,Cherbourg,1,0,0
2,Southampton,0,0,1
3,Southampton,0,0,1
4,Southampton,0,0,1


### Frequency Encoding

In [15]:
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [16]:
freq_et = df['embark_town'].value_counts().reset_index()
freq_et.rename(columns={"index": "embark_town", "embark_town": "freq_embark_town"}, inplace = True)
freq_et['pct_embark_town'] = round((freq_et['freq_embark_town']/freq_et['freq_embark_town'].sum())*100,2)
freq_et

Unnamed: 0,embark_town,freq_embark_town,pct_embark_town
0,Southampton,644,72.44
1,Cherbourg,168,18.9
2,Queenstown,77,8.66


In [17]:
df = df.merge(freq_et[['embark_town','pct_embark_town']], on='embark_town', how='inner')
df[['embark_town','pct_embark_town']]

Unnamed: 0,embark_town,pct_embark_town
0,Southampton,72.44
1,Southampton,72.44
2,Southampton,72.44
3,Southampton,72.44
4,Southampton,72.44
...,...,...
884,Queenstown,8.66
885,Queenstown,8.66
886,Queenstown,8.66
887,Queenstown,8.66


### Mean Encoding

In [18]:
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


**Hitung kemunculan category pada dataset**

In [19]:
freq_et= df['embark_town'].value_counts().reset_index()
freq_et.rename(columns={"index": "embark_town", "embark_town": "freq_embark_town"}, inplace = True)
freq_et

Unnamed: 0,embark_town,freq_embark_town
0,Southampton,644
1,Cherbourg,168
2,Queenstown,77


**Jumlahkan value target(survived) berdasarkan category**

In [20]:
sum_et = df.groupby(['embark_town']).agg({'survived':sum}).reset_index()
sum_et.rename(columns={"survived": "sum_embark_town"}, inplace = True)
sum_et

Unnamed: 0,embark_town,sum_embark_town
0,Cherbourg,93
1,Queenstown,30
2,Southampton,217


**Hitung mean encoding untuk embark_town**

In [21]:
mean_ec = freq_et.merge(sum_et, on='embark_town', how='inner')
mean_ec['mean_ec_embark_town'] = round((mean_ec['sum_embark_town']/mean_ec['freq_embark_town']),2)
mean_ec

Unnamed: 0,embark_town,freq_embark_town,sum_embark_town,mean_ec_embark_town
0,Southampton,644,217,0.34
1,Cherbourg,168,93,0.55
2,Queenstown,77,30,0.39


In [22]:
df = df.merge(mean_ec[['embark_town','mean_ec_embark_town']], on='embark_town', how='inner')
df[['embark_town','mean_ec_embark_town']]

Unnamed: 0,embark_town,mean_ec_embark_town
0,Southampton,0.34
1,Southampton,0.34
2,Southampton,0.34
3,Southampton,0.34
4,Southampton,0.34
...,...,...
884,Queenstown,0.39
885,Queenstown,0.39
886,Queenstown,0.39
887,Queenstown,0.39


## Imbalanced Data handling

In [23]:
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [24]:
target = df['survived'].value_counts().reset_index()
target.rename(columns={'index':'survived','survived':'freq'}, inplace=True)
target['percentage'] = round((target['freq']/target['freq'].sum())*100,2)
target

Unnamed: 0,survived,freq,percentage
0,0,549,61.62
1,1,342,38.38


In [25]:
df_numeric = df[['survived','pclass','age','sibsp','parch','fare']]
df_numeric = df_numeric.dropna()
df_numeric.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
0,0,3,22.0,1,0,7.25
1,1,1,38.0,1,0,71.2833
2,1,3,26.0,0,0,7.925
3,1,1,35.0,1,0,53.1
4,0,3,35.0,0,0,8.05


In [26]:
df_numeric['survived'].value_counts()

0    424
1    290
Name: survived, dtype: int64

In [27]:
from imblearn import under_sampling, over_sampling

X = df_numeric.drop(['survived'],axis=1)
y = df_numeric['survived']

# Undersampling
X_under, y_under = under_sampling.RandomUnderSampler().fit_resample(X, y)

# Overampling
X_over, y_over = over_sampling.RandomOverSampler().fit_resample(X, y)

# Overampling with smote
X_over_smote, y_over_smote = over_sampling.SMOTE().fit_resample(X, y)

In [28]:
df_undersampling = pd.concat([X_under, y_under], axis=1)
df_undersampling.head(3)

Unnamed: 0,pclass,age,sibsp,parch,fare,survived
0,2,24.0,2,0,73.5,0
1,1,50.0,0,0,28.7125,0
2,2,23.0,2,1,11.5,0


In [29]:
df_oversampling = pd.concat([X_over, y_over], axis=1)
df_oversampling.head(3)

Unnamed: 0,pclass,age,sibsp,parch,fare,survived
0,3,22.0,1,0,7.25,0
1,1,38.0,1,0,71.2833,1
2,3,26.0,0,0,7.925,1


In [30]:
df_oversampling_smote = pd.concat([X_over_smote, y_over_smote], axis=1)
df_oversampling_smote.head(3)

Unnamed: 0,pclass,age,sibsp,parch,fare,survived
0,3,22.0,1,0,7.25,0
1,1,38.0,1,0,71.2833,1
2,3,26.0,0,0,7.925,1


In [31]:
df_undersampling['survived'].value_counts()

1    290
0    290
Name: survived, dtype: int64

In [32]:
df_oversampling['survived'].value_counts()

1    424
0    424
Name: survived, dtype: int64

In [33]:
df_oversampling_smote['survived'].value_counts()

1    424
0    424
Name: survived, dtype: int64

## Random Forest with balance data

In [34]:
df = sns.load_dataset('titanic')
df = df.dropna().reset_index(drop=True)
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
1,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
2,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
3,1,3,female,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
4,1,1,female,58.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True


In [35]:
df['survived'].value_counts()

1    123
0     59
Name: survived, dtype: int64

### Data preproc - Sex

cara 1

In [36]:
sex = pd.get_dummies(df['sex'],prefix='sex')
sex.head(3)

Unnamed: 0,sex_female,sex_male
0,1,0
1,1,0
2,0,1


In [37]:
df = pd.concat([df, sex], axis=1)
df[['sex','sex_female','sex_male']].head()

Unnamed: 0,sex,sex_female,sex_male
0,female,1,0
1,female,1,0
2,male,0,1
3,female,1,0
4,female,1,0


In [38]:
df = df.drop(['sex','sex_male'], axis=1)
df.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,sex_female
0,1,1,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,1
1,1,1,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,1
2,0,1,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True,0
3,1,3,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False,1
4,1,1,58.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True,1


cara 2

In [39]:
df = sns.load_dataset('titanic')
df = df.dropna().reset_index(drop=True)
df['sex'] = df['sex'].replace("female", 0).replace("male", 1)
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,1,1,0,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
1,1,1,0,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
2,0,1,1,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
3,1,3,0,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
4,1,1,0,58.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True


### All Data Category

In [40]:
df =pd.get_dummies(data=df,columns=['embarked', 'class', 'who',
                                      'adult_male','deck','embark_town',
                                      'alive','alone'],drop_first=True)
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked_Q,embarked_S,class_Second,...,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,embark_town_Queenstown,embark_town_Southampton,alive_yes,alone_True
0,1,1,0,38.0,1,0,71.2833,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,1,1,0,35.0,1,0,53.1,0,1,0,...,0,1,0,0,0,0,0,1,1,0
2,0,1,1,54.0,0,0,51.8625,0,1,0,...,0,0,0,1,0,0,0,1,0,1
3,1,3,0,4.0,1,1,16.7,0,1,0,...,0,0,0,0,0,1,0,1,1,0
4,1,1,0,58.0,0,0,26.55,0,1,0,...,0,1,0,0,0,0,0,1,1,1


### Balanced Data

In [41]:
sm = over_sampling.SMOTE(random_state=42)

X = df.drop(['survived'],axis = 1)
Y = df['survived']
X_sm, y_sm = sm.fit_resample(X, Y)

print(f'''Shape of X before SMOTE: {X.shape}
Shape of X after SMOTE: {X_sm.shape}''')

print('\nBalance of positive and negative classes (%):')
y_sm.value_counts(normalize=True) * 100


Shape of X before SMOTE: (182, 23)
Shape of X after SMOTE: (246, 23)

Balance of positive and negative classes (%):


1    50.0
0    50.0
Name: survived, dtype: float64

In [42]:
y_sm.value_counts()

1    123
0    123
Name: survived, dtype: int64

### Split Dataset

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.25, random_state=42)

### Modeling dengan Random Forest

In [44]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
preds = model.predict(X_test)

### Metric Evaluation

Karena sudah balanced data, jadi valid menggunakan akurasi

In [45]:
print('Akurasi',accuracy_score(y_test, preds))

Akurasi 1.0


## Random Forest with Imbalance data

### Load Dataset

In [46]:
df = sns.load_dataset('titanic')
df = df.dropna().reset_index(drop=True)
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
1,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
2,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
3,1,3,female,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
4,1,1,female,58.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True


### Data Preprocessing

In [47]:
df =pd.get_dummies(data=df,columns=['embarked','sex', 'class', 'who',
                                      'adult_male','deck','embark_town',
                                      'alive','alone'],drop_first=True)
df.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,embarked_Q,embarked_S,sex_male,class_Second,...,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,embark_town_Queenstown,embark_town_Southampton,alive_yes,alone_True
0,1,1,38.0,1,0,71.2833,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,1,1,35.0,1,0,53.1,0,1,0,0,...,0,1,0,0,0,0,0,1,1,0
2,0,1,54.0,0,0,51.8625,0,1,1,0,...,0,0,0,1,0,0,0,1,0,1
3,1,3,4.0,1,1,16.7,0,1,0,0,...,0,0,0,0,0,1,0,1,1,0
4,1,1,58.0,0,0,26.55,0,1,0,0,...,0,1,0,0,0,0,0,1,1,1


In [48]:
df['survived'].value_counts()

1    123
0     59
Name: survived, dtype: int64

### Split Dataset

In [49]:
X = df.drop(['survived'],axis = 1)
Y = df['survived']

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

### Modeling denga Random Forest

In [51]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
preds = model.predict(X_test)

### Metric Evaluation

Karena sudah tidak balanced data, jadi TIDAK valid menggunakan akurasi<br><br>

gunakan AUC atau F1 score

In [52]:
cm = confusion_matrix(y_test, preds)
cm

array([[17,  1],
       [ 0, 28]], dtype=int64)

In [53]:
# calculate AUC
print('AUC ',roc_auc_score(y_test, preds))


AUC  0.9722222222222222


In [54]:
# calculate F1 Score
print('F1 Score ',f1_score(y_test, preds))


F1 Score  0.9824561403508771


Bagaimana mencari Precision dan Recall?

In [55]:
# calculate Precision
print('Precision ',precision_score(y_test, preds))


Precision  0.9655172413793104


In [56]:
# calculate Recall
print('Recall ',recall_score(y_test, preds))


Recall  1.0
