<a href="https://colab.research.google.com/github/LeoFernanndes/notebooks/blob/master/titanic_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Titanic Kaggle

## Standard Classifier

In [0]:
import pandas as pd


url = 'https://raw.githubusercontent.com/LeoFernanndes/datasets/master/titanic_%20train.csv' 
titanic_df = pd.read_csv(url)

columns_order = ['PassengerId', 'Name', 'Fare', 'Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Embarked', 'Cabin', 'Ticket', 'Survived']
titanic = titanic_df.reindex(columns_order, axis= 1)

# training columns

colunas = ['Fare', 'Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Embarked', 'Cabin', 'Ticket']


In [0]:
# filling in the gaps

for column in titanic.columns:
  if titanic[column].dtype != 'object':
      titanic[column].fillna(titanic[column].mean(), inplace= True)
  else:
      titanic[column] = titanic[column].astype('str')
      titanic[column].fillna(titanic[column].mode(), inplace= True)


from sklearn.preprocessing import LabelEncoder


le = LabelEncoder()
for column in titanic.columns:
  if titanic[column].dtype == 'object':
    titanic[column] = le.fit_transform(titanic[column])


titanic_train = titanic.iloc[0: 500]
titanic_test = titanic.iloc[500: ]


x_train = titanic_train[colunas]
y_train = titanic_train['Survived']


x_test = titanic_test[colunas]
y_test = titanic_test['Survived']

In [9]:
titanic['Fare'].describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

In [0]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import f1_score, accuracy_score

df_resultados = pd.DataFrame({'PassengerId': titanic_test['PassengerId']})
accuracy_list = [] 
f1_list = []

n = 10
for iter in range(n):
  
  RF_clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                        max_depth=None, max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=100,
                        n_jobs=None, oob_score=False, random_state=None,
                        verbose=0, warm_start=False)

  RF_clf.fit(x_train, y_train)
  y_pred = RF_clf.predict(x_test)
  
  accuracy = accuracy_score(y_pred, y_test)
  accuracy_list.append(accuracy)

  f1 = f1_score(y_pred, y_test)
  f1_list.append(f1)

  dataframe = pd.DataFrame({iter: y_pred}, index= titanic_test.index)

  df_resultados = pd.concat([df_resultados, dataframe], axis= 1)

df_resultados['Final'] = df_resultados.drop(['PassengerId'], axis= 1).mode(axis= 1)[0]
df_resultados['Porcentagem'] = df_resultados.drop(['PassengerId'], axis= 1).mean(axis= 1)
accuracy_serie = pd.Series(accuracy_list).sort_values(ascending= True)
f1_serie = pd.Series(f1_list).sort_values(ascending= True)

In [5]:
final_accuracy = accuracy_score(df_resultados['Final'], y_test)
final_f1 = f1_score(df_resultados['Final'], y_test)

final_accuracy, accuracy_serie.mean(), final_f1, f1_serie.mean()

(0.8184143222506394, 0.8196930946291561, 0.750877192982456, 0.7511160460153996)

## Stacking Technique

### Simple stacking classification

In [6]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
from xgboost import XGBClassifier
import numpy as np
import warnings

warnings.simplefilter('ignore')

clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], 
                          meta_classifier=lr)

print('3-fold cross validation:\n')

for clf, label in zip([clf1, clf2, clf3, sclf], 
                      ['KNN', 
                       'Random Forest', 
                       'Naive Bayes',
                       'StackingClassifier']):

    scores = model_selection.cross_val_score(clf, x_train, y_train, 
                                              cv=8, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
          % (scores.mean(), scores.std(), label))



3-fold cross validation:

Accuracy: 0.68 (+/- 0.05) [KNN]
Accuracy: 0.79 (+/- 0.05) [Random Forest]
Accuracy: 0.77 (+/- 0.03) [Naive Bayes]
Accuracy: 0.68 (+/- 0.05) [StackingClassifier]


### Probabilities as meta-features

In [7]:
clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                          use_probas=True,
                          average_probas=False,
                          meta_classifier=lr)

print('3-fold cross validation:\n')

for clf, label in zip([clf1, clf2, clf3, sclf], 
                      ['KNN', 
                       'Random Forest', 
                       'Naive Bayes',
                       'StackingClassifier']):

    scores = model_selection.cross_val_score(clf, x_train, y_train, 
                                              cv=3, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" 

          % (scores.mean(), scores.std(), label))

3-fold cross validation:

Accuracy: 0.63 (+/- 0.03) [KNN]
Accuracy: 0.80 (+/- 0.02) [Random Forest]
Accuracy: 0.78 (+/- 0.02) [Naive Bayes]
Accuracy: 0.63 (+/- 0.03) [StackingClassifier]


## References

Comprehensive data exploration with python

https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python

Stacked Regressions to predict House Prices

https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard

A study on Regression applied to the Ames dataset

https://www.kaggle.com/juliencs/a-study-on-regression-applied-to-the-ames-dataset

Handling imbalanced datasets in machine learning

https://towardsdatascience.com/handling-imbalanced-datasets-in-machine-learning-7a0e84220f28

Part II. Model Evaluation: Cross Validation, Bias and Variance Tradeoff and How to Diagnose Overfitting

https://medium.com/@karenovna.ak/part-ii-evaluating-a-predictive-model-cross-validation-and-bias-and-variance-tradeoff-9874b836cd2e
