<a href="https://colab.research.google.com/github/InkyunMoon/TIL/blob/master/Titanic3_Top_4_with_ensemble_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import io
import pandas as pd
from google.colab import files
# Colab file picker: choose train.csv; `uploaded` is indexed by file name.
uploaded = files.upload()
train_buffer = io.BytesIO(uploaded['train.csv'])
train = pd.read_csv(train_buffer)

In [None]:
# Second Colab upload prompt, this time for test.csv.
uploaded = files.upload()
test_buffer = io.BytesIO(uploaded['test.csv'])
test = pd.read_csv(test_buffer)

In [None]:
# Preview the first rows of the training frame. The parentheses matter:
# a bare `train.head` only displays the bound-method repr.
train.head()

In [None]:
# Preview the first rows of the test frame. The parentheses matter:
# a bare `test.head` only displays the bound-method repr.
test.head()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from collections import Counter

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve

# Global seaborn look for every figure drawn below.
sns.set(context='notebook', style='white', palette='deep')

In [None]:
def detect_outliers(df, n, features):
  """Return index labels of rows that are outliers in more than ``n`` features.

  A value is an outlier for a column when it lies below ``Q1 - 1.5 * IQR`` or
  above ``Q3 + 1.5 * IQR`` of that column (Tukey fences).

  Parameters
  ----------
  df : pandas.DataFrame
      Frame holding the columns to inspect.
  n : int
      Threshold; a row is returned only when it is an outlier in strictly
      more than ``n`` of the listed columns.
  features : iterable of str
      Names of the numeric columns to check.

  Returns
  -------
  list
      Index labels of the offending rows, in order of first detection.
  """
  outlier_counts = Counter()
  for col in features:
    # nanpercentile skips missing values. np.percentile returns NaN as soon as
    # the column holds a single NaN, which makes both fences NaN and silently
    # disables outlier detection for that column.
    Q1, Q3 = np.nanpercentile(df[col], [25, 75])
    IQR = Q3 - Q1

    is_outlier = (df[col] < Q1 - 1.5*IQR) | (df[col] > Q3 + 1.5*IQR)
    outlier_counts.update(df[is_outlier].index)

  # Strict comparison: exactly n outlying features is not enough.
  return [label for label, hits in outlier_counts.items() if hits > n]

# Rows of train that are outliers in more than two of these numeric columns.
OUTLIER_FEATURES = ['Age', 'SibSp', 'Parch', 'Fare']
Outliers_to_drop = detect_outliers(df=train, n=2, features=OUTLIER_FEATURES)

In [None]:
# Display the rows flagged by detect_outliers before they are removed.
train.loc[Outliers_to_drop]

In [None]:
# Remove the flagged rows and renumber the index from 0.
# NOTE: the labels in Outliers_to_drop refer to the pre-reset index, so this
# cell must run exactly once per kernel session.
train_without_outliers = train.drop(Outliers_to_drop, axis=0)
train = train_without_outliers.reset_index(drop=True)

In [None]:
# Row count of train, recorded before test is stacked underneath it.
train_len = train.shape[0]
# Stack train on top of test with a fresh 0..N-1 index.
dataset = pd.concat([train, test], axis=0, ignore_index=True)

In [None]:
# Display the combined train + test frame.
dataset

In [None]:
# Normalise every missing marker to NaN, then count missing values per column.
dataset = dataset.fillna(value=np.nan)

dataset.isna().sum()

In [None]:
# Column dtypes and non-null counts for the training frame.
train.info()

In [None]:
# Missing values per column in the training frame.
train.isna().sum()

In [None]:
# Summary statistics; include='all' also covers the non-numeric columns.
train.describe(include='all')

In [None]:
# Correlation matrix between Survived and the numeric features, as a heatmap.
corr_columns = ['Survived','SibSp','Parch','Age','Fare']
corr_matrix = train[corr_columns].corr()
plt.figure(figsize=(12,8))
g = sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')

In [None]:
# Survived by SibSp as a bar chart. `catplot`/`height` replace the removed
# `factorplot`/`size` spellings.
g = sns.catplot(x='SibSp', y='Survived', data=train, kind='bar', height=6, palette='muted')
g.despine(left=True)
g = g.set_ylabels('survival probability')

In [None]:
# Survived by Parch as a bar chart. `catplot`/`height` replace the removed
# `factorplot`/`size` spellings.
g = sns.catplot(x="Parch", y="Survived", data=train, kind="bar", height=6,
                palette="muted")
g.despine(left=True)
g = g.set_ylabels("survival probability")

In [None]:
# One Age distribution panel per Survived value.
g = sns.FacetGrid(data=train, col='Survived').map(sns.distplot, 'Age')

In [None]:
# Overlay the Age density of non-survivors (red) and survivors (blue),
# using only rows where Age is present.
age_known = train['Age'].notnull()
ages_not_survived = train.loc[(train['Survived'] == 0) & age_known, 'Age']
ages_survived = train.loc[(train['Survived'] == 1) & age_known, 'Age']

g = sns.kdeplot(ages_not_survived, color='Red', shade=True)
g = sns.kdeplot(ages_survived, color='blue', shade=True)
g.set_xlabel('Age')
g.set_ylabel('Frequency')
g = g.legend(['Not Survived', 'Survived'])

In [None]:
# How many Fare values are missing in the combined frame?
dataset['Fare'].isna().sum()

In [None]:
# Fill missing Fare values with the median Fare of the combined frame.
fare_median = dataset['Fare'].median()
dataset['Fare'] = dataset['Fare'].fillna(fare_median)

In [None]:
# Fare distribution, with its skewness shown in the legend.
fare_skew = dataset['Fare'].skew()
g = sns.distplot(dataset['Fare'], color='magenta', label='Skewness : %.2f' % fare_skew)
g = g.legend(loc='best')

In [None]:
def _log_fare(fare):
  """Natural log of a positive fare; non-positive fares map to 0."""
  if fare > 0:
    return np.log(fare)
  return 0

# Overwrites Fare in place, so re-running this cell applies the log again.
dataset['Fare'] = dataset['Fare'].map(_log_fare)

In [None]:
# Fare distribution after the log transform, with the new skewness in the legend.
log_fare_skew = dataset['Fare'].skew()
g = sns.distplot(dataset['Fare'], color='b', label='Skewness : %.2f' % log_fare_skew)
g = g.legend(loc='best')

In [None]:
# Bar chart of Survived for each Sex.
g = sns.barplot(data=train, x='Sex', y='Survived')
g = g.set_ylabel('Survival Probability')

In [None]:
# Mean of Survived for each Sex, as a table.
train.groupby('Sex')[['Survived']].mean()

In [None]:
# Survived by Pclass as a bar chart. `catplot`/`height` replace the removed
# `factorplot`/`size` spellings.
g = sns.catplot(x='Pclass', y='Survived', data=train, kind='bar', height=6, palette='muted')
g.despine(left=True)
g = g.set_ylabels('Survival Probability')

In [None]:
# Survived by Pclass, split by Sex. `catplot`/`height` replace the removed
# `factorplot`/`size` spellings.
g = sns.catplot(x='Pclass', y='Survived', hue='Sex', data=train, height=6, kind='bar', palette='muted')
g.despine(left=True)
g = g.set_ylabels('Survival probability')

In [None]:
# How many Embarked values are missing in the combined frame?
dataset['Embarked'].isna().sum()

In [None]:
# Frequency of each Embarked value in the combined frame.
dataset['Embarked'].value_counts()

In [None]:
# Fill missing Embarked values with 'S'
# (presumably the most frequent value per the counts cell - confirm).
dataset['Embarked'] = dataset['Embarked'].fillna(value='S')

In [None]:
# Survived by Embarked as a bar chart. `catplot`/`height` replace the removed
# `factorplot`/`size` spellings.
g = sns.catplot(x='Embarked', y='Survived', data=train, height=6, kind='bar', palette='muted')
g.despine(left=True)
g = g.set_ylabels('Survival probability')

In [None]:
# Passenger counts per Pclass, one panel per Embarked value.
# `catplot`/`height` replace the removed `factorplot`/`size`; x is passed by
# keyword because current seaborn rejects positional data variables.
g = sns.catplot(x='Pclass', col='Embarked', data=train, height=6, kind='count', palette='muted')
g.despine(left=True)
g = g.set_ylabels('Counts')

In [None]:
# Survived by Pclass and Sex, one panel per Embarked value.
# `catplot`/`height` replace the removed `factorplot`/`size`, and x/y are passed
# by keyword. kind='point' is spelled out because factorplot drew a point plot
# by default while catplot's default kind is different.
g = sns.catplot(x='Pclass', y='Survived', col='Embarked', hue='Sex', data=train,
                height=6, kind='point', palette='muted')
g.despine(left=True)
# The y axis shows Survived, not a count, so it is labelled accordingly.
g = g.set_ylabels('Survival probability')