## 探索的データ分析

In [None]:
import pandas as pd
import pandas_profiling
import numpy as np

import warnings
# Install a blanket "ignore" filter so no warnings are printed in the notebook.
# NOTE(review): this also hides deprecation/future warnings emitted by the
# libraries used in later cells — confirm that is intended.
warnings.filterwarnings('ignore')

In [None]:
# Read the Titanic train and test CSV files into DataFrames.
train, test = (
    pd.read_csv('../input/titanic/train.csv'),
    pd.read_csv('../input/titanic/test.csv'),
)

# Last expression of the cell, so the notebook displays the returned report.
# `profile_report` is presumably the DataFrame method registered by the
# `pandas_profiling` import — verify against the installed version.
train.profile_report()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Overlay the Age histograms of the two `Survived` groups (0 and 1) on one
# plot. Rows with a missing Age are dropped before plotting.
for survived_flag, legend_label in ((0, '0'), (1, '1')):
    group_ages = train.loc[train['Survived'] == survived_flag, 'Age'].dropna()
    plt.hist(group_ages, bins=30, alpha=0.5, label=legend_label)

plt.xlabel('Age')
plt.ylabel('count')
plt.legend(title='Survived')

In [None]:
# Count rows per SibSp value, split by Survived, and put a titled legend in
# the upper-right corner of the axes the count plot was drawn on.
sibsp_ax = sns.countplot(data=train, x='SibSp', hue='Survived')
sibsp_ax.legend(title='Survived', loc='upper right')

In [None]:
# Plot 1: count rows per Parch value, split by Survived.
sns.countplot(x='Parch', hue='Survived', data=train)
plt.legend(loc='upper right', title='Survived')

# Plot 2: Fare histograms for the two Survived groups.
# Start a new figure first: without it the histogram is drawn on the same axes
# as the Parch count plot, and the Fare axis label, legend and x-limits
# overwrite the Parch ones, leaving a single unreadable chart.
plt.figure()
# Fares are binned over 0-250 in 25 bins; rows with a missing Fare are dropped.
plt.hist(train.loc[train['Survived'] == 0, 'Fare'].dropna(), range=(0, 250), bins=25, alpha=0.5, label='0')
plt.hist(train.loc[train['Survived'] == 1, 'Fare'].dropna(), range=(0, 250), bins=25, alpha=0.5, label='1')
plt.xlabel('Fare')
plt.ylabel('count')
plt.legend(title='Survived')
# Leave a small margin left of zero so the first bar is not clipped.
plt.xlim(-5, 250)

In [None]:
# Count rows per Pclass value, split by Survived.
sns.countplot(data=train, x='Pclass', hue='Survived')

In [None]:
# Count rows per Sex value, split by Survived.
sns.countplot(data=train, x='Sex', hue='Survived')

In [None]:
# Count rows per Embarked value, split by Survived.
sns.countplot(data=train, x='Embarked', hue='Survived')

In [None]:
# Stack train and test so every preprocessing step is applied to both.
# The original row labels are kept (no ignore_index), so labels repeat across
# the two parts; positional access is used below for that reason.
data = pd.concat([train, test], sort=False)

# Every column update below assigns the result back to `data[...]` instead of
# calling an in-place method on the `data['col']` selection. Chained in-place
# updates are deprecated in pandas and do not reach `data` when Copy-on-Write
# is active; with warnings silenced that failure would go unnoticed.

# Encode Sex as integers: male -> 0, female -> 1.
# Any other value (including missing) makes the int cast raise.
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1}).astype(int)

# Fill missing Embarked with 'S', then encode S/C/Q as 0/1/2.
data['Embarked'] = data['Embarked'].fillna('S').map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

# Fill missing Fare with the mean Fare over train + test.
data['Fare'] = data['Fare'].fillna(data['Fare'].mean())

# Fill each missing Age with its own random integer drawn from
# [mean - std, mean + std). Previously a single unseeded draw was used, so all
# missing ages received the same value and that value changed on every run.
# A fixed seed keeps the notebook reproducible.
age_avg = data['Age'].mean()
age_std = data['Age'].std()
age_values = data['Age'].to_numpy(dtype=float, copy=True)
age_missing = np.isnan(age_values)
rng = np.random.default_rng(0)
age_values[age_missing] = rng.integers(
    int(age_avg - age_std),
    int(age_avg + age_std),
    size=int(age_missing.sum()),
)
data['Age'] = age_values

# FamilySize = Parch + SibSp + 1.
data['FamilySize'] = data['Parch'] + data['SibSp'] + 1

# Copy the new feature back to the separate frames by position (the first
# len(train) rows of `data` are the train rows, the rest are the test rows).
n_train = len(train)
train['FamilySize'] = data['FamilySize'].iloc[:n_train].to_numpy()
test['FamilySize'] = data['FamilySize'].iloc[n_train:].to_numpy()

# Count train rows per FamilySize value, split by Survived.
sns.countplot(x='FamilySize', data=train, hue='Survived')

## 機械学習アルゴリズムの学習・予測

In [None]:
# Remove the columns that are not used as model features (in place on `data`).
delete_columns = ['Name', 'PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin']
data.drop(columns=delete_columns, inplace=True)

# Split the combined frame back into its train and test parts by position:
# the first len(train) rows are train, the remainder is test.
n_train = len(train)
train, test = data.iloc[:n_train], data.iloc[n_train:]

# Target is the Survived column of the train part; the feature matrices are
# everything except Survived.
y_train = train['Survived']
X_train, X_test = (frame.drop(columns='Survived') for frame in (train, test))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# Fit a random forest (100 trees, depth limited to 2, fixed seed) on the full
# training set and predict labels for the test set.
# Note: `y_pred` is reassigned further down in this cell by the LightGBM
# prediction, so these predictions are not the ones that end up being used.
rf_params = dict(n_estimators=100, max_depth=2, random_state=0)
clf = RandomForestClassifier(**rf_params).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Hold out 30% of the training rows for validation, stratified on the target,
# with a fixed seed. X_train / y_train are rebound to the remaining 70%.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.3, random_state=0, stratify=y_train
)

# Columns LightGBM should treat as categorical rather than numeric.
categorical_features = ['Embarked', 'Pclass', 'Sex']

lgb_train = lgb.Dataset(
    X_train,
    label=y_train,
    categorical_feature=categorical_features,
)
# The validation set references the training Dataset.
lgb_eval = lgb.Dataset(
    X_valid,
    label=y_valid,
    reference=lgb_train,
    categorical_feature=categorical_features,
)

# Binary objective; all other parameters are left at library defaults.
params = {'objective': 'binary'}

# Early stopping with a 10-round patience, and an evaluation log line every
# 10 rounds.
training_callbacks = [
    lgb.early_stopping(stopping_rounds=10),
    lgb.log_evaluation(period=10),
]

# Train for at most 1000 boosting rounds, evaluating on both the training
# and the validation data.
model = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_eval],
    valid_names=['train', 'valid'],
    callbacks=training_callbacks,
)

# Predict on the test features using the best iteration recorded on the model.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

In [None]:
# Turn the predictions into 0/1 labels by thresholding at 0.5.
above_threshold = y_pred > 0.5
y_pred = above_threshold.astype(int)

# Use the provided gender_submission.csv as a template: replace its Survived
# column with the predictions and write the result without the index column.
sub = pd.read_csv('../input/titanic/gender_submission.csv').assign(Survived=y_pred)
sub.to_csv('submission.csv', index=False)