# [Titanic - Machine Learning from Disaster](https://www.kaggle.com/c/titanic/data)


## Imports

In [None]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score
from sklearn import linear_model

## Reading data

In [None]:
# Location of the labelled training split inside the Kaggle input directory.
train_path = '../input/titanic/train.csv'
train_data = pd.read_csv(filepath_or_buffer=train_path)

# Show the first three rows as the cell output.
train_data.head(n=3)

In [None]:
# Location of the test split inside the Kaggle input directory.
test_path = '../input/titanic/test.csv'
test_data = pd.read_csv(filepath_or_buffer=test_path)

# Show the first three rows as the cell output.
test_data.head(n=3)

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
train_data.info()

In [None]:
# Print column dtypes, non-null counts and memory usage for the test set.
test_data.info()

In [None]:
# Summarise only the object-dtype columns of the training set
# (count, number of unique values, most frequent value and its frequency).
train_data.describe(include=np.object_)

In [None]:
# Summary statistics for the numeric columns of the training set.
train_data.describe()

In [None]:
# Summarise only the object-dtype columns of the test set.
test_data.describe(include=np.object_)

In [None]:
# Summary statistics for the numeric columns of the test set, mirroring the
# numeric summary already produced for the training set.
test_data.describe()

## Visualization

In [None]:
# Correlate the numeric columns only: the frame also holds text columns
# (e.g. Sex), and DataFrame.corr() raises on those from pandas 2.0 onwards.
# Older pandas dropped them silently, so the resulting matrix is the same.
numeric_columns = train_data.select_dtypes(include=np.number)
corr = numeric_columns.corr()

fig, ax = plt.subplots(figsize=(10, 10))
# Draw on the axes created above; `linewidths` is heatmap's documented
# parameter for the width of the lines separating the cells.
ax = sns.heatmap(corr, annot=True, linewidths=0.5, ax=ax)
ax.set_title("Correlation matrix")
plt.show()

In [None]:
# Relabel the 0/1 target as No/Yes so the legend is readable, then draw the
# pairwise plots coloured by survival.
survived_labels = {"Survived": {0: "No", 1: "Yes"}}
labelled_train = train_data.replace(survived_labels)

ax = sns.pairplot(
    labelled_train,
    hue="Survived",
    diag_kind="hist",
    plot_kws={"alpha": 0.5},
)
# y > 1 places the title above the grid.
ax.fig.suptitle("Pairplots", y=1.08)

## Filling NaN

In [None]:
# Per-column count of missing values: train first, then test, separated by a
# ruler line.
missing_in_train = train_data.isna().sum()
missing_in_test = test_data.isna().sum()
print(missing_in_train, "=" * 40, missing_in_test, sep="\n")

### Age

In [None]:
# Mean training-set age for every (sex, passenger class) combination.
# Layout: ags[0] -> female, ags[1] -> male; inner index 0..2 -> Pclass 1..3.
ags = [
    [
        train_data.loc[
            (train_data['Sex'] == sex) & (train_data['Pclass'] == pclass), 'Age'
        ].mean()
        for pclass in range(1, 4)
    ]
    for sex in ['female', 'male']
]

In [None]:
# Replace missing training ages with the mean age of the matching
# (sex, passenger class) group computed in `ags`.
for gender, class_means in zip(['female', 'male'], ags):
    for pclass, mean_age in enumerate(class_means, start=1):
        needs_age = (
            (train_data['Sex'] == gender)
            & (train_data['Pclass'] == pclass)
            & train_data['Age'].isna()
        )
        train_data.loc[needs_age, 'Age'] = mean_age

In [None]:
# Replace missing test ages with the training-set group means from `ags`,
# so no statistic is derived from the test data itself.
for gender, class_means in zip(['female', 'male'], ags):
    for pclass, mean_age in enumerate(class_means, start=1):
        needs_age = (
            (test_data['Sex'] == gender)
            & (test_data['Pclass'] == pclass)
            & test_data['Age'].isna()
        )
        test_data.loc[needs_age, 'Age'] = mean_age

### Embarked

In [None]:
# Number of training rows per `Embarked` value; rows where Embarked is missing
# are not counted because groupby drops NaN keys by default.
train_data.groupby(['Embarked']).size()

In [None]:
# Fill missing embarkation values with 'S' (presumably the most frequent value
# according to the counts above; confirm against that output).
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the in-place form on a selected column is not guaranteed to update
# the parent frame (FutureWarning in pandas 2.x, no effect under Copy-on-Write).
train_data['Embarked'] = train_data['Embarked'].fillna('S')

### Fare

In [None]:
# Missing test fares are filled with the mean fare of the training set.
mean_train_fare = train_data['Fare'].mean()
test_data['Fare'] = test_data['Fare'].fillna(mean_train_fare)

### Cabin

In [None]:
# Mark missing cabins with 0 in both splits.
for frame in (train_data, test_data):
    frame['Cabin'] = frame['Cabin'].fillna(0)

## Feature Engineering

In [None]:
# def get_title(name):
#     title_search = re.search(' ([A-Za-z]+)\.', name)
#     if title_search:
#         return title_search.group(1)
#     return ""

# train_data['title']=train_data['Name'].apply(get_title)
# test_data['title']=test_data['Name'].apply(get_title)

# title_lev1=list(train_data['title'].value_counts().reset_index()['index'])
# title_lev2=list(test_data['title'].value_counts().reset_index()['index'])

# title_lev=list(set().union(title_lev1, title_lev2))
# print(title_lev)

# train_data['title']=pd.Categorical(train_data['title'], categories=title_lev)
# test_data['title']=pd.Categorical(test_data['title'], categories=title_lev)

## Get dummies

In [None]:
# One-hot encode the categorical columns, then drop the redundant female
# indicator (Sex_male carries the same information) together with the
# identifier and text columns that are not used as features.
dummy_columns = ["Pclass", "Embarked", "Sex"]
unused_columns = ['Sex_female', 'PassengerId', 'Name', 'Cabin', 'Ticket']

training = pd.get_dummies(train_data, columns=dummy_columns).drop(columns=unused_columns)

training.head()

In [None]:
# Apply the same encoding and column pruning to the test split.
dummy_columns = ["Pclass", "Embarked", "Sex"]
unused_columns = ['Sex_female', 'PassengerId', 'Name', 'Cabin', 'Ticket']

testing = pd.get_dummies(test_data, columns=dummy_columns).drop(columns=unused_columns)

testing.head()

In [None]:
# Target vector and feature matrix (every column except the target).
y = training['Survived']
X = training.drop(columns=['Survived'])

In [None]:
# Hold out 20% of the labelled rows for validation; the fixed random_state
# makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Predict

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline linear classifier fitted on the training part of the split.
lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train, y_train)

# Hold-out accuracy. scikit-learn metrics take (y_true, y_pred) in that order;
# accuracy is symmetric, so the score is unchanged, but the order now matches
# the API and the tuned-forest cell.
y_pred = lr_clf.predict(X_test)
accuracy_score(y_test, y_pred)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters; the seed keeps runs reproducible.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Hold-out accuracy, with arguments in scikit-learn's (y_true, y_pred) order.
y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)

#### Tuning


In [None]:
rfc = RandomForestClassifier(random_state=42)

# Exhaustive search over forest size, per-split feature budget, depth and
# split criterion.
# 'auto' is intentionally absent from max_features: for classifiers it was an
# alias of 'sqrt' (so it only duplicated work), it was deprecated in
# scikit-learn 1.1 and removed in 1.3, where every fit using it fails.
param_grid = {
    'n_estimators': [10, 30, 50, 100, 150, 200, 250],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8, 9],
    'criterion': ['gini', 'entropy'],
}

# 5-fold cross-validation on the training part of the split only.
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
CV_rfc.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
CV_rfc.best_params_

In [None]:
# Forest with fixed hyper-parameters (presumably copied from the grid-search
# result; re-check against CV_rfc.best_params_ after re-running the search).
# max_features='sqrt' replaces 'auto': the two were equivalent for classifiers,
# and 'auto' is rejected by scikit-learn >= 1.3.
rfc1 = RandomForestClassifier(
    random_state=42,
    max_features='sqrt',
    n_estimators=30,
    max_depth=6,
    criterion='entropy',
)
rfc1.fit(X_train, y_train)

# Hold-out accuracy of the tuned model.
y_pred = rfc1.predict(X_test)
accuracy_score(y_test, y_pred)

# Predict test data using the best model

In [None]:
# The train and test dummies were built by two independent get_dummies calls,
# so select the test columns explicitly in the order the model was fitted on.
# A column missing from `testing` fails here with a KeyError naming it.
Y_pred = rfc1.predict(testing[X_train.columns])

In [None]:
# Submission table: the passenger ids from the test file next to the
# predicted labels, in the same row order.
submission = test_data[["PassengerId"]].copy()
submission["Survived"] = Y_pred

In [None]:
# Write the submission to the working directory; index=False keeps the
# DataFrame index out of the file.
submission.to_csv('./submission.csv', index=False)