# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
%matplotlib inline

# Import Dataset

In [3]:
# Load the training data; 'train.csv' is a relative path, so the file must exist
# in the working directory (the recorded run below failed with FileNotFoundError).
train = pd.read_csv('train.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

In [None]:
# Preview the first five rows of the training frame.
train.head(5)

In [None]:
# Use PassengerId as the row index from here on.
train = train.set_index('PassengerId')

In [None]:
# Column dtypes and non-null counts.
train.info()

In [None]:
# Summary statistics of the training frame.
train.describe()

# Handling Missing Values

In [None]:
# Heatmap of the null mask: highlighted cells mark missing values.
fig, ax = plt.subplots(figsize=(12, 7))
sns.heatmap(train.isna(), ax=ax)

In [None]:
# Number of missing values per column.
train.isnull().sum()

In [None]:
# Split column names by dtype: object ('O') columns count as categorical,
# everything else as numerical.
categorical_columns, numerical_columns = [], []
for col_name, col_dtype in train.dtypes.items():
    if col_dtype == 'O':
        categorical_columns.append(col_name)
    else:
        numerical_columns.append(col_name)

In [None]:
# Keep only the columns of each kind that contain at least one missing value.
na_counts = train.isnull().sum()
categorical_with_na = [col for col in categorical_columns if na_counts[col] > 0]
numerical_with_na = [col for col in numerical_columns if na_counts[col] > 0]

In [None]:
# Numerical columns that contain missing values.
numerical_with_na

In [None]:
# Categorical columns that contain missing values.
categorical_with_na

In [None]:
# Pairwise relationships between the columns of the training frame.
sns.pairplot(train)

## 1) Drop row / column

In [None]:
# Work on a copy so the drop experiments leave `train` untouched.
train_drop = train.copy()

In [None]:
# Column Embarked has only 2 rows with missing values, so drop those rows.
# dropna has to be called on the frame (restricted to Embarked via `subset`):
# calling it on the `train_drop.Embarked` Series never removes rows from train_drop.
train_drop = train_drop.dropna(subset=['Embarked'])

In [None]:
# Column Cabin has more than 70% of its data missing, so remove the whole column.
train_drop = train_drop.drop(columns=['Cabin'])

## 2) Mean/Median Imputation

In [None]:
# Mean of the observed (non-missing) ages.
train['Age'].mean()

In [None]:
# Distribution of Age before any imputation.
# The style is set before the axes are created: seaborn styles only apply to
# axes created after the call, so the original order left this figure unstyled.
sns.set_style('whitegrid')
fig, axes = plt.subplots(1, 1, figsize=(7, 7))
sns.kdeplot(train.Age, ax=axes)

In [None]:
# Copy of train whose missing ages are replaced by the mean age.
mean_age = train['Age'].mean()
train_mean = train.assign(Age=train['Age'].fillna(mean_age))

In [None]:
# Copy of train whose missing ages are replaced by the median age.
# Despite its name, `train_mode` holds the median-imputed data, not the mode.
median_age = train['Age'].median()
train_mode = train.assign(Age=train['Age'].fillna(median_age))

In [None]:
# Overlay the Age densities of the original, mean-imputed and median-imputed frames.
fig, axes = plt.subplots(1, 1, figsize=(7, 7))
sns.set_style('whitegrid')
for frame, label in ((train, 'Original'), (train_mean, 'Mean'), (train_mode, 'Median')):
    sns.kdeplot(frame.Age, label=label, ax=axes)
axes.legend()

## 3) Most Frequent Imputation

In [None]:
# Distinct values of Embarked (including the missing marker, if any).
train['Embarked'].unique()

In [None]:
# Frequency of each Embarked value.
sns.countplot(x='Embarked', data=train)

In [None]:
# Separate copy for the most-frequent-value imputation.
train_frequent = train.copy()

In [None]:
# Fill the missing Embarked entries with 'S'.
# Assign the result back to the column: an in-place fillna on the
# `train_frequent.Embarked` accessor is a chained operation that pandas does not
# guarantee to propagate to the frame (it is a no-op under copy-on-write).
train_frequent['Embarked'] = train_frequent['Embarked'].fillna('S')

## 4) Unknown Value Imputation

In [None]:
# Separate copy for the unknown-value imputation.
train_unknown = train.copy()

In [None]:
# Distinct Cabin values before recoding.
train_unknown.Cabin.unique()

In [None]:
# Recode Cabin to a two-level flag: 'U' where the cabin is missing, 'K' where it is known.
# The filled Series is kept in a local and assigned back explicitly, because an
# in-place fillna on the `train_unknown.Cabin` accessor is not guaranteed to
# reach the frame.
cabin_filled = train_unknown['Cabin'].fillna('U')
train_unknown['Cabin'] = np.where(cabin_filled != 'U', 'K', 'U')

In [None]:
# How many rows ended up in each of the two Cabin levels.
train_unknown.Cabin.value_counts()

In [None]:
# Survival counts split by the known/unknown Cabin flag.
fig = plt.figure(figsize=(12, 7))
sns.countplot(data=train_unknown, x='Survived', hue='Cabin', ax=fig.gca())

##### Note: After replacing known values with 'K' and unknown values with 'U', we can see that more of the people who didn't survive have an unknown cabin type.

## 5) Prediction of Missing Values

In [None]:
# Available columns, used to pick the features for the Age model.
train.columns

In [None]:
# Candidate features for the Age model (Age itself is kept for now so rows with a
# missing Age can be filtered out later) and the Age target as a one-column frame.
age_model_columns = ['Pclass', 'Name', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Age']
X_train_age = train.loc[:, age_model_columns].copy()
y_train_age = train.loc[:, ['Age']].copy()

In [None]:
# Remove every feature (other than Age) that has missing values of its own.
incomplete_features = [
    name for name in X_train_age.columns
    if name != 'Age' and X_train_age[name].isnull().sum() > 0
]
X_train_age.drop(columns=incomplete_features, inplace=True)

In [None]:
# Keep only rows with a known Age, then remove Age from the feature matrix.
X_train_age = X_train_age.dropna(axis=0).drop(columns=['Age'])

In [None]:
# Target: only the rows whose Age is known.
y_train_age = y_train_age.dropna(axis=0)

In [None]:
# Sanity check: the row count should match y_train_age below.
X_train_age.shape

In [None]:
# Sanity check: the row count should match X_train_age above.
y_train_age.shape

In [None]:
# Name and Ticket are not used as features for the Age model.
X_train_age = X_train_age.drop(columns=['Name', 'Ticket'])

In [None]:
# Feature rows of the passengers whose Age is missing, restricted to the model's
# feature columns. Despite the name, `y_pred` holds model inputs, not predictions.
y_pred = train.loc[train.Age.isnull(), X_train_age.columns]

In [None]:
# Display the feature frame that will be fed to the encoder.
X_train_age

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode Sex (dropping the first level) and pass every other column through.
sex_encoder = OneHotEncoder(drop='first')
ct = ColumnTransformer(
    transformers=[('encoder', sex_encoder, ['Sex'])],
    remainder='passthrough',
)

# Fit on the rows with a known Age, then apply the same encoding to the rows to predict.
X_train_age = ct.fit_transform(X_train_age)
y_pred = ct.transform(y_pred)

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
y_scaler = StandardScaler()

# Column 0 is skipped (presumably the encoded Sex flag, since the transformer
# output precedes the passthrough columns); all remaining columns are standardised.
X_train_age[:, 1:] = scaler.fit_transform(X_train_age[:, 1:])
# Reuse the statistics learned on the training rows. Re-fitting on the prediction
# rows (the original `fit_transform`) would put them on a different scale from the
# one the regressor is trained on.
y_pred[:, 1:] = scaler.transform(y_pred[:, 1:])

# The target is standardised with its own scaler so predictions can be mapped back later.
y_train_age = y_scaler.fit_transform(y_train_age)

In [None]:
import xgboost as xgb

# Gradient-boosted regressor with the library's default hyper-parameters.
age_regressor = xgb.XGBRegressor()

In [None]:
# Train on the encoded/scaled features against the standardised Age target.
age_regressor.fit(X_train_age, y_train_age)

In [None]:
# Predict the standardised ages and map them back to years.
# predict() returns a 1-D array, while StandardScaler.inverse_transform expects
# 2-D input, so reshape to a single column first and flatten the result again.
scaled_age_pred = age_regressor.predict(y_pred)
null_age = y_scaler.inverse_transform(scaled_age_pred.reshape(-1, 1)).ravel()

In [None]:
# Separate copy that will receive the model-predicted ages.
train_pred_age = train.copy()

In [None]:
# Index labels of the passengers whose Age is missing in `train`;
# write the predicted ages into those rows of the copy.
missing_age_mask = train.Age.isnull()
null_age_id = train.index[missing_age_mask.to_numpy()]
train_pred_age.loc[null_age_id, 'Age'] = null_age

In [None]:
# Compare the Age density before and after the model-based imputation.
fig, axes = plt.subplots(1, 1, figsize=(7, 7))
sns.set_style('whitegrid')
for frame, label in ((train, 'Original'), (train_pred_age, 'Prediction')):
    sns.kdeplot(frame.Age, label=label, ax=axes)
axes.legend()

In [None]:
# Forward-fill imputation: each missing value takes the last valid value above it.
# The previous `train.fillna(method=)` had no method value and did not parse;
# `ffill()` is the explicit form of `fillna(method='ffill')`. The result is only
# displayed, `train` itself is not modified.
train.ffill()

## Actual Problem Solving

In [None]:
# Reload both datasets from the working directory; this rebinds `train`, so any
# modification made to it in the cells above is discarded from here on.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [None]:
# Use PassengerId as the row index of both frames.
train = train.set_index('PassengerId')
test = test.set_index('PassengerId')

In [None]:
# Name and Ticket are not used by the model; remove them from both frames.
unused_columns = ['Name', 'Ticket']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

In [None]:
# Missing-value map of the training frame.
fig, ax = plt.subplots(figsize=(12, 7))
sns.heatmap(train.isna(), ax=ax)

In [None]:
# Missing-value map of the test frame.
fig, ax = plt.subplots(figsize=(12, 7))
sns.heatmap(test.isna(), ax=ax)

### Embarked

In [None]:
# Fill the missing Embarked entries with 'S'. The result is assigned back to the
# column because an in-place fillna on the `train.Embarked` accessor is a chained
# operation that is not guaranteed to modify `train`.
train['Embarked'] = train['Embarked'].fillna('S')

### Fare

In [None]:
# Fill the missing test fares with the mean test fare. Assigned back explicitly:
# an in-place fillna on the `test.Fare` accessor is not guaranteed to modify `test`.
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

### Cabin

In [None]:
# Remove Cabin from both frames.
# NOTE(review): the two cells that follow still read `.Cabin`, and the Age feature
# list further down names 'Cabin' as well — confirm the intended cell order.
train = train.drop(columns=['Cabin'])
test = test.drop(columns=['Cabin'])

In [None]:
# Recode Cabin to 'K' (known) / 'U' (unknown).
# Guarded because the previous cell drops Cabin: without the check this cell
# raises AttributeError on a top-to-bottom run. The filled Series is assigned
# back explicitly rather than relying on an in-place fillna on the accessor.
if 'Cabin' in train.columns:
    train_cabin_filled = train['Cabin'].fillna('U')
    train['Cabin'] = np.where(train_cabin_filled != 'U', 'K', 'U')

In [None]:
# Recode Cabin to 'K' (known) / 'U' (unknown) in the test frame.
# Guarded because Cabin is dropped two cells above: without the check this cell
# raises AttributeError on a top-to-bottom run. The filled Series is assigned
# back explicitly rather than relying on an in-place fillna on the accessor.
if 'Cabin' in test.columns:
    test_cabin_filled = test['Cabin'].fillna('U')
    test['Cabin'] = np.where(test_cabin_filled != 'U', 'K', 'U')

### Age

In [None]:
# Mean-imputed Age, kept as a separate baseline instead of being written back.
# The cells below select the rows where `Age.isnull()` and predict their ages
# with a regressor; filling train/test here would leave no such rows. The original
# in-place fillna on the `.Age` accessor was also not guaranteed to modify the
# frames, so its effect depended on the pandas version.
train_age_mean_filled = train['Age'].fillna(train['Age'].mean())
test_age_mean_filled = test['Age'].fillna(test['Age'].mean())

In [None]:
# Feature names considered for the Age model.
# NOTE(review): this list is not read by the following cells, and the name
# `column` is rebound by the `for column in ...` loops two cells below.
column = ['Pclass', 'Sex', 'SibSp', 'Parch','Fare', 'Cabin','Embarked', 'Age']

In [None]:
# Build the feature/target frames for the Age model from both datasets.
age_model_columns = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked', 'Age']
# Keep only the columns that exist in both frames: Cabin is dropped in an earlier
# cell, and selecting a missing column raises KeyError.
age_model_columns = [
    name for name in age_model_columns
    if name in train.columns and name in test.columns
]

X_train_age = train[age_model_columns].copy()
y_train_age = train[['Age']].copy()
X_test_train_age = test[age_model_columns].copy()
y_test_train_age = test[['Age']].copy()

In [None]:
# In each feature frame, remove every column (other than Age) that still has
# missing values of its own.
for age_frame in (X_train_age, X_test_train_age):
    incomplete_features = [
        name for name in age_frame.columns
        if name != 'Age' and age_frame[name].isnull().sum() > 0
    ]
    age_frame.drop(columns=incomplete_features, inplace=True)

In [None]:
# Keep only rows with a known Age, then remove Age from the feature matrices.
X_train_age = X_train_age.dropna(axis=0).drop(columns=['Age'])
X_test_train_age = X_test_train_age.dropna(axis=0).drop(columns=['Age'])

In [None]:
# Targets: only the rows whose Age is known.
y_train_age = y_train_age.dropna(axis=0)
y_test_train_age = y_test_train_age.dropna(axis=0)

In [None]:
# Stack the known-Age feature rows of train and test into one training matrix.
age_X_train = pd.concat([X_train_age, X_test_train_age])

In [None]:
# Matching Age targets, stacked in the same train-then-test order.
age_y_train = pd.concat([y_train_age, y_test_train_age])

In [None]:
# Feature rows of the passengers whose Age is missing, per dataset.
# Despite the `y_` prefix these hold model inputs, not predictions.
y_train_pred = train.loc[train.Age.isnull(), X_train_age.columns]
y_test_pred = test.loc[test.Age.isnull(), X_test_train_age.columns]

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Encode only the categorical columns actually present in the training frame:
# Cabin (and any column removed for having missing values) may be gone by now,
# and naming an absent column makes the ColumnTransformer fail.
age_categorical_columns = [
    name for name in ['Sex', 'Cabin', 'Embarked'] if name in age_X_train.columns
]

age_ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(drop='first'), age_categorical_columns)],
    remainder='passthrough',
)

# Fit on the known-Age rows, then apply the same encoding to the rows to predict.
age_X_train = age_ct.fit_transform(age_X_train)

y_train_pred = age_ct.transform(y_train_pred)
y_test_pred = age_ct.transform(y_test_pred)

In [None]:
from sklearn.preprocessing import StandardScaler

# Features: learn the scaling on the training matrix and reuse it for both
# prediction matrices.
age_scaler = StandardScaler()
age_X_train = age_scaler.fit(age_X_train).transform(age_X_train)
y_train_pred = age_scaler.transform(y_train_pred)
y_test_pred = age_scaler.transform(y_test_pred)

# Target: standardised with its own scaler so predictions can be mapped back.
age_y_scaler = StandardScaler()
age_y_train = age_y_scaler.fit(age_y_train).transform(age_y_train)

In [None]:
import xgboost as xgb

# Gradient-boosted regressor with the library's default hyper-parameters.
age_regressor = xgb.XGBRegressor()

In [None]:
# Train on the combined train+test rows with a known Age.
age_regressor.fit(age_X_train, age_y_train)

In [None]:
# Write the predicted ages into the rows whose Age is missing.
# predict() returns a 1-D array, while StandardScaler.inverse_transform expects
# 2-D input, so each prediction vector is reshaped to one column and flattened again.
train_age_missing = train.Age.isnull()
test_age_missing = test.Age.isnull()

train.loc[train_age_missing, 'Age'] = age_y_scaler.inverse_transform(
    age_regressor.predict(y_train_pred).reshape(-1, 1)
).ravel()
test.loc[test_age_missing, 'Age'] = age_y_scaler.inverse_transform(
    age_regressor.predict(y_test_pred).reshape(-1, 1)
).ravel()

## One hot Encoding

In [None]:
# Separate the Survived target from the feature columns.
y_train = train['Survived'].copy()
X_train = train.drop(columns=['Survived']).copy()

In [None]:
# The test frame has no target column, so it is used as-is (copied).
X_test = test.copy()

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode Sex and Embarked (dropping the first level of each) and pass
# every other column through unchanged.
categorical_encoder = OneHotEncoder(drop='first')
ct = ColumnTransformer(
    transformers=[('encoder', categorical_encoder, ['Sex', 'Embarked'])],
    remainder='passthrough',
)

# Fit on the training features only; the test features reuse the fitted encoding.
X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)

## Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling on the training features and apply it to both matrices.
scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Classification

In [None]:
import xgboost as xgb

# Gradient-boosted classifier with the library's default hyper-parameters.
classifier = xgb.XGBClassifier()

In [None]:
# Train on the full training set; the target is passed as a plain array.
survived_labels = np.array(y_train)
classifier.fit(X_train, survived_labels)

In [None]:
# Survival predictions for the test set. This rebinds `y_pred`, which an earlier
# section used for a feature matrix.
y_pred = classifier.predict(X_test)

In [None]:
# Submission frame: one row per test passenger with the predicted label.
output = pd.DataFrame({'PassengerId': test.index, 'Survived': y_pred})

In [None]:
# Index by PassengerId so the CSV's first column is the passenger id.
output = output.set_index('PassengerId')

In [None]:
import datetime
import os

# Write the submission to Output/<unix timestamp>.csv so successive runs do not
# overwrite each other. The directory is created first: to_csv does not create
# missing parent directories and would fail on a fresh checkout.
ts = datetime.datetime.now().timestamp()
os.makedirs('Output', exist_ok=True)
output.to_csv('Output/' + str(ts) + '.csv')

In [None]:
# Grid search over max_depth for the XGBoost classifier.
from sklearn.model_selection import train_test_split, GridSearchCV

# Hold out 20% of the training rows; the hold-out is passed below as the
# evaluation set for early stopping.
x_train_1, x_valid_1, y_train_1, y_valid_1 = train_test_split(X_train, 
                                                      np.array(y_train), 
                                                      test_size=.2, 
                                                      random_state=1)

clf = xgb.XGBClassifier(random_state=1, verbosity=0, use_label_encoder=False)
# Only max_depth actually varies; n_estimators and learning_rate have a single value.
hyperparameters = {
    'n_estimators': [1000],
    'learning_rate': [.1],
    'max_depth': [2, 3, 4, 8]
                   }

# Extra keyword arguments handed to gs.fit() below together with verbose=0.
# NOTE(review): whether the estimator's fit() accepts early_stopping_rounds and
# eval_metric depends on the installed xgboost version — confirm.
# NOTE(review): the same hold-out set is used for early stopping in every CV fold.
fit_params={'early_stopping_rounds':15, 
            'eval_metric': 'error', 
            'eval_set':[[x_valid_1, y_valid_1]]}

# 10-fold cross-validated search on the 80% split.
gs = GridSearchCV(clf, param_grid=hyperparameters, verbose=0, cv=10)
gs.fit(x_train_1, y_train_1, **fit_params, verbose=0)

best_params = gs.best_params_
best_score = gs.best_score_

print(best_params) 
print(best_score)

In [None]:
# Final classifier configuration. max_depth is hard-coded to 4 here rather than
# read from `best_params`.
final_model_params = {
    'random_state': 1,
    'n_estimators': 1000,
    'learning_rate': .1,
    'verbosity': 0,
    'use_label_encoder': False,
    'objective': 'binary:logistic',
    'max_depth': 4,
}
xgboost = xgb.XGBClassifier(**final_model_params)

In [None]:
# Train on the 80% split, with the 20% hold-out passed as the evaluation set
# for early stopping.
# NOTE(review): passing early_stopping_rounds / eval_metric to fit() depends on
# the installed xgboost version — confirm.
xgboost.fit(x_train_1, y_train_1, 
            early_stopping_rounds = 5, 
            eval_metric = 'error', 
            eval_set = [[x_valid_1, y_valid_1]], verbose = 0)

In [None]:
# Test-set predictions of the tuned model.
y_pred_2 = xgboost.predict(X_test)

In [None]:
# Submission frame for the tuned model. It must use `y_pred_2` (the tuned model's
# predictions): `y_pred` holds the first classifier's predictions, so using it
# would write the first submission out a second time.
output = pd.DataFrame({'PassengerId': test.index, 'Survived': y_pred_2})

In [None]:
# Index by PassengerId so the CSV's first column is the passenger id.
output = output.set_index('PassengerId')

In [None]:
import datetime
import os

# Write the submission to Output/<unix timestamp>.csv. The directory is created
# first: to_csv does not create missing parent directories.
ts = datetime.datetime.now().timestamp()
os.makedirs('Output', exist_ok=True)
output.to_csv('Output/' + str(ts) + '.csv')

# Other Ideas

In [None]:
# Distinct Embarked values in the current `train` frame.
train['Embarked'].unique()

In [None]:
def compute_age_category(row):
    """Map a passenger row to a coarse age bucket.

    Parameters
    ----------
    row : object with an ``Age`` attribute
        Typically a DataFrame row passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    str or None
        ``'infant'`` for Age < 10, ``'teen'`` for Age < 20, ``'adult'`` for
        Age < 55, ``'old'`` otherwise; ``None`` when Age is missing.
    """
    age = row.Age

    # Every comparison with NaN is False, so without this guard a missing age
    # would fall through all branches and be labelled 'old'.
    if pd.isna(age):
        return None

    if age < 10:
        return 'infant'
    if age < 20:
        return 'teen'
    if age < 55:
        return 'adult'
    return 'old'

In [None]:
# Work on a copy of the frame that carries the model-predicted ages.
train_embarked = train_pred_age.copy()

In [None]:
# Bucket every passenger's age row by row; `train_embarked` holds the same rows
# as `train_pred_age`, so the categories are computed from the copy itself.
age_classes = train_embarked.apply(compute_age_category, axis=1)
train_embarked['Age_class'] = age_classes

In [None]:
# Embarked counts split by age bucket.
sns.countplot(x='Embarked', data=train_embarked, hue='Age_class')

In [None]:
# Rows whose Embarked value is missing.
train_embarked[train_embarked.Embarked.isnull()]

In [None]:
# Set Embarked to 'S' for PassengerId 62 and 830 (presumably the rows listed by
# the previous cell — verify against its output).
for passenger_id in (62, 830):
    train_embarked.loc[passenger_id, 'Embarked'] = 'S'

In [None]:
# Scatter of Fare coloured by Embarked.
# NOTE(review): x and y are both 'Fare', so every point lies on the diagonal —
# confirm which variable was meant for the other axis.
sns.scatterplot(x='Fare', y='Fare', data=train_embarked, hue='Embarked')

In [None]:
# Embarked distribution of passengers whose fare lies strictly between 65 and 95.
# The mask is built from `train_embarked` and applied to `train`.
fare_between_65_and_95 = (train_embarked.Fare > 65) & (train_embarked.Fare < 95)
train[fare_between_65_and_95].groupby('Embarked').size()

In [None]:
# Embarked distribution of passengers whose fare lies strictly between 79 and 81.
# The mask is built from `train_embarked` and applied to `train`.
fare_between_79_and_81 = (train_embarked.Fare > 79) & (train_embarked.Fare < 81)
train[fare_between_79_and_81].groupby('Embarked').size()