# Importing Libraries

In [184]:
# sklearn for ML
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split

# computation libraries used
import pandas as pd
import numpy as np

#### graphing libraries ####
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Processing DataSet

In [185]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

### Creating Classes for Age attribute

In [186]:
def process_age(df, cuts, label_names):
    df["Age"] = df["Age"].fillna(-0.5)
    df["Age_Categories"] = pd.cut(df["Age"], cuts, labels=label_names)
    return df

In [187]:
cuts = [-1, 0, 5, 12, 18, 35, 60, 100]
labels = ["Missing", "Infant", "Child", "Teenager", "Young Adult", "Adult", "Senior"]

In [188]:
train = process_age(train, cuts, labels)
test = process_age(test, cuts, labels)

### Creating dummies

In [189]:
def create_dummies(df, col):
    dummies = pd.get_dummies(df[col], prefix=col)
    df = pd.concat([df, dummies], axis=1)
    return df

In [190]:
train = create_dummies(train, "Pclass")
test = create_dummies(test, "Pclass")

In [191]:
train = create_dummies(train, "Sex")
test = create_dummies(test, "Sex")

In [192]:
train = create_dummies(train, "Age_Categories")
test = create_dummies(test, "Age_Categories")

In [193]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 25 columns):
PassengerId                   891 non-null int64
Survived                      891 non-null int64
Pclass                        891 non-null int64
Name                          891 non-null object
Sex                           891 non-null object
Age                           891 non-null float64
SibSp                         891 non-null int64
Parch                         891 non-null int64
Ticket                        891 non-null object
Fare                          891 non-null float64
Cabin                         204 non-null object
Embarked                      889 non-null object
Age_Categories                891 non-null category
Pclass_1                      891 non-null uint8
Pclass_2                      891 non-null uint8
Pclass_3                      891 non-null uint8
Sex_female                    891 non-null uint8
Sex_male                      891 non-null uint8
A

# Preparing Dataset

In [194]:
cols = ['Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male', 'Age_Categories_Missing','Age_Categories_Infant', 'Age_Categories_Child', 'Age_Categories_Teenager', 'Age_Categories_Young Adult', 'Age_Categories_Adult', 'Age_Categories_Senior']

X = train[cols]
y = train["Survived"]

### Splitting into train and test

#### Using StratisfiedShuffleSplit

In [195]:
strat_split = StratifiedShuffleSplit(n_splits=1, train_size=0.9, random_state=42)

for train_index, test_index in strat_split.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]



In [196]:
strat_split_val = StratifiedShuffleSplit(n_splits=1, train_size=0.9, random_state=42)

for train_index, val_index in strat_split.split(X_train, y_train):
    X_train, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train, y_val = y_train.iloc[train_index], y_train.iloc[val_index]



#### Using test_train_split

In [197]:
X_train, X_test, y_train, y_test = train_test_split(train[cols], train["Survived"], test_size=0.1, random_state=0)

### Scaling

In [198]:
scaler = StandardScaler()

X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_val_scaled = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)

# Fitting Models

## K Nearest Neighbours

In [199]:
knn = KNeighborsClassifier(n_jobs=-1)

In [200]:
knn.fit(X_train_scaled, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
           weights='uniform')

In [201]:
print(accuracy_score(y_val, knn.predict(X_val_scaled)))
print(confusion_matrix(y_val, knn.predict(X_val_scaled)))

0.8024691358024691
[[47  3]
 [13 18]]


## Logistic Regression

In [202]:
log_reg = LogisticRegression(n_jobs=-1)

In [203]:
log_reg.fit(X_train, y_train)

  " = {}.".format(self.n_jobs))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [204]:
print(accuracy_score(y_val, log_reg.predict(X_val)))
print(confusion_matrix(y_val, log_reg.predict(X_val)))

0.7901234567901234
[[46  4]
 [13 18]]


### SGD Classifier

In [205]:
sgd_cls = SGDClassifier(n_jobs=-1)

In [206]:
sgd_cls.fit(X_train_scaled, y_train)



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=-1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [207]:
print(accuracy_score(y_val, sgd_cls.predict(X_val_scaled)))
print(confusion_matrix(y_val, sgd_cls.predict(X_val_scaled)))

0.7037037037037037
[[44  6]
 [18 13]]


### SVC

In [208]:
svc_cls = SVC()

In [209]:
svc_cls.fit(X_train_scaled, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [210]:
print(accuracy_score(y_val, svc_cls.predict(X_val_scaled)))
print(confusion_matrix(y_val, svc_cls.predict(X_val_scaled)))

0.8271604938271605
[[48  2]
 [12 19]]


### Decision Tree Classifier

In [211]:
tree = DecisionTreeClassifier()

In [212]:
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [213]:
print(accuracy_score(y_val, tree.predict(X_val)))
print(confusion_matrix(y_val, tree.predict(X_val)))

0.8271604938271605
[[48  2]
 [12 19]]


### Random Forest Classifier

In [214]:
rf = RandomForestClassifier(n_jobs=-1, oob_score=True)

In [215]:
rf.fit(X_train, y_train)

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [216]:
print(rf.oob_score_)
print(accuracy_score(y_val, rf.predict(X_val)))
print(confusion_matrix(y_val, rf.predict(X_val)))

0.7940074906367042
0.8024691358024691
[[46  4]
 [12 19]]


### GradientBoosting Classifier

In [217]:
xgb_cls = GradientBoostingClassifier()

In [218]:
xgb_cls.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [219]:
print(accuracy_score(y_val, xgb_cls.predict(X_val)))
print(confusion_matrix(y_val, xgb_cls.predict(X_val)))

0.7901234567901234
[[46  4]
 [13 18]]


### Extremely Randomised Trees

In [220]:
etree = ExtraTreesClassifier(oob_score=True, n_jobs=-1, bootstrap=True)

In [221]:
etree.fit(X_train, y_train)

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=True, random_state=None, verbose=0, warm_start=False)

In [222]:
print(accuracy_score(y_val, etree.predict(X_val)))
print(etree.oob_score_)
print(confusion_matrix(y_val, etree.predict(X_val)))

0.8148148148148148
0.8002496878901373
[[48  2]
 [13 18]]


# Cross-Validation

### Etree

In [223]:
etree_scores = cross_val_score(etree, X_train, y_train, cv=10, n_jobs=-1)

In [224]:
print(np.mean(etree_scores))
print(np.std(etree_scores))

0.7965924363181747
0.04120667575176045


### GBC

In [225]:
xgb_cls_scores = cross_val_score(xgb_cls, X_train, y_train, cv=10, n_jobs=-1)

In [226]:
print(np.mean(xgb_cls_scores))
print(np.std(xgb_cls_scores))

0.8017031958118457
0.04384728170409986


### LR

In [227]:
log_reg_scores = cross_val_score(log_reg, X_train, y_train, cv=10, n_jobs=-1)

In [228]:
print(np.mean(log_reg_scores))
print(np.std(log_reg_scores))

0.8116410767307393
0.03366393061251184


### RF

In [229]:
rf_scores = cross_val_score(rf, X_train, y_train, cv=10, n_jobs=-1)

In [230]:
print(np.mean(rf_scores))
print(np.std(rf_scores))

0.79917194092827
0.04904239447320048


### DTC

In [234]:
tree_scores = cross_val_score(tree, X_train, y_train, cv=10, n_jobs=-1)

In [236]:
print(np.mean(tree_scores))
print(np.std(tree_scores))

0.8041565088295046
0.038624642292250135


# Saving Predictions

In [231]:
dic = {
    "PassengerId" : test["PassengerId"],
    "Survived" : rf.predict(test[cols])
}

In [232]:
result = pd.DataFrame(dic)

In [233]:
result.to_csv("result_rf.csv", index=False)