In [7]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from pandas.plotting import scatter_matrix
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn import preprocessing

from sklearn.feature_selection import SelectKBest, chi2

#### __1. Explore the data__

In [4]:
train_data = pd.read_csv(os.path.expanduser("~/Desktop/Projects/api/data/titanic_train.csv"))
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [151]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [152]:
for feature in train_data:
    print("%s %d %r" %(feature, train_data[feature].nunique(), train_data[feature].dtypes))

PassengerId 891 dtype('int64')
Survived 2 dtype('int64')
Pclass 3 dtype('int64')
Name 891 dtype('O')
Sex 2 dtype('O')
Age 88 dtype('float64')
SibSp 7 dtype('int64')
Parch 7 dtype('int64')
Ticket 681 dtype('O')
Fare 248 dtype('float64')
Cabin 147 dtype('O')
Embarked 3 dtype('O')


#### __2. Selecting features and target variable__

In [153]:
features = list(train_data)
features.remove("Survived")
X = train_data[features]
X.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,2.0,20.125,0.0,0.0,7.9104
50%,446.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,3.0,38.0,1.0,0.0,31.0
max,891.0,3.0,80.0,8.0,6.0,512.3292


In [154]:
y = train_data["Survived"]
y.unique()

array([0, 1])

#### __3. Feature Selection__

In [155]:
# manual API operations -> parameters
X = X.drop("PassengerId", axis = 1) #

In [156]:
X.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
count,891.0,714.0,891.0,891.0,891.0
mean,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.42,0.0,0.0,0.0
25%,2.0,20.125,0.0,0.0,7.9104
50%,3.0,28.0,0.0,0.0,14.4542
75%,3.0,38.0,1.0,0.0,31.0
max,3.0,80.0,8.0,6.0,512.3292


In [157]:
# check for numeric features
num_features = []
nonnum_features = []
for feature in X:
    if X[feature].dtypes == np.int or X[feature].dtypes == np.float:
        num_features.append(feature)
    else:
        nonnum_features.append(feature)
        
nonnum_features

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

__3.1 Removing all categorical features with too many unique values (threshold = 10)__

In [158]:
rem_features = []
for feature in nonnum_features:
    if X[feature].unique().size >= 10:
        rem_features.append(feature)
        
rem_features

['Name', 'Ticket', 'Cabin']

In [159]:
for rem_feature in rem_features:
    nonnum_features.remove(rem_feature)

nonnum_features

['Sex', 'Embarked']

In [160]:
X = X.drop(rem_features, axis = 1)
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.2500,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.9250,S
3,1,female,35.0,1,0,53.1000,S
4,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,S
887,1,female,19.0,0,0,30.0000,S
888,3,female,,1,2,23.4500,S
889,1,male,26.0,0,0,30.0000,C


__3.2 Feature Selection based on statistical data types__

In [161]:
# caterorical y, categorical X
# chi2_test = SelectKBest(score_func = chi2)

#### __4. Transforming features to fit the model__

In [162]:
def create_dummies(df, column_name):
    dummies = pd.get_dummies(df[column_name], prefix=column_name)
    df = pd.concat([df,dummies], axis = 1)
    return df

In [163]:
if len(nonnum_features) != 0:
    X = create_dummies(X, nonnum_features)
    X = X.drop(nonnum_features, axis = 1)
    X = create_dummies(X, "Pclass") # manual API step
    X = X.drop("Pclass", axis = 1)
    num_features.remove("Pclass")
    X

#### __5. Dealing with missing values__

- Iterative Imputer class
- Models each feature as a function of others

In [164]:
# impute using only numerical features
imp = IterativeImputer(max_iter = 10, random_state = 42)
imp.fit(X[num_features])
X[num_features] = imp.transform(X[num_features])

In [165]:
num_features

['Age', 'SibSp', 'Parch', 'Fare']

In [166]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
Age           891 non-null float64
SibSp         891 non-null float64
Parch         891 non-null float64
Fare          891 non-null float64
Sex_female    891 non-null uint8
Sex_male      891 non-null uint8
Embarked_C    891 non-null uint8
Embarked_Q    891 non-null uint8
Embarked_S    891 non-null uint8
Pclass_1      891 non-null uint8
Pclass_2      891 non-null uint8
Pclass_3      891 non-null uint8
dtypes: float64(4), uint8(8)
memory usage: 34.9 KB


#### __6. Building the model__

In [167]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [168]:
clf = RandomForestClassifier(n_estimators = 100, max_depth = None) 
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

#### __7. Quantifying the quality of prediction__

In [169]:
y_predict = clf.predict(X_test)

In [170]:
print("Accuracy Score: ", accuracy_score(y_test, y_predict))

Accuracy Score:  0.7937219730941704


#### __8. Submission (Kaggle)__

In [171]:
# test_data = pd.read_csv(os.path.expanduser("~/Desktop/Projects/api/data/titanic_test.csv"))

In [172]:
# features = list(test_data)
# X = test_data[features]
# 
# X = X.drop("PassengerId", axis = 1)
# 
# num_features = []
# nonnum_features = []
# for feature in X:
#     if X[feature].dtypes == np.int or X[feature].dtypes == np.float:
#         num_features.append(feature)
#     else:
#         nonnum_features.append(feature)
# 
# rem_features = []
# for feature in nonnum_features:
#     if X[feature].unique().size >= 10:
#         rem_features.append(feature)
#         
# for rem_feature in rem_features:
#     nonnum_features.remove(rem_feature)
#     
# X = X.drop(rem_features, axis = 1)
# 
# if len(nonnum_features) != 0:
#     X = create_dummies(X, nonnum_features)
#     X = X.drop(nonnum_features, axis = 1)
#     X = create_dummies(X, "Pclass") # manual API step
#     X = X.drop("Pclass", axis = 1)
#     num_features.remove("Pclass")
#     X
#     
# imp = IterativeImputer(max_iter = 10, random_state = 42)
# imp.fit(X[num_features])
# X[num_features] = imp.transform(X[num_features])

In [173]:
# y_predict = clf.predict(X)

In [177]:
# output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': y_predict})
# output.describe()

Unnamed: 0,PassengerId,Survived
count,418.0,418.0
mean,1100.5,0.375598
std,120.810458,0.484857
min,892.0,0.0
25%,996.25,0.0
50%,1100.5,0.0
75%,1204.75,1.0
max,1309.0,1.0


In [178]:
# output.to_csv('my_submission.csv', index = False)
print("Successful")

Successful
