In [95]:
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
from sklearn.preprocessing import StandardScaler

def genderResult(sex):
    if (sex == 'male'):
        return 1
    else:
        return 0
    
def embarkedResult(embarked):
    if (embarked == 'S'):
        return 2
    elif (embarked == 'Q'):
        return 1
    else:
        return 0
    
def nameResult(name):
    if ((name.find('Jonkheer') or name.find('Don.') or name.find('Sir') or name.find('Countess') or name.find('Lady')) > -1):
        return 6
    if ((name.find('Capt') or name.find('Col') or name.find('Major') or name.find('Dr') or name.find('Rev')) > -1):
        return 5
    if ((name.find('Mrs') or name.find('Ms') or name.find('Mme')) > -1):
        return 4
    elif (name.find('Mr') > -1):
        return 3
    elif (name.find('Master') > -1):
        return 2
    elif ((name.find('Miss') or name.find('Mlle')) > -1):
        return 1
    else:
        return 0
    
def ageResult(age):
    if (embarked == 'S'):
        return 2
    elif (embarked == 'Q'):
        return 1
    else:
        return 0

df_train = pd.read_csv("train.csv", index_col=0)
df_train['Sex'] = df_train['Sex'].apply(genderResult)
df_train['Embarked'] = df_train['Embarked'].apply(embarkedResult)
df_train['Name'] = df_train['Name'].apply(nameResult)
df_train['SizeFamily'] = 1 + df_train['SibSp'] + df_train['Parch']
df_train['Solo'] = [1 if x == 1 else 0 for x in df_train['SizeFamily']]

median_age_men=df_train[df_train['Sex']==1]['Age'].median()
median_age_women=df_train[df_train['Sex']==0]['Age'].median()
df_train['Age'] = [median_age_men if x ==1 else median_age_women for x in df_train['Sex']]

mean_embarked_s=df_train[df_train['Embarked']==2]['Fare'].mean()
mean_embarked_q=df_train[df_train['Embarked']==1]['Fare'].mean()
mean_embarked_c=df_train[df_train['Embarked']==0]['Fare'].mean()
mean_fare = df_train["Fare"].mean()
mean_fare = int(mean_fare)
df_train["Fare"] = df_train["Fare"].fillna(mean_fare)
#df_train['Fare'] = [mean_embarked_s if x ==2 else mean_embarked_q if x ==1 else mean_embarked_c for x in df_train['Embarked']]


X_train = df_train.drop(['Survived','Ticket','Cabin','SibSp','Parch','Age','Name','SizeFamily'], axis=1)
y_train = df_train['Survived']

df_test = pd.read_csv("test.csv", index_col=0)
df_test['Sex'] = df_test['Sex'].apply(genderResult)
df_test['Embarked'] = df_test['Embarked'].apply(embarkedResult)
df_test['Name'] = df_test['Name'].apply(nameResult)
df_test['SizeFamily'] = 1 + df_test['SibSp'] + df_test['Parch']
df_test['Solo'] = [1 if x == 1 else 0 for x in df_test['SizeFamily']]
median_age_men=df_test[df_test['Sex']==1]['Age'].median()
median_age_women=df_test[df_test['Sex']==0]['Age'].median()
df_test['Age'] = [median_age_men if x ==1 else median_age_women for x in df_test['Sex']]
mean_fare = df_test["Fare"].mean()
mean_fare = int(mean_fare)
mean_embarked_s=df_test[df_test['Embarked']==2]['Fare'].mean()
mean_embarked_q=df_test[df_test['Embarked']==1]['Fare'].mean()
mean_embarked_c=df_test[df_test['Embarked']==0]['Fare'].mean()
#df_test['Fare'] = [mean_embarked_s if x ==2 else mean_embarked_q if x ==1 else mean_embarked_c for x in df_test['Embarked']]

df_test["Fare"] = df_test["Fare"].fillna(mean_fare)
X_test = df_test.drop(['Ticket','Cabin','SibSp','Parch','Age','Name','SizeFamily'], axis=1)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


# print out columns with nan
df_test.isna().any()


Pclass        False
Name          False
Sex           False
Age           False
SibSp         False
Parch         False
Ticket        False
Fare          False
Cabin          True
Embarked      False
SizeFamily    False
Solo          False
dtype: bool

In [2]:
def makeSubmissionFile(y_pred):
    fh = open("submission.csv", "w")
    fh.write("PassengerId,Survived\n")
    for i in range (len(y_pred)):
        pid = 892 + i
        fh.write(str(pid))
        fh.write(',')
        fh.write(str(y_pred[i]))
        fh.write('\n')
    fh.close()

In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
param_grid = { 
    'n_estimators': [50, 300, 400, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [2,4,5,6,7,8,10],
    'criterion' :['gini', 'entropy']
}

gridcv = GridSearchCV(RandomForestClassifier(random_state=10), param_grid=param_grid, cv=5, n_jobs=-1)
gridcv.fit(X_train, y_train)
print('grid search config: %s' % gridcv.best_params_)

grid search config: {'criterion': 'entropy', 'max_depth': 4, 'max_features': 'auto', 'n_estimators': 400}


In [32]:
clf = RandomForestClassifier(n_estimators=400, criterion='entropy', max_depth=4, max_features='auto')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
makeSubmissionFile(y_pred)

In [73]:
from sklearn.ensemble import GradientBoostingClassifier
params = {
    "learning_rate": [0.01, 0.025, 0.05, 0.075],
    "max_depth":[2,4],
    "n_estimators":[10,50,100],
    "criterion": ["friedman_mse",  "mae"],
    "min_samples_split": np.linspace(0.1, 0.5, 12),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12)
    }

clf = GridSearchCV(GradientBoostingClassifier(random_state=10), params, cv=10, n_jobs=-1)
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.best_score_)

{'criterion': 'friedman_mse', 'learning_rate': 0.075, 'max_depth': 4, 'min_samples_leaf': 0.13636363636363638, 'min_samples_split': 0.31818181818181823, 'n_estimators': 100}
0.8092259675405742


In [85]:
gradient = GradientBoostingClassifier(criterion='friedman_mse', n_estimators=100, max_depth=4, learning_rate=0.025, min_samples_leaf=0.1, min_samples_split=0.1)
gradient.fit(X_train, y_train)
y_pred = gradient.predict(X_test)
makeSubmissionFile(y_pred)

In [71]:
from sklearn.ensemble import AdaBoostClassifier
params = {
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "n_estimators":[10,50,100,150, 300]
    }

clf = GridSearchCV(AdaBoostClassifier(random_state=10), params, cv=10, n_jobs=-1)
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.best_score_)

{'learning_rate': 0.2, 'n_estimators': 300}
0.8204619225967541


In [72]:
ada = AdaBoostClassifier(n_estimators=300, learning_rate=0.2)
ada.fit(X_train, y_train)
y_pred = ada.predict(X_test)
makeSubmissionFile(y_pred)

In [34]:
from sklearn.svm import SVC
svc = SVC(kernel = 'rbf', random_state = 0)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
makeSubmissionFile(y_pred)

In [36]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty="l1", solver='liblinear')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
makeSubmissionFile(y_pred)


In [79]:
from catboost import CatBoostClassifier
params = {'iterations': [100,200,300,500,600,700,800],
          'depth': [2,3, 4, 5, 6],
          'loss_function': ['Logloss', 'CrossEntropy'],
          'l2_leaf_reg': np.logspace(-20, -19, 3),
          'leaf_estimation_iterations': [5, 10, 15],
          'logging_level':['Silent']
         }
clf = GridSearchCV(CatBoostClassifier(), params, cv=10, n_jobs=-1, scoring="accuracy")
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.best_score_)


{'depth': 3, 'iterations': 100, 'l2_leaf_reg': 1e-20, 'leaf_estimation_iterations': 5, 'logging_level': 'Silent', 'loss_function': 'Logloss'}
0.7868039950062422


In [93]:
cat = CatBoostClassifier(depth = 2, iterations = 300, l2_leaf_reg = 1e-20, leaf_estimation_iterations = 10, logging_level = 'Silent', loss_function = 'Logloss', random_seed=42)
cat.fit(X_train, y_train)
y_pred = cat.predict(X_test)
makeSubmissionFile(y_pred)


In [87]:
from xgboost import XGBClassifier
params     = {"subsample":[0.75, 1],
              "colsample_bytree":[0.75, 1],
              "max_depth":[4, 5, 6],
              "min_child_weight":[1, 5],
              "n_estimators":[100,300],
              "learning_rate": [0.01, 0.05, 0.1]}
clf = GridSearchCV(XGBClassifier(), params, cv=5, n_jobs=-1)
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.best_score_)



{'colsample_bytree': 0.75, 'learning_rate': 0.01, 'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 1}
0.8260372857949909


In [9]:
xgb = XGBClassifier(colsample_bytree = 1, learning_rate = 0.01, max_depth = 6, min_child_weight = 1, n_estimators = 100, subsample = 1)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
makeSubmissionFile(y_pred)
                    
                    



In [29]:
from sklearn.tree import DecisionTreeClassifier
params = {'criterion':['gini','entropy'],
          'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]}

clf = GridSearchCV(DecisionTreeClassifier(), params, cv=5, n_jobs=-1)
clf.fit(X_train, y_train)
print(clf.best_params_)

{'criterion': 'entropy', 'max_depth': 6}


In [30]:
decision_tree = DecisionTreeClassifier(criterion = 'entropy', max_depth=6)
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)
makeSubmissionFile(y_pred)

In [82]:
correlation = df_train.corr()
correlation_target = abs(correlation["Survived"])
print(correlation_target)

Survived      1.000000
Pclass        0.338481
Name          0.159986
Sex           0.543351
Age           0.543351
SibSp         0.035322
Parch         0.081629
Fare          0.257307
Embarked      0.174199
SizeFamily    0.016639
Solo          0.203367
Name: Survived, dtype: float64
