In [126]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.svm import SVC

In [113]:
# Rows with missing values are imputed rather than dropped: the submission
# site requires a prediction for every test row (418 rows).
feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']
categorical_cols = ['Sex', 'Cabin', 'Embarked']

train_raw = pd.read_csv('train.csv')
test_raw = pd.read_csv('test.csv')
test_ids = test_raw['PassengerId']

# Imputation values are computed once, from the training set only, and
# applied to both frames so train and test are filled consistently.
fill_values = {
    'Age': train_raw['Age'].mean(),
    'Fare': train_raw['Fare'].mean(),
    'Cabin': train_raw['Cabin'].mode()[0],
    'Embarked': train_raw['Embarked'].mode()[0],
}

# fillna returns new frames, so the column assignments below do not write
# into a slice of the raw data (no SettingWithCopyWarning).
train = train_raw[feature_cols + ['Survived']].fillna(fill_values)
test = test_raw[feature_cols].fillna(fill_values)

# Encode categoricals with ONE mapping per column, learned on the training
# set and reused for the test set. Fitting a separate LabelEncoder on each
# frame (as before) gives the same integer different meanings in train and
# test whenever the two frames contain different label sets (e.g. Cabin).
for col in categorical_cols:
    encoder = LabelEncoder().fit(train[col])
    label_to_code = {label: code for code, label in enumerate(encoder.classes_)}
    train[col] = encoder.transform(train[col])
    # Labels that never occur in the training set have no learned code;
    # they are all mapped to the sentinel -1.
    test[col] = test[col].map(label_to_code).fillna(-1).astype(int)

x_train = train[feature_cols]
y_train = train['Survived']

In [105]:
# Exhaustive 5-fold cross-validated search over forest size and tree depth
# (10 x 10 = 100 candidate settings).
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    'max_depth': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
}
# Random forests are stochastic: without a fixed random_state the selected
# parameters and the best score change from one run to the next.
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

{'max_depth': 10, 'n_estimators': 200}
0.8417864540832338


In [115]:
# Train the final random forest on the full training set and write the
# submission file.
# n_estimators=200 / max_depth=10 are the values reported in the recorded
# grid-search output; this cell previously used n_estimators=20, which looks
# like a dropped zero. random_state makes the submission reproducible.
clf = RandomForestClassifier(max_depth=10, n_estimators=200, random_state=42)
clf.fit(x_train, y_train)
pred = clf.predict(test)
output = pd.DataFrame({'PassengerId': test_ids,'Survived': pred})
output.to_csv('submission_randomforest.csv', index=False)

In [121]:
# 5-fold cross-validated search over XGBoost tree depth, learning rate and
# number of boosting rounds (5 x 2 x 5 = 50 candidate settings).
param_grid = {
    'max_depth': list(range(3, 12, 2)),                    # 3, 5, 7, 9, 11
    'learning_rate': [0.05, 0.1],
    'n_estimators': [100 * k for k in range(1, 6)],        # 100 .. 500
}
grid_search = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(x_train, y_train)
for search_result in (grid_search.best_params_, grid_search.best_score_):
    print(search_result)

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500}
0.8395518172117257


In [124]:
# Fit XGBoost on the full training set with the settings reported by the
# recorded grid search, then write the predictions as a submission file.
xgb_settings = dict(learning_rate=0.1, max_depth=3, n_estimators=500)
clf = XGBClassifier(**xgb_settings).fit(x_train, y_train)
pred = clf.predict(test)
output = pd.DataFrame({'PassengerId': test_ids}).assign(Survived=pred)
output.to_csv('submission_xgboost.csv', index=False)

In [127]:
# 5-fold cross-validated search over SVC settings.
# gamma is the RBF kernel coefficient and is ignored by the linear kernel,
# so it is searched only for 'rbf'. Crossing it with 'linear' (as a single
# dict grid does) refits every linear candidate 7 times with identical
# results. The set of distinct models searched is unchanged.
# Note: if a linear model wins, best_params_ will not contain a 'gamma' key.
svc_c_values = [0.01, 0.1, 1, 10, 100]
param_grid = [
    {
        'kernel': ['rbf'],
        'C': svc_c_values,
        'gamma': ['scale', 'auto', 0.0001, 0.001, 0.01, 0.1, 1],
    },
    {
        'kernel': ['linear'],
        'C': svc_c_values,
    },
]

# NOTE(review): the features are passed to SVC unscaled; SVMs are sensitive
# to feature scale, so standardizing inside a pipeline may be worth trying.
grid_search = GridSearchCV(SVC(), param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

{'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
0.7923608059757705


In [128]:
# Fit the RBF-kernel SVC on the full training set with the settings reported
# by the recorded grid search, then write the predictions as a submission file.
clf = SVC(kernel='rbf', gamma=0.001, C=100).fit(x_train, y_train)
pred = clf.predict(test)
submission_columns = {'PassengerId': test_ids, 'Survived': pred}
output = pd.DataFrame(submission_columns)
output.to_csv('submission_svc.csv', index=False)

I chose three models (random forest, XGBoost, and SVC) and trained each of them. The results are shown in the screenshot below.
![Screenshot of the results](image.png)