In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [16]:
X_full = pd.read_csv("train.csv")
X_test_full = pd.read_csv("test.csv")
X_all = pd.read_csv("train.csv")

X_full['Age'] = X_full.groupby(['Sex', 'Pclass'])['Age'].apply(lambda x: x.fillna(x.median()))

X_full.dropna(axis=0, subset=['Survived'], inplace=True)
y = X_full.Survived
X_full.drop(['Survived'], axis=1, inplace=True)

X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_full, y, train_size=0.7, test_size=0.3, random_state=3)

categorical_cols = [cname for cname in X_train_full.columns if
                    X_train_full[cname].nunique() < 10 and 
                    X_train_full[cname].dtype == "object"]

numerical_cols = [cname for cname in X_train_full.columns if 
                X_train_full[cname].dtype in ['int64', 'float64']]

my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

In [17]:
X_train.head()

Unnamed: 0,Sex,Embarked,PassengerId,Pclass,Age,SibSp,Parch,Fare
233,female,S,234,3,5.0,4,2,31.3875
229,female,S,230,3,21.5,3,1,25.4667
356,female,S,357,1,22.0,0,1,55.0
439,male,S,440,2,31.0,0,0,10.5
637,male,S,638,2,31.0,1,1,26.25


In [18]:
X_train.isna().sum()

Sex            0
Embarked       1
PassengerId    0
Pclass         0
Age            0
SibSp          0
Parch          0
Fare           0
dtype: int64

In [19]:
X_all.groupby(['Sex', 'Pclass']).median()['Age']

Sex     Pclass
female  1         35.0
        2         28.0
        3         21.5
male    1         40.0
        2         30.0
        3         25.0
Name: Age, dtype: float64

In [20]:
X_all_corr = X_all.corr().abs().unstack().sort_values(kind="quicksort", ascending=False).reset_index()
X_all_corr[X_all_corr['level_0'] == 'Age']

Unnamed: 0,level_0,level_1,0
5,Age,Age,1.0
12,Age,Pclass,0.369226
16,Age,SibSp,0.308247
21,Age,Parch,0.189119
26,Age,Fare,0.096067
31,Age,Survived,0.077221
36,Age,PassengerId,0.036847


In [21]:
X_all_corr[X_all_corr['level_0'] == 'Pclass']

Unnamed: 0,level_0,level_1,0
3,Pclass,Pclass,1.0
7,Pclass,Fare,0.5495
11,Pclass,Age,0.369226
13,Pclass,Survived,0.338481
27,Pclass,SibSp,0.083081
39,Pclass,PassengerId,0.035144
41,Pclass,Parch,0.018443


In [22]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier

numerical_transformer = SimpleImputer(strategy='constant') # Maybe remove as it only effects Age column

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [23]:
from datetime import datetime
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [24]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline

my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBClassifier(random_state=3))
])
n_e = np.arange(10)*25+850
#l_r = np.arange(10)*0.01+0.01
params = {
        'model__n_estimators': n_e,
        'model__learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
        'model__min_child_weight': [1, 5, 10],
        'model__gamma': [0.5, 1, 1.5, 2, 5],
        'model__subsample': [0.6, 0.8, 1.0],
        'model__colsample_bytree': [0.6, 0.8, 1.0],
        'model__max_depth': [3, 4, 5]
        }
folds=5
param_comb = 5
cv_method = RepeatedStratifiedKFold(n_repeats=3, n_splits=folds, random_state=3)

random_search = RandomizedSearchCV(estimator=my_pipeline, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=2, cv=cv_method, verbose=3, random_state=3 )

start_time = timer(None) # timing starts from this point for "start_time" variable
random_search.fit(X_train, y_train)
timer(start_time) # timing ends here for "start_time" variable

Fitting 15 folds for each of 5 candidates, totalling 75 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:   12.5s
[Parallel(n_jobs=2)]: Done  75 out of  75 | elapsed:   30.8s finished



 Time taken: 0 hours 0 minutes and 31.65 seconds.


In [25]:
print('\n All results:')
print(random_search.cv_results_)
print('\n Best estimator:')
print(random_search.best_estimator_)
print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
print(random_search.best_score_ * 2 - 1)
print('\n Best hyperparameters:')
print(random_search.best_params_)
results = pd.DataFrame(random_search.cv_results_)
results.to_csv('xgb-random-grid-search-results-01.csv', index=False)


 All results:
{'mean_fit_time': array([0.79024142, 0.57240699, 0.71708752, 0.67949166, 0.87628344]), 'std_fit_time': array([0.26929322, 0.00779442, 0.00654129, 0.01033302, 0.04043245]), 'mean_score_time': array([0.01057181, 0.01223377, 0.01230036, 0.011702  , 0.010171  ]), 'std_score_time': array([0.00202066, 0.00076907, 0.00059479, 0.00092602, 0.00175569]), 'param_model__subsample': masked_array(data=[0.6, 1.0, 1.0, 0.6, 0.8],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_model__n_estimators': masked_array(data=[1050, 900, 1025, 1000, 925],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_model__min_child_weight': masked_array(data=[5, 10, 10, 10, 1],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_model__max_depth': masked_array(data=[3, 3, 4, 4, 5],
             mask=[False, False, False, Fal

In [27]:
preds_test = random_search.predict(X_test)

In [None]:
output = pd.DataFrame({'PassengerId': X_test.PassengerId, 'Survived': preds_test})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

## KNeighbors

In [28]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV, RepeatedStratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline

pipe_knn = Pipeline([('knn', KNeighborsClassifier())])

params = {'knn__n_neighbors': [5, 10, 15, 20],
         'knn__leaf_size': [15, 30, 45, 60],
         'knn__p': [1, 2, 3]}

my_model = GridSearchCV(pipe_knn, params)

my_model.fit(X_train, y_train)

ValueError: could not convert string to float: 'female'

In [None]:
print(my_model.score(X_train,y_train))

## Random Forest

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, RepeatedStratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline

pipe_rf = Pipeline([('rf', RandomForestClassifier(n_jobs = 5))])

params = {'rf__bootstrap': [True, False],
         'rf__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
         'rf__max_features': ['auto', 'sqrt'],
         'rf__min_samples_leaf': [1, 2, 4],
         'rf__min_samples_split': [2, 5, 10],
         'rf__n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

my_model = GridSearchCV(pipe_rf, params)

my_model.fit(X_train, y_train)

ValueError: could not convert string to float: 'female'

In [None]:
print(my_model.score(X_train,y_train))

## Support Vector Machine
### Linear

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, RepeatedStratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn import svm

clf = svm.SVC(kernel='linear')

clf.fit(X_train, y_train)

In [None]:
print(clf.score(X_train,y_train))