In [232]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from IPython.core.pylabtools import figsize

from pprint import pprint

#ML libraries
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

#metrics
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings('ignore')

In [233]:
#загрузка данных train.scv
data = pd.read_csv(os.getcwd() + '/train.csv')
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [234]:
#data.describe()

In [235]:
# 'Age', 'Cabin'(№ каюты), 'Embarked'(порт посадки) - есть пустые значения (missing values)
# 'Sex' - нужно перекодировать 0-1

# какие признаки нужно выкинуть?
## 'Name', 'Ticket', 'Cabin', 'PassengerId' - точно не влияют не выживание
## 'Embarked' - оставлю для варианта №1, возможно есть корреляция с выживаемостью
## 'Embarked' - нужно перекодировать 1-2-3 (для последующей замены на фиктивные переменные)

# 'SibSp' - # братьев и сестер / супругов на борту "Титаника"
# 'Parch' - # родителей / детей на борту "Титаника"
## возможно, их стоит перекодировать в 0-1 
## или заменить одним признаком 'relatives on board' (родственники на борту) 0-1

In [236]:
data = data.drop(['PassengerId', 'Cabin', 'Ticket', 'Name', 'SibSp', 'Parch'], axis = 1)

In [237]:
#кодируем категорильные переменные целочисленными признаками
def embarked_encode(string):
    if string == 'C':
        return 1
    elif string == 'Q':
        return 2
    elif string == 'S':
        return 3
    else:
        return 1

data['Embarked'] = [embarked_encode(x) for x in data['Embarked']]
data['Sex'] = [0 if x == 'female' else 1 for x in data['Sex']]
data['Age'] = [data['Age'].mean() if np.isnan(x) else x for x in data['Age']]
#data['SibSp'] = [1 if x > 0 else 0 for x in data['SibSp']]
#data['Parch'] = [1 if x > 0 else 0 for x in data['Parch']]

In [238]:
#for column in X_features.columns:
#    X_features.hist(column)
data

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,0,3,1,22.000000,7.2500,3
1,1,1,0,38.000000,71.2833,1
2,1,3,0,26.000000,7.9250,3
3,1,1,0,35.000000,53.1000,3
4,0,3,1,35.000000,8.0500,3
...,...,...,...,...,...,...
886,0,2,1,27.000000,13.0000,3
887,1,1,0,19.000000,30.0000,3
888,0,3,0,29.699118,23.4500,3
889,1,1,1,26.000000,30.0000,1


In [239]:
#посмотрим корреляцию
corr_data = data.corr()['Survived'].sort_values()
corr_data

Sex        -0.543351
Pclass     -0.338481
Embarked   -0.174199
Age        -0.069809
Fare        0.257307
Survived    1.000000
Name: Survived, dtype: float64

In [240]:
corr_matrix = data.corr(method = 'pearson')
corr_matrix

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
Survived,1.0,-0.338481,-0.543351,-0.069809,0.257307,-0.174199
Pclass,-0.338481,1.0,0.1319,-0.331339,-0.5495,0.170334
Sex,-0.543351,0.1319,1.0,0.084153,-0.182333,0.115513
Age,-0.069809,-0.331339,0.084153,1.0,0.091566,-0.035479
Fare,0.257307,-0.5495,-0.182333,0.091566,1.0,-0.229304
Embarked,-0.174199,0.170334,0.115513,-0.035479,-0.229304,1.0


In [241]:
data = data.drop(['Embarked'], axis = 1)

In [242]:
#еще попробуем убрать проблему коллинеарности в переменной 'Pclass'
# преобразуем все категориальные переменные в тип данных 'category' 
data['Pclass'] = data['Pclass'].astype('category')
#data['Embarked'] = data['Embarked'].astype('category')
# get dummies for all features but remove the first one(avoid multicollinearity) also called Dummy Variable Trap
data_encoded = pd.get_dummies(data,drop_first= True)

In [243]:
data_encoded

Unnamed: 0,Survived,Sex,Age,Fare,Pclass_2,Pclass_3
0,0,1,22.000000,7.2500,0,1
1,1,0,38.000000,71.2833,0,0
2,1,0,26.000000,7.9250,0,1
3,1,0,35.000000,53.1000,0,0
4,0,1,35.000000,8.0500,0,1
...,...,...,...,...,...,...
886,0,1,27.000000,13.0000,1,0
887,1,0,19.000000,30.0000,0,0
888,0,0,29.699118,23.4500,0,1
889,1,1,26.000000,30.0000,0,0


In [244]:
#разделим массив
target = data_encoded['Survived']
#X_features = data_encoded.drop(['Survived', 'Age', 'Fare'], axis = 1)
X_features = data_encoded.drop(['Survived'], axis = 1)

In [245]:
train_X, test_X, train_y, test_y = train_test_split(X_features, target, 
                                                    test_size=0.2, shuffle = True, random_state=1)

In [246]:
#scalling features
#scaler = StandardScaler()
scaler = MinMaxScaler()
#scaler.fit(train_X)
train_X_scaled = scaler.fit_transform(train_X)
test_X_scaled = scaler.transform(test_X)

In [194]:
#i select the number of main components to achieve maximum 'roc_auc_score' metric, it turned out "5"
#pca = PCA(n_components=5)
#pca.fit(train_X_scaled)
#train_X_scaled_pca = pca.fit_transform(train_X_scaled)
#test_X_scaled_pca = pca.transform(test_X_scaled)

-----------XGBC----------

In [247]:
import xgboost as xgb
from sklearn import  metrics
from xgboost.sklearn import XGBClassifier  
from sklearn.model_selection import GridSearchCV, cross_val_score
import matplotlib.pylab as plt 


def modelMetrics(clf, train_x, train_y, test_x, test_y, isCv = True, cv_folds = 5, early_stopping_rounds = 50):  
    if isCv:  
        xgb_param = clf.get_xgb_params()  
        xgtrain = xgb.DMatrix(train_x,label=train_y)  
        cvresult = xgb.cv(xgb_param,xgtrain,num_boost_round=clf.get_params()['n_estimators'],nfold=cv_folds,  
                          metrics='auc',early_stopping_rounds=early_stopping_rounds)# Показать ли текущее количество деревьев
        clf.set_params(n_estimators=cvresult.shape[0])
        
        print(f'n_estimators = {cvresult.shape[0]}')
        #print(cvresult)
  

    clf.fit(train_x,train_y,eval_metric='auc')  
  
    # Прогноз train
    train_predictions = clf.predict(train_x)  
    train_predprob = clf.predict_proba(train_x)[:,1]# 1 Вероятность
  
    # Печать 
    print("\nModel Report")  
    print("Accuracy : %.4g" % metrics.accuracy_score(train_y, train_predictions))  
    print("AUC Score (Train): %f" % metrics.roc_auc_score(train_y, train_predprob))  
    
    
    # Прогноз test
    test_predictions = clf.predict(test_x)  
    test_predprob = clf.predict_proba(test_x)[:,1]# 1 Вероятность
  
    # Печать
    print("\nModel Report")  
    print("Accuracy : %.4g" % metrics.accuracy_score(test_y, test_predictions))  
    print("AUC Score (Test): %f" % metrics.roc_auc_score(test_y, test_predprob)) 

    #feat_imp = pd.Series(clf.booster().get_fscore()).sort_values(ascending=False)  
    #feat_imp.plot(kind='bar',title='Feature importance')  
    #plt.ylabel('Feature Importance Score')

In [259]:
xgb1 = XGBClassifier(learning_rate=0.1,
                     n_estimators=1000,
                     max_depth=9,
                     min_child_weight=1,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective= 'binary:logistic',
                     nthread=4,
                     scale_pos_weight=1,
                     seed=27)  

modelMetrics(xgb1, train_X_scaled, train_y, test_X_scaled, test_y)

n_estimators = 11

Model Report
Accuracy : 0.882
AUC Score (Train): 0.936413

Model Report
Accuracy : 0.7765
AUC Score (Test): 0.852869


In [275]:
#настройка параметров
#шаг №2 
param_test1 = {
    'max_depth':range(3,10,1),
    'min_child_weight':range(1,6,1)
} #{'max_depth': 7, 'min_child_weight': 1}, 0.8683

#шаг №3
param_test2 = {  
    'gamma': [i / 10.0 for i in range(0, 10)]  
}#{'gamma': 0.8}, 0.8744278387364565

#шаг №4
param_test3 = {  
    'subsample': [i / 10.0 for i in range(6, 20)],  
    'colsample_bytree': [i / 10.0 for i in range(6, 20)]  
}#{'colsample_bytree': 0.6, 'subsample': 0.9}, 0.8755992321990307

#шаг №5
param_test4 = {  
    'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]  
} #{'reg_alpha': 0.1}, 0.8766941061023635

#шаг №6
param_test5 = {  
    'reg_lambda': [0.2, 0.3, 0.4, 0.5, 0.6,0.7, 0.8, 0.9, 1.0, 1.1,1.2,1.3]  
} 

gsearch1 = GridSearchCV(estimator=XGBClassifier(
                                                learning_rate =0.1, 
                                                n_estimators=20,
                                                max_depth=9,
                                                min_child_weight=1,
                                                gamma=0.9, 
                                                subsample=0.6,
                                                colsample_bytree=0.9,
                                                objective= 'binary:logistic', 
                                                nthread=4,
                                                scale_pos_weight=1, 
                                                seed=27,
                                                reg_alpha=0.01,
                                                reg_lambda=1.0
                                            ),
                        param_grid=param_test5,
                        scoring='roc_auc',
                        #iid=False,
                        cv=5)

gsearch1.fit(train_X_scaled, train_y)
pprint(gsearch1.cv_results_)
gsearch1.best_params_, gsearch1.best_score_





{'mean_fit_time': array([0.03968835, 0.03108201, 0.03304772, 0.0347106 , 0.03105974,
       0.04123697, 0.02794967, 0.03563452, 0.02780252, 0.03357401,
       0.0302979 , 0.0298686 ]),
 'mean_score_time': array([0.00325737, 0.00296354, 0.0031168 , 0.00335274, 0.00330024,
       0.004005  , 0.00295744, 0.00320272, 0.00306182, 0.00329518,
       0.00407677, 0.00359025]),
 'mean_test_score': array([0.86546951, 0.86396077, 0.86669762, 0.8656293 , 0.86791136,
       0.86751534, 0.86690812, 0.86624935, 0.86898371, 0.86652645,
       0.86818254, 0.86641267]),
 'param_reg_lambda': masked_array(data=[0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2,
                   1.3],
             mask=[False, False, False, False, False, False, False, False,
                   False, False, False, False],
       fill_value='?',
            dtype=object),
 'params': [{'reg_lambda': 0.2},
            {'reg_lambda': 0.3},
            {'reg_lambda': 0.4},
            {'reg_lambda': 0.5},
            {'re

({'reg_lambda': 1.0}, 0.8689837121122895)

In [292]:
#применим теперь полученные параметры

xgb1 = XGBClassifier(
                        learning_rate =0.1, 
                        n_estimators=100,
                        max_depth=9,
                        min_child_weight=1,
                        gamma=0.9, 
                        subsample=0.6,
                        colsample_bytree=0.9,
                        objective= 'binary:logistic', 
                        nthread=4,
                        scale_pos_weight=1, 
                        seed=27,
                        reg_alpha=0.01,
                        reg_lambda=1.0
                    )  

modelMetrics(xgb1, train_X_scaled, train_y, test_X_scaled, test_y)

n_estimators = 100

Model Report
Accuracy : 0.9143
AUC Score (Train): 0.972958

Model Report
Accuracy : 0.8101
AUC Score (Test): 0.835940


In [293]:
with open(os.getcwd() + "/test.csv") as test_file:
    test_data = pd.read_csv(test_file, delimiter=',')

#reading data structure information

test_data.info(verbose = True, show_counts = True)

#data example

#test_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [294]:
P_ID = test_data['PassengerId']
test_data = test_data.drop(['PassengerId', 'Cabin', 'Ticket', 'Name', 'SibSp', 'Parch', 'Embarked'], axis = 1)

In [295]:
test_data['Sex'] = [0 if x == 'female' else 1 for x in test_data['Sex']]
test_data['Age'] = [test_data['Age'].mean() if np.isnan(x) else x for x in test_data['Age']]
test_data['Fare'] = [test_data['Fare'].mean() if np.isnan(x) else x for x in test_data['Fare']]
test_data['Pclass'] = test_data['Pclass'].astype('category')
test_data_encoded = pd.get_dummies(test_data,drop_first= True)

In [296]:
#scalling features
#scaler = StandardScaler()
#scaler = MinMaxScaler()
#scaler.fit(test_data_encoded)
test_X_scaled = scaler.transform(test_data_encoded)

In [228]:
#i select the number of main components to achieve maximum 'roc_auc_score' metric, it turned out "5"
#pca = PCA(n_components=5)
#pca.fit(test_X_scaled)
test_X_scaled_pca = pca.transform(test_X_scaled)

In [297]:
test_y = xgb1.predict(test_X_scaled)

In [298]:
survived = pd.DataFrame(P_ID)
survived['Survived'] = list(test_y)

In [299]:
path_ = os.getcwd() + "/survived1.csv"
survived.to_csv(path_, sep = ',', index = False)

In [170]:
with open(os.getcwd() + "/survived.csv") as test_file: test_data = pd.read_csv(test_file, delimiter=',')
with open(os.getcwd() + "/survived1.csv") as test_file: test_data1 = pd.read_csv(test_file, delimiter=',')  

In [173]:
test_data['Survived1'] = test_data1['Survived']

In [174]:
test_data

Unnamed: 0,PassengerId,Survived,Survived1
0,892,0,0
1,893,1,0
2,894,0,0
3,895,0,0
4,896,0,0
...,...,...,...
413,1305,0,0
414,1306,1,1
415,1307,0,0
416,1308,0,0


In [95]:
# Tune learning_rate
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# grid search
model = XGBClassifier(use_label_encoder=False)

#n_estimators =[1000] #[int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
#max_depth = [int(x) for x in np.linspace(start = 1, stop = 10, num = 10)]
#learning_rate = [x for x in np.linspace(start = 0.00001, stop = 1, num = 10)]

#param_grid = {'n_estimators': n_estimators,
#               'max_depth': max_depth,
#               'learning_rate' : learning_rate}


param_grid = {  
                'learning_rate' : [0.1],
                'n_estimators' : [1000],
                'max_depth' : [5],
                'min_child_weight' : [1],
                'gamma' : [0],
                'subsample' : [0.8],
                #olsample_bytree=0.8,
                'objective' : ['binary:logistic'],
                'nthread' : [4],
                'scale_pos_weight' : [1],
                'seed' : [27]
             }




kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(train_X_scaled_pca, train_y)
# summarize results
grid_result.best_params_



{'gamma': 0,
 'learning_rate': 0.1,
 'max_depth': 5,
 'min_child_weight': 1,
 'n_estimators': 1000,
 'nthread': 4,
 'objective': 'binary:logistic',
 'scale_pos_weight': 1,
 'seed': 27,
 'subsample': 0.8}

In [96]:
#applying the best parameters to the model
XGB = grid_result.best_estimator_

#fit the model
XGB.fit(train_X_scaled_pca, train_y)

#calcuting 'roc_auc_score' metric
print(XGB.score(train_X_scaled_pca, train_y))
print(XGB.score(test_X_scaled_pca, test_y))

0.9845505617977528
0.7653631284916201
