In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.optimize as sp
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter
import xgboost as xgb
%pylab inline

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn import linear_model, model_selection, metrics, tree, ensemble 


Populating the interactive namespace from numpy and matplotlib


# Reading and exploring data.

In [2]:
data = pd.read_csv('../input/titanic/train.csv', index_col='PassengerId')
data.head(n=10)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [3]:
dataT = pd.read_csv('../input/titanic/test.csv', index_col='PassengerId')
dataT.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Name      418 non-null    object 
 2   Sex       418 non-null    object 
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Ticket    418 non-null    object 
 7   Fare      417 non-null    float64
 8   Cabin     91 non-null     object 
 9   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 35.9+ KB


## First of all, we need to delete the outfitting data, but I will add this in the next version
## Then, we'll transform all the data 
Replace the unnecessary ones or delete them, and then splite the data and answers into x and y.


### Fill the missing values in the Age column (there are too many empty values in it to simply delete them. So fill them in with median for all age values)

In [4]:
data.Age.fillna(float(data.Age.median(axis=0)), inplace = True)
dataT.Age.fillna(float(dataT.Age.median(axis=0)), inplace = True)

### Fill the missing values in train dataset's Embarked column with most common values

In [5]:
data.Embarked.fillna(data.Embarked.mode()[0], inplace = True)

### Fill the missing values in test dataset's Fare column with the median for all values

In [6]:
dataT.Fare.fillna(float(dataT.Fare.median(axis=0)), inplace = True)
dataT.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Name      418 non-null    object 
 2   Sex       418 non-null    object 
 3   Age       418 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Ticket    418 non-null    object 
 7   Fare      418 non-null    float64
 8   Cabin     91 non-null     object 
 9   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 35.9+ KB


### Will use only informative features and transform qualitative features into quantitative

In [7]:
#Leave features of sex, family size, age and service class
features = ["Pclass", "Sex", "SibSp", "Parch", "Age", "Fare", "Embarked"]
X_train = pd.get_dummies(data[features])
X_test = pd.get_dummies(dataT[features])

y_train = data["Survived"]

print((X_train))
print((X_test))

             Pclass  SibSp  Parch   Age     Fare  Sex_female  Sex_male  \
PassengerId                                                              
1                 3      1      0  22.0   7.2500           0         1   
2                 1      1      0  38.0  71.2833           1         0   
3                 3      0      0  26.0   7.9250           1         0   
4                 1      1      0  35.0  53.1000           1         0   
5                 3      0      0  35.0   8.0500           0         1   
...             ...    ...    ...   ...      ...         ...       ...   
887               2      0      0  27.0  13.0000           0         1   
888               1      0      0  19.0  30.0000           1         0   
889               3      1      2  28.0  23.4500           1         0   
890               1      0      0  26.0  30.0000           0         1   
891               3      0      0  32.0   7.7500           0         1   

             Embarked_C  Embarked_Q  

# Modeling

### LogisticRegression

In [8]:
param_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}

estimator = linear_model.LogisticRegression(solver='liblinear', penalty = 'l2', random_state = 1)
optimizerLR = GridSearchCV(estimator, param_grid,  cv=3)                    
optimizerLR.fit(X_train, y_train)

print('score_train_opt', optimizerLR.best_score_)
print('param_opt', optimizerLR.best_params_)


score_train_opt 0.7923681257014591
param_opt {'C': 0.1}


### RidgeClassifier 

In [9]:
param_grid = {'alpha': [0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10]}

estimator = linear_model.RidgeClassifier( random_state = 1)
optimizerR = GridSearchCV(estimator, param_grid,  cv=3)                    
optimizerR.fit(X_train, y_train)

print('score_train_opt', optimizerR.best_score_)
print('param_opt', optimizerR.best_params_)

score_train_opt 0.7901234567901234
param_opt {'alpha': 10}


### DecisionTree

In [10]:
param_grid = {'max_depth': list(range(1, 10)), 'min_samples_leaf': list(range(2, 10))}#, 'class_weightdict': ["balanced", "None"]}

estimator = tree.DecisionTreeClassifier(random_state = 1)
optimizerDT = GridSearchCV(estimator, param_grid,  cv=3)                    
optimizerDT.fit(X_train, y_train)

print('score_train_opt', optimizerDT.best_score_)
print('param_opt', optimizerDT.best_params_)


score_train_opt 0.809203142536476
param_opt {'max_depth': 3, 'min_samples_leaf': 2}


### Модель RandomForestClassifier

In [11]:
param_grid = {'max_depth': list(range(1, 10)), 'n_estimators': list(range(10, 100, 5)), 'min_weight_fraction_leaf': [0.001,  0.005, 0.01, 0.03, 0.05, 0.07, 0.1, 0.3, 0.5] } 

estimator = ensemble.RandomForestClassifier( random_state = 1)
optimizerRF = GridSearchCV(estimator, param_grid,  cv=3)                    
optimizerRF.fit(X_train, y_train)

print('score_train_opt', optimizerRF.best_score_)
print('param_opt', optimizerRF.best_params_)


score_train_opt 0.830527497194164
param_opt {'max_depth': 8, 'min_weight_fraction_leaf': 0.01, 'n_estimators': 10}


#### Стох.градиентный спуск

In [12]:
"""classifier = linear_model.SGDClassifier(random_state = 0, max_iter=1000)
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
metrics.accuracy_score(y_test, predictions)"""

'classifier = linear_model.SGDClassifier(random_state = 0, max_iter=1000)\nclassifier.fit(X_train, y_train)\npredictions = classifier.predict(X_test)\nmetrics.accuracy_score(y_test, predictions)'

### Модель GradientBoosting

In [13]:
param_grid = {'max_depth': list(range(1, 10)), 'learning_rate': [0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10], 'n_estimators': list(range(10, 100, 5)) }
estimator = xgb.XGBClassifier( random_state = 1, min_child_weight=3)
optimizerXGB = GridSearchCV(estimator, param_grid,  cv=3)                    
optimizerXGB.fit(X_train, y_train)

print('score_train_opt', optimizerXGB.best_score_)
print('param_opt', optimizerXGB.best_params_)


score_train_opt 0.8361391694725029
param_opt {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 60}


In [14]:
y = pd.read_csv('../input/titanic/gender_submission.csv', index_col='PassengerId')
print(y.info)
del y['Survived']
y['Survived']=optimizerXGB.predict(X_test)
print(y)

<bound method DataFrame.info of              Survived
PassengerId          
892                 0
893                 1
894                 0
895                 0
896                 1
...               ...
1305                0
1306                1
1307                0
1308                0
1309                0

[418 rows x 1 columns]>
             Survived
PassengerId          
892                 0
893                 0
894                 0
895                 0
896                 0
...               ...
1305                0
1306                1
1307                0
1308                0
1309                0

[418 rows x 1 columns]


In [15]:
y.to_csv("/kaggle/working/Titanic_answer.csv")