In [26]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier

In [3]:
data = pd.read_csv("datasets/titanic.csv",decimal=",")
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,home.dest
0,3,0,"Abbing, Mr. Anthony",male,42.0,0,0,C.A. 5547,7.55,,S,
1,3,0,"Abbott, Master. Eugene Joseph",male,13.0,0,2,C.A. 2673,20.25,,S,"East Providence, RI"
2,3,0,"Abbott, Mr. Rossmore Edward",male,16.0,1,1,C.A. 2673,20.25,,S,"East Providence, RI"
3,3,1,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35.0,1,1,C.A. 2673,20.25,,S,"East Providence, RI"
4,3,1,"Abelseth, Miss. Karen Marie",female,16.0,0,0,348125,7.65,,S,"Norway Los Angeles, CA"


In [4]:
data[['female', 'male']] = pd.get_dummies(data.sex)

In [5]:
data.cabin.unique()

array([nan, 'F G63', 'B5', 'C22 C26', 'E12', 'D7', 'A36', 'C101',
       'C62 C64', 'B35', 'D', 'A23', 'B58 B60', 'D15', 'C6', 'F4', 'D35',
       'D56', 'C148', 'C97', 'B49', 'C99', 'C52', 'T', 'A31', 'C7',
       'C103', 'D22', 'E33', 'A21', 'B10', 'F33', 'B4', 'E40', 'B38',
       'E24', 'B51 B53 B55', 'B96 B98', 'C46', 'E31', 'E8', 'B61', 'B77',
       'A9', 'C89', 'E58', 'E49', 'E52', 'E45', 'B22', 'B26', 'C85',
       'E17', 'B71', 'B20', 'A34', 'C86', 'A16', 'A20', 'A18', 'C54',
       'C45', 'D20', 'A29', 'E25', 'C111', 'C23 C25 C27', 'E36', 'D40',
       'B41', 'B39', 'C123', 'E63', 'C130', 'B86', 'C92', 'A5', 'C51',
       'B42', 'C91', 'C125', 'D10 D12', 'B82 B84', 'E50', 'D33', 'C83',
       'B94', 'D49', 'B69', 'B11', 'C39', 'B18', 'D11', 'C93', 'B28',
       'C49', 'B52 B54 B56', 'E60', 'C132', 'B37', 'D21', 'D19', 'F E57',
       'D17', 'B101', 'D28', 'D6', 'D9', 'B80', 'E77', 'C106', 'B79',
       'D30', 'C90', 'E46', 'E38', 'C78', 'F G73', 'C30', 'E121', 'C118',
      

In [6]:
data['dummy_cabin'] = data['cabin']
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,home.dest,female,male,dummy_cabin
0,3,0,"Abbing, Mr. Anthony",male,42.0,0,0,C.A. 5547,7.55,,S,,0,1,
1,3,0,"Abbott, Master. Eugene Joseph",male,13.0,0,2,C.A. 2673,20.25,,S,"East Providence, RI",0,1,
2,3,0,"Abbott, Mr. Rossmore Edward",male,16.0,1,1,C.A. 2673,20.25,,S,"East Providence, RI",0,1,
3,3,1,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35.0,1,1,C.A. 2673,20.25,,S,"East Providence, RI",1,0,
4,3,1,"Abelseth, Miss. Karen Marie",female,16.0,0,0,348125,7.65,,S,"Norway Los Angeles, CA",1,0,


In [7]:
type(data.loc[1]['cabin'])

float

In [8]:
data.dummy_cabin = data.dummy_cabin.fillna(0)
data.cabin = data.cabin.fillna(0)

In [9]:
data.dummy_cabin[data.dummy_cabin != 0] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [10]:
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,home.dest,female,male,dummy_cabin
0,3,0,"Abbing, Mr. Anthony",male,42.0,0,0,C.A. 5547,7.55,0,S,,0,1,0
1,3,0,"Abbott, Master. Eugene Joseph",male,13.0,0,2,C.A. 2673,20.25,0,S,"East Providence, RI",0,1,0
2,3,0,"Abbott, Mr. Rossmore Edward",male,16.0,1,1,C.A. 2673,20.25,0,S,"East Providence, RI",0,1,0
3,3,1,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35.0,1,1,C.A. 2673,20.25,0,S,"East Providence, RI",1,0,0
4,3,1,"Abelseth, Miss. Karen Marie",female,16.0,0,0,348125,7.65,0,S,"Norway Los Angeles, CA",1,0,0


In [11]:
data.fare.unique()

array([  7.55  ,  20.25  ,   7.65  ,  24.    ,   7.925 ,   7.2292,
         7.25  ,   8.05  ,   9.475 ,   9.35  ,  18.7875,  13.    ,
         7.8875,   7.05  , 211.3375, 151.55  ,   8.3   ,   7.8542,
        22.525 ,  26.55  ,  31.275 ,   7.775 ,   7.7958,  11.5   ,
        10.5   ,  77.9583,   0.    ,   7.8958,  26.    ,  51.4792,
        17.8   ,  49.5042,  31.3875,   7.225 , 227.525 ,  14.4583,
        69.3   ,  15.85  ,  19.2583,  14.4542,  78.85  ,  30.    ,
         7.8792,  12.525 , 247.5208,  76.2917,  75.2417,  39.    ,
        52.5542,   4.0125,  56.4958, 221.7792,  91.0792, 135.6333,
         7.75  ,  35.5   ,  31.    , 164.8667,  15.2458,  15.5   ,
       262.375 ,  16.1   ,  55.    ,   7.725 ,  30.5   ,  50.4958,
         7.0458,  27.7208,   7.2833,   7.8208,   6.75  , 134.5   ,
         8.6625,  26.2875,  29.    ,  21.    ,  27.4458, 512.3292,
         5.    ,  47.1   , 120.    ,  61.175 ,  53.1   ,  13.5   ,
         7.7333,  86.5   ,  29.7   , 136.7792,   7.4958,  25.5

In [12]:
data['floor'] = data['cabin'].astype(str).str[0]

In [13]:
data.floor.unique()

array(['0', 'F', 'B', 'C', 'E', 'D', 'A', 'T', 'G'], dtype=object)

In [14]:
data[['0', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T']] = pd.get_dummies(data.floor)

In [15]:
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,...,floor,0,A,B,C,D,E,F,G,T
0,3,0,"Abbing, Mr. Anthony",male,42.0,0,0,C.A. 5547,7.55,0,...,0,1,0,0,0,0,0,0,0,0
1,3,0,"Abbott, Master. Eugene Joseph",male,13.0,0,2,C.A. 2673,20.25,0,...,0,1,0,0,0,0,0,0,0,0
2,3,0,"Abbott, Mr. Rossmore Edward",male,16.0,1,1,C.A. 2673,20.25,0,...,0,1,0,0,0,0,0,0,0,0
3,3,1,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35.0,1,1,C.A. 2673,20.25,0,...,0,1,0,0,0,0,0,0,0,0
4,3,1,"Abelseth, Miss. Karen Marie",female,16.0,0,0,348125,7.65,0,...,0,1,0,0,0,0,0,0,0,0


In [16]:
data[['C', 'Q', 'S']] = pd.get_dummies(data.embarked)

In [17]:
pd.get_dummies(data['home.dest'])

Unnamed: 0,"?Havana, Cuba","Aberdeen / Portland, OR","Albany, NY","Altdorf, Switzerland","Amenia, ND","Antwerp, Belgium / Stanton, OH",Argentina,"Asarum, Sweden Brooklyn, NY","Ascot, Berkshire / Rochester, NY","Auburn, NY",...,"Wimbledon Park, London / Hayling Island, Hants","Windsor, England New York, NY","Winnipeg, MB","Winnipeg, MN","Woodford County, KY","Worcester, England","Worcester, MA","Yoevil, England / Cottage Grove, OR","Youngstown, OH","Zurich, Switzerland"
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1041,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1042,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1043,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1044,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
data = data.drop(['sex', 'ticket', 'name', 'cabin', 'home.dest'], axis=1)
data.head()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,embarked,female,male,dummy_cabin,...,A,B,C,D,E,F,G,T,Q,S
0,3,0,42.0,0,0,7.55,S,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,3,0,13.0,0,2,20.25,S,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,3,0,16.0,1,1,20.25,S,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,3,1,35.0,1,1,20.25,S,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,3,1,16.0,0,0,7.65,S,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [19]:
data.isna().sum()

pclass         0
survived       0
age            0
sibsp          0
parch          0
fare           1
embarked       2
female         0
male           0
dummy_cabin    0
floor          0
0              0
A              0
B              0
C              0
D              0
E              0
F              0
G              0
T              0
Q              0
S              0
dtype: int64

In [20]:
data = data.dropna()

In [21]:
data_X = data[['pclass', 'age', 'sibsp', 'parch', 'fare', 
               'female', 'male', 'C', 'Q', 'S', 'dummy_cabin', 
               '0','A','B','C','D','E','F','G','T']]

data_y = data.survived

In [22]:
data_X.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,female,male,C,Q,S,dummy_cabin,0,A,B,C.1,D,E,F,G,T
0,3,42.0,0,0,7.55,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0
1,3,13.0,0,2,20.25,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0
2,3,16.0,1,1,20.25,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0
3,3,35.0,1,1,20.25,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0
4,3,16.0,0,0,7.65,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0


In [23]:
data.isna().sum()

pclass         0
survived       0
age            0
sibsp          0
parch          0
fare           0
embarked       0
female         0
male           0
dummy_cabin    0
floor          0
0              0
A              0
B              0
C              0
D              0
E              0
F              0
G              0
T              0
Q              0
S              0
dtype: int64

In [27]:
X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, 
                                                    test_size=0.2, 
                                                    random_state=123)

# zdefiniuj listę pipelineów: jeden pipe to jeden model (regresja, drzewo, bayes)
# zdefiniuj listę parametrów dla każdego pipeline
modele = [
    (Pipeline([("scaler", StandardScaler()), ("regresja", LogisticRegression(solver="liblinear"))]),
    {"regresja__penalty": ["l1", "l2"],
    "regresja__C": 10.0**np.arange(-2, 2, 1)}),
    
    (Pipeline([("drzewo", DecisionTreeClassifier())]),
    {"drzewo__criterion": ["gini", "entropy"],
    "drzewo__max_depth": [2, 5, 10, 15, None]}),
    
    (Pipeline([("bayes", MultinomialNB())]),
    {"bayes__alpha": [0.3, 0.5, 0.8],
    "bayes__fit_prior": [True, False]}),
    
     (Pipeline([("scaler", StandardScaler()), ("svc", SVC())]),
     [{"svc__kernel":["poly"], "svc__degree": [2,3,4,5, 6, 7]},
     {"svc__kernel": ["rbf"], "svc__gamma": [0.1,0.2, 0.15, 1, 2, 4, 10]},
     {"svc__kernel": ["sigmoid", "linear"]}]),
    
    (Pipeline([("scaler", StandardScaler()), ("bagging", BaggingClassifier())]),
     {"bagging__base_estimator": [LogisticRegression(solver="liblinear"), DecisionTreeClassifier()],
     "bagging__n_estimators": [1,10],
     "bagging__max_features": [0.2, 0.8]}
    ),
    
    (Pipeline([("randomForest", RandomForestClassifier())]),
     {"randomForest__n_estimators": [1,10,100]}
    )
    
    ]


# dla każdej pary pipeline-parametry:
#     grid search cv
#     zapamiętaj najlepszy model
najlepsze = []
for model, parametry in modele:
    print(model.steps[-1][0])
    gs = GridSearchCV(model, parametry, cv=5, verbose=1, n_jobs=10)
    # tu w fit() jest ukryty podział na treningową i walidacyjną, to tu wstępnie się mierzy jakość predykcji naniewidzianych danych
    # ale ponieważ używamy tych walidacyjnych do dobrania parametrów, to tak naprawdę one też biorą udział w procesie uczenia i są "skażone"
    gs.fit(X_train, y_train)
    najlepsze.append((gs.best_estimator_, gs.best_params_)) 

# dla każdego najlepszego modelu:
#    uruchom na teście, zobacz wyniki
for model, parametry in najlepsze:
    print(model.steps[-1][0])
    print(parametry)
    print(accuracy_score(model.predict(X_test),y_test))

regresja
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  40 out of  40 | elapsed:    1.9s finished


drzewo
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    1.5s
[Parallel(n_jobs=10)]: Done  50 out of  50 | elapsed:    1.7s finished


bayes
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 out of  30 | elapsed:    1.7s finished


svc
Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    2.0s
[Parallel(n_jobs=10)]: Done  75 out of  75 | elapsed:    2.3s finished


bagging
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  40 out of  40 | elapsed:    2.0s finished


randomForest
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  12 out of  15 | elapsed:    1.9s remaining:    0.5s
[Parallel(n_jobs=10)]: Done  15 out of  15 | elapsed:    2.1s finished


regresja
{'regresja__C': 0.01, 'regresja__penalty': 'l1'}
0.7751196172248804
drzewo
{'drzewo__criterion': 'entropy', 'drzewo__max_depth': 5}
0.8133971291866029
bayes
{'bayes__alpha': 0.3, 'bayes__fit_prior': False}
0.6985645933014354
svc
{'svc__gamma': 0.15, 'svc__kernel': 'rbf'}
0.8421052631578947
bagging
{'bagging__base_estimator': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False), 'bagging__max_features': 0.8, 'bagging__n_estimators': 10}
0.784688995215311
randomForest
{'randomForest__n_estimators': 100}
0.7990430622009569
