In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier 
from xgboost import XGBClassifier
from sklearn.metrics import f1_score


In [2]:
def get_tr_set():
    df_tr = pd.read_csv(r'train.csv')
    return df_tr

tr = get_tr_set()
tr.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
def get_t_set():
    df_t = pd.read_csv(r'test.csv')
    return df_t

t = get_t_set()
t.head()


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
tr.nunique()

PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64

In [5]:
tr.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
tr.drop(['Ticket','Embarked','PassengerId', 'Name'], axis = 1, inplace=True)
tr.Sex = tr.Sex.map({'male': 1, 'female':0})
tr['Cabin'] = pd.to_numeric(tr['Cabin'].str.replace('\D', ''), errors='coerce')

tr["Cabin"] = tr["Cabin"].fillna('-1')
tr['Age'] = tr['Age'].fillna(tr['Age'].mean())
tr['NotAlone'] = tr.Parch + tr.SibSp
tr.drop(['Parch', 'SibSp'], axis=1, inplace=True)
cut_l = [-1, tr.Fare.max()/4, tr.Fare.max()/4*2,tr.Fare.max()/4*3,np.inf]
tr['ti_pr'] = pd.cut(tr['Fare'],  bins = cut_l, labels=[1,2,3,4])
tr.drop('Fare', axis=1, inplace=True)
tr_targ = tr['Survived']
tr.drop('Survived', axis=1, inplace=True)


  tr['Cabin'] = pd.to_numeric(tr['Cabin'].str.replace('\D', ''), errors='coerce')


In [7]:
tr

Unnamed: 0,Pclass,Sex,Age,Cabin,NotAlone,ti_pr
0,3,1,22.000000,-1,1,1
1,1,0,38.000000,85.0,1,1
2,3,0,26.000000,-1,0,1
3,1,0,35.000000,123.0,1,1
4,3,1,35.000000,-1,0,1
...,...,...,...,...,...,...
886,2,1,27.000000,-1,0,1
887,1,0,19.000000,42.0,0,1
888,3,0,29.699118,-1,3,1
889,1,1,26.000000,148.0,0,1


In [8]:
tr.isnull().sum()

Pclass      0
Sex         0
Age         0
Cabin       0
NotAlone    0
ti_pr       0
dtype: int64

In [9]:
X, y  = tr, tr_targ == 1
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)


In [10]:
clf = DecisionTreeClassifier(random_state=0)
params_grid = {'max_depth': np.arange(3, 8),
              'max_leaf_nodes':list(range(4,20,4)),
              'max_features':[4,5,6]
               }
grid_clf = GridSearchCV(clf, params_grid)
grid_clf.fit(X_train, y_train)

print('Grid best parameter: ', grid_clf.best_params_)
print('Grid best score: ', grid_clf.best_score_)

Grid best parameter:  {'max_depth': 4, 'max_features': 6, 'max_leaf_nodes': 12}
Grid best score:  0.833879474806419


In [11]:
def normalize(X_train, X_test):
    print ('normalizing.')
    scaler= MinMaxScaler()
    X_train_scaled= scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled 

X_train_scaled, X_test_scaled  = normalize(X_train, X_test)

normalizing.


In [12]:
clf = RandomForestClassifier(n_estimators= 20)       

from sklearn.model_selection import cross_val_score 
f1_train = cross_val_score(clf, X_train_scaled, y_train, cv=5, scoring='f1')
print (f1_train)
            
clf.fit(X_train_scaled, y_train)
y_predict= clf.predict(X_test_scaled)

np.mean(y_predict)

[0.70212766 0.72897196 0.71028037 0.68817204 0.78787879]


0.37668161434977576

In [13]:
def run_GridSearchCV(clf,grid_values, X_train_scaled, X_test_scaled, y_train, y_test= None):


    print ('Running GridSearchCV.')
    grid_clf = GridSearchCV(clf, param_grid=grid_values,scoring='f1')
    grid_clf.fit(X_train_scaled, y_train)
    print('Grid best parameter (max f1 ): ', grid_clf.best_params_) 
    print('Grid best score (f1): ', grid_clf.best_score_) 

    if not y_test is None:
        test_score= grid_clf.score(X_test_scaled, y_test)
        print("test f1= {}".format(test_score))


def run_all_classifiers(X_train_scaled, X_test_scaled, y_train, y_test=None, list_classifiers= None):
    if list_classifiers is None or 'LogisticRegression' in list_classifiers:
        print ('\nLogisticRegression.')
        clf = LogisticRegression(max_iter=10000)
        grid_values = {'C': [0.005, 0.01,0.1, 1, 100, 10000, 100000]}
        run_GridSearchCV(clf,grid_values, X_train_scaled,X_test_scaled,  y_train,  y_test= y_test)

    if list_classifiers is None or 'DecisionTreeClassifier' in list_classifiers:
        print ('\nDecisionTreeClassifier')
        clf = DecisionTreeClassifier()       
        grid_values = {'max_depth': [2,5,7, 20, 50]}
        run_GridSearchCV(clf,grid_values, X_train_scaled, X_test_scaled, y_train,  y_test= y_test)   

    if list_classifiers is None or 'RandomForestClassifier' in list_classifiers:
        print ('\nRandomForestClassifier.')
        clf = RandomForestClassifier()       
        grid_values = {'n_estimators': [20,50,200,300]}
        run_GridSearchCV(clf,grid_values, X_train_scaled,X_test_scaled, y_train,  y_test= y_test)   

    if list_classifiers is None or 'SVC_poly' in list_classifiers:
        print ('\nSVC_poly')
        clf = SVC(kernel='poly')           
        grid_values = {'C': [0.01 , 0.1, 1, 100, ]}
        run_GridSearchCV(clf,grid_values, X_train_scaled, X_test_scaled, y_train,  y_test= y_test)   

    if list_classifiers is None or 'SVC_rbf' in list_classifiers:
        print ('\nSVC_rbf')
        clf = SVC(kernel='rbf')
        grid_values = {'C': [0.005, 0.01, 0.02, 0.03, 0.1, 1, 100, 10000], 'gamma':[0.001, 0.01, 0.1]}
        run_GridSearchCV(clf,grid_values, X_train_scaled, X_test_scaled, y_train,  y_test= y_test)   

    if list_classifiers is None or 'NB' in list_classifiers:
        print ('\nNB')
        clf =  GaussianNB().fit(X_train_scaled, y_train)
        train_f1 = f1_score(y_train, clf.predict(X_train_scaled))
        print("train set f1= {}".format(train_f1))
        if not y_test is None:
            test_f1 = f1_score(y_test, clf.predict(X_test_scaled))
            print("train set f1= {}".format(test_f1))

            
    if list_classifiers is None or 'GradientBoostingClassifier' in list_classifiers:
        print ('\nGradientBoostingClassifier.')
        clf = GradientBoostingClassifier()       
        grid_values = {'max_depth': [3,5,7]}
        run_GridSearchCV(clf,grid_values, X_train_scaled,X_test_scaled, y_train,  y_test= y_test)   

   
    if list_classifiers is None or 'xgboost' in list_classifiers:
        print ('\nxgboost.')
        clf = XGBClassifier().fit(X_train_scaled, y_train)
        y_predicted = clf.predict(X_test_scaled)
        print ('f1_score  = {:.2}'.format(f1_score(y_test, y_predicted)))
        



In [20]:
list_classifiers= [
    'LogisticRegression',
    'DecisionTreeClassifier',
    'RandomForestClassifier',
    'NB',
    'GradientBoostingClassifier', 
    # 'xgboost', 
    
    
   
]
run_all_classifiers(X_train_scaled, X_test_scaled, y_train, y_test, list_classifiers= list_classifiers)


LogisticRegression.
Running GridSearchCV.
Grid best parameter (max f1 ):  {'C': 1}
Grid best score (f1):  0.7254904313206164
test f1= 0.7073170731707318

DecisionTreeClassifier
Running GridSearchCV.
Grid best parameter (max f1 ):  {'max_depth': 5}
Grid best score (f1):  0.7413745877675059
test f1= 0.7435897435897436

RandomForestClassifier.
Running GridSearchCV.
Grid best parameter (max f1 ):  {'n_estimators': 20}
Grid best score (f1):  0.7238425711488627
test f1= 0.7073170731707318

NB
train set f1= 0.7226890756302521
train set f1= 0.7204968944099378

GradientBoostingClassifier.
Running GridSearchCV.
Grid best parameter (max f1 ):  {'max_depth': 3}
Grid best score (f1):  0.757977968333494
test f1= 0.7730061349693252


In [21]:
clf = RandomForestClassifier(n_estimators= 20)       

from sklearn.model_selection import cross_val_score
f1_train = cross_val_score(clf, X_train_scaled, y_train, cv=5, scoring='f1')
print (f1_train)
            
clf.fit(X_train_scaled, y_train)
y_predict= clf.predict(X_test_scaled)

np.mean(y_predict)

[0.7        0.73584906 0.70588235 0.72340426 0.75510204]


0.3811659192825112

In [22]:
clf = DecisionTreeClassifier(random_state=0, max_depth = 4,max_leaf_nodes = 12,max_features= 6).fit(X_train, y_train)
clf.score(X_test,y_test)

0.820627802690583