In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
import tensorflow as tf
warnings.filterwarnings("ignore")

In [2]:
train_data = pd.read_csv('train.csv')
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
test_data =pd.read_csv('test.csv')
test_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [4]:
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
train_data.drop(['PassengerId', 'Cabin', 'Ticket', 'Fare', 'Name'], axis=1, inplace=True)

In [6]:
train_data['Age'].unique()

array([22.  , 38.  , 26.  , 35.  ,   nan, 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  ,
       49.  , 29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  ,
       16.  , 25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  ,
       71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 ,
       51.  , 55.5 , 40.5 , 44.  ,  1.  , 61.  , 56.  , 50.  , 36.  ,
       45.5 , 20.5 , 62.  , 41.  , 52.  , 63.  , 23.5 ,  0.92, 43.  ,
       60.  , 10.  , 64.  , 13.  , 48.  ,  0.75, 53.  , 57.  , 80.  ,
       70.  , 24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  ])

In [7]:
def age_group (x):
    if (x < 18):
        return 1
    elif (x < 30):
        return 2
    elif (x < 50):
        return 3
    elif (x < 90):
        return 4

In [8]:
train_data['Age'] = train_data['Age'].apply(age_group)
train_data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,male,2.0,1,0,S
1,1,1,female,3.0,1,0,C
2,1,3,female,2.0,0,0,S
3,1,1,female,3.0,1,0,S
4,0,3,male,3.0,0,0,S
...,...,...,...,...,...,...,...
886,0,2,male,2.0,0,0,S
887,1,1,female,2.0,0,0,S
888,0,3,female,,1,2,S
889,1,1,male,2.0,0,0,C


In [9]:
train_data['Age'].fillna(round(train_data['Age'].mean()), inplace=True)

In [10]:
def fe_male (x):
    if (x == 'female'):
        return 0
    else:
        return 1

In [11]:
train_data['Sex'] = train_data['Sex'].apply(fe_male)
train_data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,1,2.0,1,0,S
1,1,1,0,3.0,1,0,C
2,1,3,0,2.0,0,0,S
3,1,1,0,3.0,1,0,S
4,0,3,1,3.0,0,0,S
...,...,...,...,...,...,...,...
886,0,2,1,2.0,0,0,S
887,1,1,0,2.0,0,0,S
888,0,3,0,2.0,1,2,S
889,1,1,1,2.0,0,0,C


In [12]:
label_encoder = LabelEncoder()
train_data['Embarked']= label_encoder.fit_transform(train_data['Embarked'])
train_data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,1,2.0,1,0,2
1,1,1,0,3.0,1,0,0
2,1,3,0,2.0,0,0,2
3,1,1,0,3.0,1,0,2
4,0,3,1,3.0,0,0,2
...,...,...,...,...,...,...,...
886,0,2,1,2.0,0,0,2
887,1,1,0,2.0,0,0,2
888,0,3,0,2.0,1,2,2
889,1,1,1,2.0,0,0,0


In [13]:
train_data.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked'], dtype='object')

In [14]:
minmax = MinMaxScaler()
stand = StandardScaler()

data_minmax = train_data.copy()
data_standard = train_data.copy()

temp1 = data_minmax.iloc[:, 1:]

temp2 = data_standard.iloc[:, 1:]

temp1 = minmax.fit_transform(temp1)
temp2 = stand.fit_transform(temp2)

In [15]:
temp1 = pd.DataFrame(temp1)
temp1

Unnamed: 0,0,1,2,3,4,5
0,1.0,1.0,0.333333,0.125,0.000000,0.666667
1,0.0,0.0,0.666667,0.125,0.000000,0.000000
2,1.0,0.0,0.333333,0.000,0.000000,0.666667
3,0.0,0.0,0.666667,0.125,0.000000,0.666667
4,1.0,1.0,0.666667,0.000,0.000000,0.666667
...,...,...,...,...,...,...
886,0.5,1.0,0.333333,0.000,0.000000,0.666667
887,0.0,0.0,0.333333,0.000,0.000000,0.666667
888,1.0,0.0,0.333333,0.125,0.333333,0.666667
889,0.0,1.0,0.333333,0.000,0.000000,0.000000


In [16]:
temp2 = pd.DataFrame(temp2)

In [17]:
data_minmax.iloc[:, 1:] = temp1
data_minmax

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,1.0,1.0,0.333333,0.125,0.000000,0.666667
1,1,0.0,0.0,0.666667,0.125,0.000000,0.000000
2,1,1.0,0.0,0.333333,0.000,0.000000,0.666667
3,1,0.0,0.0,0.666667,0.125,0.000000,0.666667
4,0,1.0,1.0,0.666667,0.000,0.000000,0.666667
...,...,...,...,...,...,...,...
886,0,0.5,1.0,0.333333,0.000,0.000000,0.666667
887,1,0.0,0.0,0.333333,0.000,0.000000,0.666667
888,0,1.0,0.0,0.333333,0.125,0.333333,0.666667
889,1,0.0,1.0,0.333333,0.000,0.000000,0.000000


In [18]:
data_standard.iloc[:, 1:] = temp2
data_standard

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,0.827377,0.737695,-0.408350,0.432793,-0.473674,0.581114
1,1,-1.566107,-1.355574,0.841958,0.432793,-0.473674,-1.938460
2,1,0.827377,-1.355574,-0.408350,-0.474545,-0.473674,0.581114
3,1,-1.566107,-1.355574,0.841958,0.432793,-0.473674,0.581114
4,0,0.827377,0.737695,0.841958,-0.474545,-0.473674,0.581114
...,...,...,...,...,...,...,...
886,0,-0.369365,0.737695,-0.408350,-0.474545,-0.473674,0.581114
887,1,-1.566107,-1.355574,-0.408350,-0.474545,-0.473674,0.581114
888,0,0.827377,-1.355574,-0.408350,0.432793,2.008933,0.581114
889,1,-1.566107,0.737695,-0.408350,-0.474545,-0.473674,-1.938460


In [19]:
minmax_train_x, minmax_test_x, minmax_train_y, minmax_test_y = train_test_split(
    data_minmax.iloc[:, 1:], data_minmax['Survived'], test_size = 0.25, random_state = 42)

standard_train_x, standard_test_x, standard_train_y, standard_test_y = train_test_split(
    data_standard.iloc[:, 1:], data_standard['Survived'], test_size = 0.25, random_state = 42)

In [20]:
#using logistic regression

logistic = LogisticRegression(penalty='l1', solver='saga')
log_model1 = logistic.fit(minmax_train_x, minmax_train_y)

In [21]:
log_predict = log_model1.predict(minmax_test_x)

In [22]:
roc_auc_score(log_predict, minmax_test_y)

0.7616582491582491

In [23]:
log_predict = log_model1.predict(minmax_train_x)
print(roc_auc_score(log_predict, minmax_train_y))

0.7840698727795501


In [24]:
log_model2 = logistic.fit(standard_train_x, standard_train_y)

In [25]:
log_predict = log_model2.predict(standard_train_x)
print(roc_auc_score(log_predict, standard_train_y))

0.7870178908567833


In [26]:
log_predict = log_model2.predict(standard_test_x)
print(roc_auc_score(log_predict, standard_test_y))

0.7903157358682736


In [27]:
sgd = SGDClassifier(max_iter=1000, tol=1e-3)
sgd_model = sgd.fit(minmax_train_x, minmax_train_y)

In [28]:
sgd_predict = sgd_model.predict(minmax_test_x)
print(roc_auc_score(sgd_predict, minmax_test_y))

0.7601291364003229


In [29]:
model_params = {
    'SVM': {
        'model': SVC(gamma='auto'),
        'params' : {
                'C': [0.1,0.8,0.9,1,1.1,1.2,1.3,1.4],
                'kernel':['linear', 'rbf' ,'poly'],
                'gamma' :[0.1,0.8,0.9,1,1.1,1.2,1.3,1.4]
        }  
    },
    'Random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,5,10,20,80,100,140,160,180,200]
        }
    },
    'Logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [0.1,0.5,0.8,1,5]
        }
    },
    'Naive_bayes_gaussian': {
        'model': GaussianNB(),
        'params': {}
    },
    'SGD_classifier': {
        'model': SGDClassifier(),
        'params': {}
    },
    'KNN_classifier':{
        'model': KNeighborsClassifier(),
        'params' : {}
    },
    'Decision_tree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini','entropy'],
            
        }
    }     
}


In [30]:
scores = []

for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=10, return_train_score=False)
    clf.fit(standard_train_x,standard_train_y)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
dfg = pd.DataFrame(scores,columns=['model','best_score','best_params'])
dfg

Unnamed: 0,model,best_score,best_params
0,SVM,0.821913,"{'C': 1.4, 'gamma': 0.1, 'kernel': 'rbf'}"
1,Random_forest,0.803957,{'n_estimators': 5}
2,Logistic_regression,0.791995,{'C': 0.5}
3,Naive_bayes_gaussian,0.779941,{}
4,SGD_classifier,0.739711,{}
5,KNN_classifier,0.799525,{}
6,Decision_tree,0.799502,{'criterion': 'entropy'}


In [31]:
svm = SVC(C = 1.4,
                kernel = "rbf" ,
                gamma = 0.1)
svm.fit(standard_train_x,standard_train_y)

In [32]:
svm_predict = svm.predict(standard_test_x)
print(roc_auc_score(svm_predict, minmax_test_y))

0.8112350804359107


In [44]:
test_data = pd.read_csv('test.csv')
test_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [45]:
ID = test_data['PassengerId']
test_data.drop(['PassengerId', 'Cabin', 'Ticket', 'Fare', 'Name'], axis=1, inplace=True)

In [35]:
test_data['Age'] = test_data['Age'].apply(age_group)
test_data['Age'].fillna(round(test_data['Age'].mean()), inplace=True)
test_data['Sex'] = test_data['Sex'].apply(fe_male)
label_encoder = LabelEncoder()
test_data['Embarked']= label_encoder.fit_transform(test_data['Embarked'])
test_data

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
0,3,1,3.0,0,0,1
1,3,0,3.0,1,0,2
2,2,1,4.0,0,0,1
3,3,1,2.0,0,0,2
4,3,0,2.0,1,1,2
...,...,...,...,...,...,...
413,3,1,2.0,0,0,2
414,1,0,3.0,0,0,0
415,3,1,3.0,0,0,2
416,3,1,2.0,0,0,2


In [36]:
test_data = stand.fit_transform(test_data)
test_data = pd.DataFrame(test_data, columns=['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked'])
test_data

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0.873482,0.755929,0.857639,-0.499470,-0.400248,-0.470915
1,0.873482,-1.322876,0.857639,0.616992,-0.400248,0.700767
2,-0.315819,0.755929,2.156528,-0.499470,-0.400248,-0.470915
3,0.873482,0.755929,-0.441249,-0.499470,-0.400248,0.700767
4,0.873482,-1.322876,-0.441249,0.616992,0.619896,0.700767
...,...,...,...,...,...,...
413,0.873482,0.755929,-0.441249,-0.499470,-0.400248,0.700767
414,-1.505120,-1.322876,0.857639,-0.499470,-0.400248,-1.642598
415,0.873482,0.755929,0.857639,-0.499470,-0.400248,0.700767
416,0.873482,0.755929,-0.441249,-0.499470,-0.400248,0.700767


In [37]:
test_data.isna().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Embarked    0
dtype: int64

In [38]:
svm_final_predict = svm.predict(test_data)
svm_final_predict

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [39]:
result = pd.read_csv('gender_submission.csv')
result

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [40]:
print(roc_auc_score(svm_final_predict, result['Survived']))

0.940029596744358


In [48]:
final_sub = pd.concat([pd.Series(ID), pd.Series(svm_final_predict)], axis = 1)
final_sub

Unnamed: 0,PassengerId,0
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [49]:
final_sub.to_csv('final_sub.csv')

The End