In [165]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# Read the training and test CSVs from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('adjtrain2.csv', 'adjtest2.csv'))

In [166]:
# Print the column names, non-null counts and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    int64  
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     891 non-null    object 
 12  Title        891 non-null    object 
 13  Cabin_Ex     891 non-null    int64  
 14  Family       891 non-null    int64  
 15  Age_Band     891 non-null    object 
dtypes: float64(2), int64(8), object(6)
memory usage: 111.5+ KB


#### Feature Choice

1. Sex, Cabin_Ex 이대로 사용
2. Title, Pclass dummy 를 통한 one-hot encoding
3. Age 범주화 > dummy
4. SibSp, Parch 합쳐서 Family 로 사용하려 했으나 일단 보류 

In [167]:
# Columns that are not used as model inputs; remove them from both frames.
features_drop = ['SibSp', 'Parch', 'Ticket', 'Cabin', 'Family', 'Name']
train, test = (frame.drop(columns=features_drop) for frame in (train, test))

In [168]:
# Hold both frames in one list so the following cells can apply the same
# column edits to each of them inside a single loop.
train_and_test = [train, test]

In [169]:
from sklearn.preprocessing import MinMaxScaler

# Learn the Age/Fare min-max range from the training frame only and apply that
# single mapping to both frames. Fitting a separate scaler per frame would
# scale train and test with different parameters, so the same raw Age or Fare
# would become a different feature value depending on which frame it is in.
scaler = MinMaxScaler()
scaler.fit(train[['Age','Fare']])

for dataset in train_and_test:
    # Test values outside the range seen in training may fall slightly
    # outside [0, 1]; that is expected with a train-fitted scaler.
    sub_set = scaler.transform(dataset[['Age','Fare']])
    dataset['Age'] = sub_set[:,0]
    dataset['Fare'] = sub_set[:,1]


In [170]:
# Preview the first rows of the training frame after scaling Age and Fare.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Cabin_Ex,Age_Band
0,1,0,3,0,0.271174,0.006369,S,Mr,0,O
1,2,1,1,1,0.472229,0.13234,C,Baby&male,1,O
2,3,1,3,1,0.321438,0.007697,S,Baby&male,0,O
3,4,1,1,1,0.434531,0.096569,S,Baby&male,1,O
4,5,0,3,0,0.434531,0.007943,S,Mr,0,O


In [171]:
# Recode the numeric passenger class as letters so that get_dummies treats it
# as a categorical column instead of a number.
pclass_labels = {1: 'A', 2: 'B', 3: 'C'}
for dataset in train_and_test:
    dataset['Pclass'] = dataset['Pclass'].replace(pclass_labels)

In [172]:
# One-hot encode the remaining object columns of the training frame, dropping
# the first level of each as the baseline.
train = pd.get_dummies(train, drop_first=True)

# Encode the test frame with every level kept, then project it onto the
# training feature columns. This keeps the column set, column order and
# dropped baseline identical to train even if a category level is missing
# from (or only present in) the test frame; absent levels are filled with 0.
test = pd.get_dummies(test).reindex(columns=train.columns.drop('Survived'), fill_value=0)

In [173]:
# Preview the first rows of the training frame after one-hot encoding.
train.head()

Unnamed: 0,PassengerId,Survived,Sex,Age,Fare,Cabin_Ex,Pclass_B,Pclass_C,Embarked_Q,Embarked_S,Title_Mr,Title_Others,Age_Band_O,Age_Band_T,Age_Band_VO
0,1,0,0,0.271174,0.006369,0,0,1,0,1,1,0,1,0,0
1,2,1,1,0.472229,0.13234,1,0,0,0,0,0,0,1,0,0
2,3,1,1,0.321438,0.007697,0,0,1,0,1,0,0,1,0,0
3,4,1,1,0.434531,0.096569,1,0,0,0,1,0,0,1,0,0
4,5,0,0,0.434531,0.007943,0,0,1,0,1,1,0,1,0,0


In [174]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle


In [175]:
# Separate the target from the model inputs; PassengerId is an identifier,
# so it is removed from the inputs of both frames.
features_drop = ['PassengerId', 'Survived']
train_label = train['Survived']
train_features = train.drop(columns=features_drop)
test_features = test.drop(columns='PassengerId')

# Sanity check: train and test inputs must have the same number of columns.
print(train_features.shape, train_label.shape, test_features.shape)

(891, 13) (891,) (418, 13)


In [178]:
def ModelTest(model):
    """Fit ``model`` on the training data, report CV accuracy, predict the test set.

    Reads the notebook-level ``train_features``, ``train_label`` and
    ``test_features``.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict`` that ``cross_val_score`` accepts.

    Returns
    -------
    The result of ``model.predict(test_features)`` after fitting on the full
    training data.

    Side effects
    ------------
    Prints the mean 5-fold cross-validation score on the training data as a
    percentage rounded to four decimals.
    """
    model.fit(train_features, train_label)
    test_predictions = model.predict(test_features)

    fold_scores = cross_val_score(model, train_features, train_label, cv=5)
    accuracy_pct = round(fold_scores.mean() * 100, 4)
    print ("Accuracy : ", accuracy_pct, "%")
    return test_predictions


In [179]:
# Import the estimator under an alias: the name `SVC` is rebound to the
# prediction array below, so calling `SVC()` directly would raise a TypeError
# the second time this cell is run.
from sklearn.svm import SVC as SVCClassifier

svc_prediction = ModelTest(SVCClassifier())
# The submission cell reads the predictions from the name `SVC`; keep that binding.
SVC = svc_prediction

Accuracy :  82.0419 %


In [180]:
# Assemble the submission table: one predicted label per test PassengerId.
submission_columns = {"PassengerId": test["PassengerId"], "Survived": SVC}
submission = pd.DataFrame(submission_columns)

# NOTE(review): the output path is absolute and machine-specific; it only
# exists on the original author's computer.
submission.to_csv(path_or_buf='C:/Users/JH/Desktop/SVC01.csv', index=False)

In [181]:
# Pairwise correlation matrix over all columns of the encoded training frame.
train.corr()

Unnamed: 0,PassengerId,Survived,Sex,Age,Fare,Cabin_Ex,Pclass_B,Pclass_C,Embarked_Q,Embarked_S,Title_Mr,Title_Others,Age_Band_O,Age_Band_T,Age_Band_VO
PassengerId,1.0,-0.005007,-0.042939,0.042675,0.021055,0.019919,-8.6e-05,-0.029486,-0.033606,0.022204,0.03885,0.070231,0.040606,-0.021727,-0.021118
Survived,-0.005007,1.0,0.543351,-0.089115,0.247268,0.316912,0.093349,-0.322308,0.00365,-0.149683,-0.549199,0.013988,-0.095132,0.023499,-0.032587
Sex,-0.042939,0.543351,1.0,-0.117218,0.172352,0.140391,0.064746,-0.137143,0.074115,-0.119224,-0.867334,-0.044146,-0.057375,0.078411,-0.057593
Age,0.042675,-0.089115,-0.117218,1.0,0.09661,0.235442,0.005975,-0.291092,-0.047616,-0.006356,0.231281,0.172515,0.299038,-0.345688,0.500953
Fare,0.021055,0.247268,0.172352,0.09661,1.0,0.488305,-0.118555,-0.425884,-0.12139,-0.153853,-0.17611,0.03657,-0.020677,-0.009139,0.06408
Cabin_Ex,0.019919,0.316912,0.140391,0.235442,0.488305,1.0,-0.172413,-0.539291,-0.129572,-0.101139,-0.137319,0.111841,-0.018965,-0.062331,0.14773
Pclass_B,-8.6e-05,0.093349,0.064746,0.005975,-0.118555,-0.172413,1.0,-0.56521,-0.127301,0.18998,-0.088569,0.043336,0.022055,-0.049012,-0.026643
Pclass_C,-0.029486,-0.322308,-0.137143,-0.291092,-0.425884,-0.539291,-0.56521,1.0,0.237449,-0.015104,0.155907,-0.192083,-0.032203,0.100201,-0.145592
Embarked_Q,-0.033606,0.00365,0.074115,-0.047616,-0.12139,-0.129572,-0.127301,0.237449,1.0,-0.499421,-0.078338,-0.00586,0.042304,-0.010733,-0.018017
Embarked_S,0.022204,-0.149683,-0.119224,-0.006356,-0.153853,-0.101139,0.18998,-0.015104,-0.499421,1.0,0.11287,-0.057508,0.011784,-0.027372,-0.012324
