#### 데이터 수집

In [37]:
import pandas as pd

# Load the train and test splits from the local ./datasets directory.
titanic_train_df = pd.read_csv("./datasets/titanic_train.csv")
titanic_test_df = pd.read_csv("./datasets/titanic_test.csv")
# Last expression of the cell: preview the first rows of the training split.
titanic_train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [38]:
# Preview the test split; unlike the training split it has no Survived column.
titanic_test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


#### Exploratory Data Analysis, 탐색적 데이터 분석

In [39]:
# Show dtypes and non-null counts per column to spot columns with missing values.
titanic_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


#### 데이터 전처리

#### null값 처리

In [40]:
# Mean age of the training split (pandas skips NaN by default); used as the imputation value.
mean = titanic_train_df['Age'].mean()

In [41]:
# Replace missing ages in both splits with the training-set mean, so the
# test split is imputed with a statistic learned from training data only.
for _age_frame in (titanic_train_df, titanic_test_df):
    _age_frame['Age'] = _age_frame['Age'].fillna(mean)

#### 데이터 인코딩

In [42]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping for 'Sex' from the training split only,
# then apply that same fitted mapping to both splits.
le = LabelEncoder().fit(titanic_train_df['Sex'])

train_sex_codes = le.transform(titanic_train_df['Sex'])
test_sex_codes = le.transform(titanic_test_df['Sex'])

titanic_train_df['Sex'] = train_sex_codes
titanic_test_df['Sex'] = test_sex_codes



In [43]:
# Re-inspect dtypes and non-null counts after the Age imputation and Sex encoding.
titanic_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    int32  
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int32(1), int64(5), object(4)
memory usage: 80.2+ KB


### 데이터 스케일링

In [44]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Min-max scale 'Age'. The scaler is fitted on the training split only and the
# fitted min/max are then reused for the test split, so no test statistics
# leak into preprocessing.
scaler = MinMaxScaler()

# scikit-learn transformers expect 2-D input of shape (n_samples, n_features).
train_age = titanic_train_df['Age'].to_numpy().reshape(-1, 1)
test_age = titanic_test_df['Age'].to_numpy().reshape(-1, 1)

# Flatten the (n, 1) result back to 1-D before storing it as a single column.
titanic_train_df['Age'] = scaler.fit_transform(train_age).ravel()
titanic_test_df['Age'] = scaler.transform(test_age).ravel()


In [47]:
# Inspect the Age column after min-max scaling.
titanic_train_df['Age']

0      0.271174
1      0.472229
2      0.321438
3      0.434531
4      0.434531
         ...   
886    0.334004
887    0.233476
888    0.367921
889    0.321438
890    0.396833
Name: Age, Length: 891, dtype: float64

### Feature Selection

In [62]:
# NOTE(review): this file looks like Kaggle's sample submission (a gender-based
# baseline prediction), not ground-truth labels for the test split. If so, any
# score computed against y_test measures agreement with that baseline rather
# than true accuracy -- confirm the provenance of this file.
gender_submission = pd.read_csv('./datasets/titanic_gender_submission.csv')

# Single definition of the model inputs so train and test cannot drift apart.
FEATURE_COLS = ['Pclass', 'Sex', 'Age']

X_train_df = titanic_train_df[FEATURE_COLS]
y_train = titanic_train_df['Survived']

X_test_df = titanic_test_df[FEATURE_COLS]

# y_test is paired with X_test_df purely by row position, so fail loudly if
# the two files do not list the same passengers in the same order.
assert (
    gender_submission['PassengerId'].tolist()
    == titanic_test_df['PassengerId'].tolist()
), "gender_submission rows are not aligned with titanic_test_df by PassengerId"
y_test = gender_submission['Survived']

gender_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


### 데이터 모델링

In [64]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# cross_val_score performs the k-fold validation used in the loop below.
from sklearn.model_selection import cross_val_score

# Candidate classifiers; the tree-based ones are seeded for repeatable results.
dt_clf = DecisionTreeClassifier(random_state=42)
rf_clf = RandomForestClassifier(random_state=42)
lr_clf = LogisticRegression()
algorithm_list = [dt_clf, rf_clf, lr_clf]

# Score every candidate with 3-fold cross-validation on the training split and
# report the per-fold scores followed by their mean.
for clf in algorithm_list:
    scores = cross_val_score(clf, X_train_df, y_train, cv=3)
    mean_score = scores.mean()
    print(
        f'scores는 : {scores}',
        f'평균검증값 : {mean_score}',
        '=========================================',
        sep='\n',
    )

scores는 : [0.77441077 0.81481481 0.81144781]
평균검증값 : 0.8002244668911335
scores는 : [0.76430976 0.82491582 0.81818182]
평균검증값 : 0.8024691358024691
scores는 : [0.78114478 0.80808081 0.77104377]
평균검증값 : 0.7867564534231201


##### GridSearchCV를 활용한 RandomForest 최적 하이퍼파라미터 탐색

In [69]:
from sklearn.model_selection import GridSearchCV

# Fresh, seeded random forest to be tuned.
rf_clf = RandomForestClassifier(random_state=42)

# Search space: 3 depths x 2 split thresholds = 6 candidate settings.
param_dt = dict(
    max_depth=[5, 7, 10],
    min_samples_split=[2, 3],
)

# Each candidate is scored by accuracy under 3-fold cross-validation.
# refit=True retrains the best setting on the full training data, so the
# fitted search object can be used directly for prediction.
gscv_rf = GridSearchCV(
    estimator=rf_clf,
    param_grid=param_dt,
    scoring='accuracy',
    cv=3,
    refit=True,
    n_jobs=1,
    verbose=0,
)

gscv_rf.fit(X_train_df, y_train)



In [70]:
# Hyperparameter combination with the best mean cross-validated accuracy.
gscv_rf.best_params_

{'max_depth': 10, 'min_samples_split': 3}

#### Evaluate

In [71]:
from sklearn.metrics import accuracy_score

# Predict with the refit best estimator found by the grid search.
pred = gscv_rf.predict(X_test_df)

# accuracy_score is documented as (y_true, y_pred). Accuracy itself is
# symmetric, so the value is unchanged, but the correct order keeps this
# cell safe if it is adapted to a non-symmetric metric (precision, recall).
# NOTE(review): y_test comes from the gender_submission file, which appears to
# be a baseline prediction rather than ground truth -- confirm before
# reporting this number as test accuracy.
accuracy_score(y_test, pred)

0.8827751196172249