### scikit-learn은 파이썬으로 구현된 기계학습 라이브러리
* 분류(Classification)
Algorithms: SVM, nearest neighbors, random forest 등
* 회귀(Regression)
Algorithms: SVR, nearest neighbors, random forest 등
* 군집(Clustering)
Algorithms: k-Means, spectral clustering, mean-shift 등
* 차원 축소(Dimensionality Reduction)
Algorithms: PCA, feature selection, non-negative matrix factorization 등
* 모델 선택(Model selection)
Algorithms: grid search, cross validation, metrics 등
* 전처리(Preprocessing)
Algorithms: preprocessing, feature extraction 등

### 훈련용/시험용 데이터 분리: train_test_split

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the Kaggle Titanic training CSV directly from GitHub (needs network access).
train = pd.read_csv(
    "https://raw.githubusercontent.com/developer-sdk/kaggle-python-beginner/master/datas/kaggle-titanic/train.csv"
)


In [2]:
# Preview the first rows of the loaded DataFrame.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Feature matrix: every column except the 'Survived' label, as a NumPy array.
data = train.drop(columns="Survived").to_numpy()

In [5]:
# Label vector: the 'Survived' column as a NumPy array.
target = train["Survived"].to_numpy()

In [6]:
# Split the features/labels into a training subset and a test subset.
# Parameters
#    test_size: fraction of the samples placed in the test split (0.3 here)
#    stratify: labels used for stratified sampling, so that each split keeps
#              roughly the same class proportions as `target`
#    random_state: seed for the shuffling, fixed so the split is reproducible
# Return values (in this order)
#   x_train: features of the training split
#   x_test: features of the test split
#   y_train: labels (Survived) of the training split
#   y_test: labels (Survived) of the test split
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size = 0.3, stratify=target, random_state=2018)

In [None]:
## Alternatively, the whole data set can simply be shuffled like this
# import pandas as pd
# train = pd.read_csv("https://raw.githubusercontent.com/developer-sdk/kaggle-python-beginner/master/datas/kaggle-titanic/train.csv")
# test = pd.read_csv("https://raw.githubusercontent.com/developer-sdk/kaggle-python-beginner/master/datas/kaggle-titanic/test.csv")

# from sklearn.utils import shuffle

# # shuffle the train data
# train = shuffle(train)

# # shuffle the train and test data in one call
# # NOTE(review): sklearn.utils.shuffle applies one shared permutation and
# # requires every input to have the same number of rows; train.csv and
# # test.csv presumably differ in length -- confirm, and shuffle them
# # separately if so.
# train, test = shuffle(train, test, random_state=5)

### Ensemble -> RandomForest

In [43]:
import pandas as pd

# Reload the Titanic training CSV from GitHub (needs network access).
train = pd.read_csv(
    "https://raw.githubusercontent.com/developer-sdk/kaggle-python-beginner/master/datas/kaggle-titanic/train.csv"
)

# Keep only the label column and the five columns used as model inputs.
train = train[
    ["Survived", "Pclass", "Sex", "Age", "Fare", "Embarked"]
]

In [44]:
# Preview the first rows after the column selection.
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,0,3,male,22.0,7.25,S
1,1,1,female,38.0,71.2833,C
2,1,3,female,26.0,7.925,S
3,1,1,female,35.0,53.1,S
4,0,3,male,35.0,8.05,S


In [45]:
# ---------------------------------------------------------------
# Data preprocessing
# ---------------------------------------------------------------
# Replace the 'Sex' strings with integer codes. pd.factorize returns
# (codes, uniques); only the codes are kept.
train["Sex"] = pd.factorize(train["Sex"].astype("category"))[0]

In [46]:
# Replace the 'Embarked' strings with integer codes (codes only, uniques dropped).
# Any missing value is encoded as -1, factorize's default NA sentinel.
train["Embarked"] = pd.factorize(train["Embarked"].astype("category"))[0]

In [47]:
# Fill missing ages with the previous row's age (a deliberately rough imputation).
# The result is assigned back rather than calling ffill(inplace=True) on
# `train.Age`: that form mutates an intermediate Series (chained assignment),
# which pandas warns about and which leaves `train` unchanged under
# Copy-on-Write.
train["Age"] = train["Age"].ffill()

In [48]:
from sklearn import metrics 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score

In [49]:
# Features: every column except the 'Survived' label.
data = train.drop(columns="Survived").to_numpy()
# Labels: the 'Survived' column.
target = train["Survived"].to_numpy()

# Hold out 30% of the rows as a validation set, stratified on the label so
# that both splits keep similar class proportions; the seed fixes the split.
x_train, x_valid, y_train, y_valid = train_test_split(
    data,
    target,
    test_size=0.3,
    stratify=target,
    random_state=34,
)

In [50]:
# Build a 100-tree random forest with a fixed seed.
rf = RandomForestClassifier(n_estimators=100, random_state=7654321)
# Fit on the training split; kept as the cell's last expression so the
# notebook displays the fitted estimator.
rf.fit(x_train, y_train)

RandomForestClassifier(random_state=7654321)

In [51]:
# Predict labels for the held-out validation features.
prediction = rf.predict(x_valid)

In [52]:
# Fraction of validation samples whose predicted label matches the true label.
# accuracy_score's documented signature is (y_true, y_pred); accuracy itself is
# symmetric, but keeping the documented order avoids wrong results if this line
# is later adapted to a metric that is not.
accuracy = accuracy_score(y_valid, prediction)

In [53]:
# Display the validation accuracy computed above.
accuracy

0.7649253731343284

In [54]:
# Report the number of validation samples and the accuracy as a percentage.
length = len(y_valid)
print(f'총 {length}명 중 {accuracy * 100:.3f}% 정확도로 생존을 맞춤')

총 268명 중 76.493% 정확도로 생존을 맞춤


n_estimators=100, random_state=123456, accuracy = 75.746%
n_estimators=100, random_state=7654321, accuracy = 76.493%