# XgBoost 분석 

### 1. XgBoost 정의 

<p><img src="http://www.kdnuggets.com/wp-content/uploads/xgboost-tianqi-chen.jpg" alt="출처"></p>
##### 출처 : https://machinelearningmastery.com/gentle-introduction-xgboost-applied-machine-learning/

>**XGBoost 의 의미 : eXtreme Gradient Boosting**

>**Gradient boosting의 일종**

>**주요기능 : Execution Speed 증가 그리고 Model Performance 향상**

>**두 가지 종류 : XGBClassifier and XGBRegressor classes in the XGBoost Python scikit-learn API**

---

### 2. XgBoost 의 작동방식 

- Gradient Boosting algorithm : also called gradient boosting machine including the learning rate.
- Stochastic Gradient Boosting : sub-sampling at the row, column and column per split levels.
- Regularized Gradient Boosting : both L1 and L2 regularization.

---

### 3. XgBoost 적용 

- XGBoost library는 gradient boosting decision tree 알고리즘에 적용 가능
- Regression or Classification 에 모두 적용 가능



## XgBoost 예시코드 

참고링크 :

- 공식홈페이지 : http://xgboost.readthedocs.io/en/latest/python/python_intro.html
- parameter tuning : https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

In [None]:
# First XGBoost model for Pima Indians dataset
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
# Read the Pima Indians diabetes CSV into a single numeric array.
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# Columns 0-7 are used as the input features, column 8 as the target.
X = dataset[:, :8]
Y = dataset[:, 8]
# Hold out 33% of the rows for evaluation, using a fixed seed.
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, random_state=seed, test_size=test_size
)
# Fit a classifier with default hyper-parameters on the training split.
model = XGBClassifier()
model.fit(X_train, y_train)
# Predict the held-out rows and round every prediction to an integer.
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))
# Report the accuracy on the held-out rows as a percentage.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

#### 실제 적용해본 간단한 예제코드 

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Two template rows: an ascending and a descending run of 0..9.
X = list(range(10))
Y = [9 - j for j in range(10)]
# 100 samples alternating between the two templates; even rows (ascending)
# get label 1 and odd rows (descending) get label 0.
data = np.array([X if r % 2 == 0 else Y for r in range(100)], np.float32)
label = np.array([1 if r % 2 == 0 else 0 for r in range(100)], np.float32)

seed = 7
test_size = 0.33
# Split the sample matrix and its labels. Splitting the template rows X and Y
# themselves would hand the model 1-D lists and leave data/label unused.
x_train, x_test, y_train, y_test = train_test_split(
    data, label, test_size=test_size, random_state=seed
)

# Fit a classifier with default hyper-parameters on the training split.
model = XGBClassifier()
model.fit(x_train, y_train)

# Predict the held-out rows and round every prediction to an integer.
y_pred = model.predict(x_test)
predictions = [round(value) for value in y_pred]
# Report the accuracy on the held-out rows as a percentage.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))




# Influenza dataset을 활용한 실전 분석 

### 1. 필요한 파일 라이브러리 import 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn import utils 
import xgboost
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn import utils 
import numpy as np
import operator as op 
import pandas as pd
import os
import datetime

In [16]:
import google.datalab.bigquery as bq
import numpy as np
import operator as op 
import pandas as pd
import os
import apache_beam as beam
import datetime
# import tensorflow as tf 


### 2. BigQuery로 전처리가 완료된 dataset을 query로 불러오기

#### xgboost는 logic level(트리의 분기 조건)로 분석하기 때문에, 사칙연산을 위한 one-hot vector로 변환하지 않고 integer array dataset을 바로 적용함

In [17]:
# SQL text that reads genome.influenza_sequence and keeps only the rows whose
# host_species is one of the eleven values listed in the WHERE clause.
# It returns two string columns:
#   label - host_species with each of the eleven names replaced by a numeric
#           code "1" .. "11" (nested REPLACE calls).
#   train - sequence right-padded with "P" to 2429 characters, then every
#           character other than A/T/G/C rewritten to ",0" and A, T, G, C
#           rewritten to ",1", ",2", ",3", ",4". Each position therefore
#           becomes a comma-prefixed digit, so label + train is one CSV line.
# NOTE(review): BigQuery RPAD also truncates values longer than the target
# length - confirm that cutting sequences above 2429 characters is intended.
# NOTE(review): the leading "#standard" line is only a SQL comment here;
# presumably "#standardSQL" was meant - confirm.
query = """
#standard
select 
replace(replace(replace(replace(replace(replace(replace(replace(replace(replace(replace(
host_species,
"IRD:Chicken/Avian", "1"),
"IRD:Duck/Avian", "2"),
"IRD:Mallard/Avian", "3"),
"IRD:Environment", "4"),
"IRD:Goose/Avian", "5"),
"IRD:Muscovy Duck/Avian", "6"),
"IRD:Human", "7"),
"IRD:Turkey/Avian", "8" ),
"IRD:Openbill Stork/Avian", "9"),
"IRD:Baikal Teal/Avian", "10"),
"IRD:Swine", "11") as label,
replace(replace(replace(replace(regexp_replace(
   rpad(sequence, 2429, "P"),
   "[^ATGC]", ",0"),
   "A", ",1"),
   "T", ",2"),
   "G", ",3"),
   "C", ",4") as train
from genome.influenza_sequence
where host_species = "IRD:Chicken/Avian" 
or host_species = "IRD:Duck/Avian" 
or host_species = "IRD:Mallard/Avian"
or host_species = "IRD:Environment"
or host_species = "IRD:Goose/Avian" 
or host_species = "IRD:Muscovy Duck/Avian" 
or host_species = "IRD:Human" 
or host_species = "IRD:Turkey/Avian" 
or host_species = "IRD:Openbill Stork/Avian" 
or host_species = "IRD:Baikal Teal/Avian" 
or host_species = "IRD:Swine" 
"""

# df = bq.Query(query).execute().result().to_dataframe()

In [18]:

# Fixed settings for the preprocessing pipeline.
OUTPUT_DIR = './preprocessed_dataset/'
PROJECT = 'tensorflowprojects'
RUNNER = 'DirectRunner'

# Job name: a fixed prefix followed by the current local time.
job_name = '-'.join([
    'preprocess-babyweight-features',
    datetime.datetime.now().strftime('%y%m%d-%H%M%S'),
])

# Keyword options handed to PipelineOptions; the staging and temp locations
# both live under OUTPUT_DIR/tmp.
options = dict(
    staging_location=os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
    temp_location=os.path.join(OUTPUT_DIR, 'tmp'),
    job_name=job_name,
    project=PROJECT,
    teardown_policy='TEARDOWN_ALWAYS',
    no_save_main_session=True,
)

# Build the pipeline object that the later cells attach transforms to.
opts = beam.pipeline.PipelineOptions(flags=[], **options)
p = beam.Pipeline(RUNNER, options=opts)

In [19]:
def to_csv(rowdict):
    """Concatenate a row's label and its encoded sequence into one text line.

    Args:
        rowdict: Mapping with a 'label' entry (converted with ``str``) and a
            'train' entry, which is appended unchanged and therefore has to
            be a string already.

    Returns:
        ``str(rowdict['label'])`` immediately followed by ``rowdict['train']``.
    """
    label_text = str(rowdict['label'])
    sequence_text = rowdict['train']
    return label_text + sequence_text

In [20]:
# Header line for the output CSV files: "species" followed by one column name
# per sequence position (seq_0 .. seq_2428).
# The names are joined with a bare comma, like the data rows. A ", " separator
# would make pandas read every sequence column name with a leading space.
cols = ",".join(["species"] + ["seq_" + str(i) for i in range(2429)])
# Read the query result, turn each row into one CSV line with to_csv, and
# write the lines as .csv text files named 10_species* under OUTPUT_DIR.
(p |'read_data' >> beam.io.Read(beam.io.BigQuerySource(query=query, use_standard_sql=True))
 | 'preprocessing' >> beam.Map(to_csv)
 | 'proc_data' >> beam.io.Write(beam.io.WriteToText(os.path.join(OUTPUT_DIR, '10_species'), file_name_suffix='.csv', header=cols)))

<PCollection[proc_data/WriteToText/Write/WriteImpl/finalize_write.None] at 0x7f8d59962a90>

In [21]:
# Start the pipeline and block until it has finished, so the CSV output is
# completely written before the following cells read it from disk.
job = p.run()
job.wait_until_finish()

#### csv 파일로 저장된 데이터셋 불러와서 사용가능한 label, train 데이터셋으로 만들기 
* Train(input) : Virus RNA Sequence , Label : host_species (11종, swine 까지)
* 데이터 셔플링 및 training dataset : 10020개, test dataset : 4936개 

In [22]:

# Collect only the CSV files in the output directory. os.listdir returns every
# entry, including sub-directories such as the pipeline's tmp folder, which
# pd.read_csv cannot read. Its order is also arbitrary, so the names are
# sorted to make the concatenated row order reproducible.
path = "./preprocessed_dataset/"
filenames = [
    os.path.join(path, filename)
    for filename in sorted(os.listdir(path))
    if filename.endswith(".csv")
]

# Read every file and stack the frames row-wise.
data = [pd.read_csv(f) for f in filenames]
whole_data = pd.concat(data, axis=0)

# The "species" column becomes the label vector; all other columns become the
# feature matrix.
label = np.array(whole_data["species"])
train = np.array(whole_data.drop("species", axis=1))



In [None]:

# Shuffle features and labels together with a fixed seed.
train, label = utils.shuffle(train, label, random_state=0)

# Hold out 33% of the shuffled rows as the test set, again with a fixed seed.
seed = 7
test_size = 0.33
train_data, test_data, train_label, test_label = train_test_split(
    train, label, random_state=seed, test_size=test_size
)

# Print the shape of each of the four resulting arrays.
for caption, split in (
    ("Training_data :", train_data),
    ("Training_label :", train_label),
    ("Test_data :", test_data),
    ("Test_label :", test_label),
):
    print(caption, split.shape)


### 3. xgboost 모델 

#### xgboost 모델을 적용하여 '분류' 정확도 도출 : DNA sequence 에 따라 host_species 11종 분류 정확도 (accuracy) 

In [None]:

# Fit a classifier with default hyper-parameters on the training split.
model = XGBClassifier()
model.fit(train_data, train_label)

# Predict the test split and round every prediction to an integer.
y_pred = model.predict(test_data)
predictions = list(map(round, y_pred))
# Compare the predictions with the true test labels and print the accuracy
# as a percentage.
accuracy = accuracy_score(test_label, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))



## 결과출력 

Training_data : (10020, 2429) <p>
Training_label : (10020,) <p>
Test_data : (4936, 2429) <P>
Test_label : (4936,)<p>
Accuracy: 53.81%