#**스마트폰 센서 데이터 기반 모션 분류**
# 단계2 : 기본 모델링


## 0.미션

* 데이터 전처리
    * 가변수화, 데이터 분할, NaN 확인 및 조치, 스케일링 등 필요한 전처리 수행
* 다양한 알고리즘으로 분류 모델 생성
    * 최소 4개 이상의 알고리즘을 적용하여 모델링 수행 
    * 성능 비교
    * 각 모델의 성능을 저장하는 별도 데이터 프레임을 만들고 비교
* 옵션 : 다음 사항은 선택사항입니다. 시간이 허용하는 범위 내에서 수행하세요.
    * 상위 N개 변수를 선정하여 모델링 및 성능 비교
        * 모델링에 항상 모든 변수가 필요한 것은 아닙니다.
        * 변수 중요도 상위 N개를 선정하여 모델링하고 타 모델과 성능을 비교하세요.
        * 상위 N개를 선택하는 방법은, 변수를 하나씩 늘려가며 모델링 및 성능 검증을 수행하여 적절한 지점을 찾는 것입니다.

## 1.환경설정

### (1) 라이브러리 불러오기

* 세부 요구사항
    - 기본적으로 필요한 라이브러리를 import 하도록 코드가 작성되어 있습니다.
    - 필요하다고 판단되는 라이브러리를 추가하세요.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 필요하다고 판단되는 라이브러리를 추가하세요.




* 함수 생성

In [None]:
# Rank features by importance and optionally plot the ranking
def plot_feature_importance(importance, names, result_only = False, topn = 'all'):
    """Rank features by importance and optionally draw a horizontal bar chart.

    Parameters
    ----------
    importance : array-like
        Importance score of each feature.
    names : array-like
        Feature names, aligned one-to-one with ``importance``.
    result_only : bool, default False
        When truthy, skip plotting and only return the ranking.
    topn : int or 'all', default 'all'
        Number of top-ranked features to keep; 'all' keeps every feature.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature_name`` and ``feature_importance``, sorted by
        importance in descending order with a fresh 0..n-1 index. The frame
        is always an independent copy, so callers may modify it freely.
    """
    fi_temp = pd.DataFrame({
        'feature_name': np.array(names),
        'feature_importance': np.array(importance),
    })

    # Sort by importance, most important first, and renumber the rows
    fi_temp = (
        fi_temp
        .sort_values(by=['feature_importance'], ascending=False)
        .reset_index(drop=True)
    )

    if topn == 'all':
        fi_df = fi_temp.copy()
    else:
        # Copy the slice so both branches hand back an independent frame
        fi_df = fi_temp.iloc[:topn].copy()

    # Draw the ranking as a horizontal bar chart
    if not result_only:
        # Scale the height with the number of bars; capped at the original
        # 20 inches so the full-feature plot looks the same as before.
        fig_height = min(20, max(4, 0.3 * len(fi_df)))
        plt.figure(figsize=(10, fig_height))
        sns.barplot(x='feature_importance', y='feature_name', data = fi_df)

        plt.xlabel('importance')
        plt.ylabel('feature name')
        plt.grid()

    return fi_df

### (2) 데이터 불러오기

* 주어진 데이터셋
    * data01_train.csv : 학습 및 검증용
* 세부 요구사항
    - 전체 데이터 'data01_train.csv' 를 불러와 'data' 이름으로 저장합니다.
        - data에서 변수 subject는 삭제합니다.
    - 데이터프레임에 대한 기본 정보를 확인합니다.( .head(), .shape 등)

#### 1) 데이터 로딩

In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read.
# This cell relies on the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the training/validation dataset from the mounted Drive into `data`.
data = pd.read_csv('/content/drive/MyDrive/KT_aivle/미니프로젝트_5차/data01_train.csv')

In [None]:
# Remove the `subject` column as required, then preview the first rows.
data = data.drop(columns=['subject'])
data.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",Activity
0,0.288508,-0.009196,-0.103362,-0.988986,-0.962797,-0.967422,-0.989,-0.962596,-0.96565,-0.929747,...,-0.487737,-0.816696,-0.042494,-0.044218,0.307873,0.07279,-0.60112,0.331298,0.165163,STANDING
1,0.265757,-0.016576,-0.098163,-0.989551,-0.994636,-0.987435,-0.990189,-0.99387,-0.987558,-0.937337,...,-0.23782,-0.693515,-0.062899,0.388459,-0.765014,0.771524,0.345205,-0.769186,-0.147944,LAYING
2,0.278709,-0.014511,-0.108717,-0.99772,-0.981088,-0.994008,-0.997934,-0.982187,-0.995017,-0.942584,...,-0.535287,-0.829311,0.000265,-0.525022,-0.891875,0.021528,-0.833564,0.202434,-0.032755,STANDING
3,0.289795,-0.035536,-0.150354,-0.231727,-0.006412,-0.338117,-0.273557,0.014245,-0.347916,0.008288,...,-0.004012,-0.408956,-0.255125,0.612804,0.747381,-0.072944,-0.695819,0.287154,0.111388,WALKING
4,0.394807,0.034098,0.091229,0.088489,-0.106636,-0.388502,-0.010469,-0.10968,-0.346372,0.584131,...,-0.157832,-0.563437,-0.044344,-0.845268,-0.97465,-0.887846,-0.705029,0.264952,0.137758,WALKING_DOWNSTAIRS


In [None]:
# Summarise the index range, column count, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5881 entries, 0 to 5880
Columns: 562 entries, tBodyAcc-mean()-X to Activity
dtypes: float64(561), object(1)
memory usage: 25.2+ MB


In [None]:
# (rows, columns) of the loaded frame.
data.shape

(5881, 562)

#### 2) 기본 정보 조회

In [None]:
# Descriptive statistics (count, mean, std, quartiles, min/max) per numeric column.
data.describe()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-meanFreq(),fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)"
count,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,...,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0,5881.0
mean,0.274811,-0.017799,-0.109396,-0.603138,-0.509815,-0.604058,-0.628151,-0.525944,-0.605374,-0.46549,...,0.126955,-0.305883,-0.623548,0.008524,-0.001185,0.00934,-0.007099,-0.491501,0.059299,-0.054594
std,0.067614,0.039422,0.058373,0.448807,0.501815,0.417319,0.424345,0.485115,0.413043,0.544995,...,0.249176,0.322808,0.310371,0.33973,0.447197,0.60819,0.476738,0.509069,0.29734,0.278479
min,-0.503823,-0.684893,-1.0,-1.0,-0.999844,-0.999667,-1.0,-0.999419,-1.0,-1.0,...,-0.965725,-0.979261,-0.999765,-0.97658,-1.0,-1.0,-1.0,-1.0,-1.0,-0.980143
25%,0.262919,-0.024877,-0.121051,-0.992774,-0.97768,-0.980127,-0.993602,-0.977865,-0.980112,-0.936067,...,-0.02161,-0.541969,-0.845985,-0.122361,-0.294369,-0.481718,-0.373345,-0.811397,-0.018203,-0.141555
50%,0.277154,-0.017221,-0.108781,-0.943933,-0.844575,-0.856352,-0.948501,-0.849266,-0.849896,-0.878729,...,0.133887,-0.342923,-0.712677,0.010278,0.005146,0.011448,-0.000847,-0.709441,0.182893,0.003951
75%,0.288526,-0.01092,-0.098163,-0.24213,-0.034499,-0.26269,-0.291138,-0.068857,-0.268539,-0.01369,...,0.288944,-0.127371,-0.501158,0.154985,0.28503,0.499857,0.356236,-0.51133,0.248435,0.111932
max,1.0,1.0,1.0,1.0,0.916238,1.0,1.0,0.967664,1.0,1.0,...,0.9467,0.989538,0.956845,1.0,1.0,0.998702,0.996078,0.977344,0.478157,1.0


In [None]:
# Total number of missing cells across the whole frame (0 means no NaN handling is needed).
data.isnull().sum().sum()

0

## **2. 데이터 전처리**

* 가변수화, 데이터 분할, NaN 확인 및 조치, 스케일링 등 필요한 전처리를 수행한다. 


### (1) 데이터 분할1 : x, y

* 세부 요구사항
    - x, y로 분할합니다.

In [None]:
# Separate the target label (`Activity`) from the predictor columns.
y = data['Activity']
x = data.drop(columns=['Activity'])

### (2) 스케일링(필요시)


* 세부 요구사항
    - 스케일링을 필요로 하는 알고리즘 사용을 위해서 코드 수행
    - min-max 방식 혹은 standard 방식 중 한가지 사용.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Hold out 25% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(x, y, random_state=1, test_size=0.25)

# Learn the min-max range from the training rows only, then apply it to both
# sets so no information from the validation rows leaks into the scaler.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

### (3) 데이터분할2 : train, validation

* 세부 요구사항
    - train : val = 8 : 2 혹은 7 : 3
    - random_state 옵션을 사용하여 다른 모델과 비교를 위해 성능이 재현되도록 합니다.

## **3. 기본 모델링**



* 세부 요구사항
    - 최소 4개 이상의 알고리즘을 적용하여 모델링을 수행한다. 
    - 각 알고리즘별로 전체 변수로 모델링, 상위 N개 변수를 선택하여 모델링을 수행하고 성능 비교를 한다.
    - (옵션) 알고리즘 중 1~2개에 대해서, 변수 중요도 상위 N개를 선정하여 모델링하고 타 모델과 성능을 비교.
        * 상위 N개를 선택하는 방법은, 변수를 하나씩 늘려가며 모델링 및 성능 검증을 수행하여 적절한 지점을 찾는 것이다.

### (1) 알고리즘1 : CatBoost

In [None]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp39-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [None]:
from catboost import CatBoostClassifier
# Wildcard import kept on purpose: later cells call classification_report and
# accuracy_score without importing them again.
from sklearn.metrics import *

# Shallow (depth 2) multi-class gradient-boosted trees, 1000 boosting rounds.
cb = CatBoostClassifier(
    loss_function='MultiClass',
    depth=2,
    learning_rate=0.1,
    iterations=1000,
)
cb.fit(X_train, y_train)

# Score on the held-out validation rows.
y_pred = cb.predict(X_val)
print(classification_report(y_val, y_pred))

0:	learn: 1.5994609	total: 161ms	remaining: 2m 41s
1:	learn: 1.4683431	total: 271ms	remaining: 2m 15s
2:	learn: 1.3668817	total: 395ms	remaining: 2m 11s
3:	learn: 1.2841373	total: 518ms	remaining: 2m 8s
4:	learn: 1.2045864	total: 639ms	remaining: 2m 7s
5:	learn: 1.1287567	total: 741ms	remaining: 2m 2s
6:	learn: 1.0734390	total: 852ms	remaining: 2m
7:	learn: 1.0164410	total: 955ms	remaining: 1m 58s
8:	learn: 0.9723860	total: 1.06s	remaining: 1m 57s
9:	learn: 0.9339909	total: 1.16s	remaining: 1m 54s
10:	learn: 0.8959890	total: 1.25s	remaining: 1m 52s
11:	learn: 0.8649788	total: 1.34s	remaining: 1m 50s
12:	learn: 0.8293389	total: 1.45s	remaining: 1m 50s
13:	learn: 0.8050868	total: 1.56s	remaining: 1m 49s
14:	learn: 0.7717795	total: 1.68s	remaining: 1m 50s
15:	learn: 0.7371342	total: 1.79s	remaining: 1m 50s
16:	learn: 0.7108162	total: 1.9s	remaining: 1m 49s
17:	learn: 0.6886187	total: 2s	remaining: 1m 49s
18:	learn: 0.6633815	total: 2.11s	remaining: 1m 49s
19:	learn: 0.6418941	total: 2.22s

In [None]:
# Overall validation accuracy of the predictions currently held in y_pred.
print(accuracy_score(y_val, y_pred))

0.9782460910944936


### (2) 알고리즘2 : PyCaret (여러 모델 비교 및 LightGBM 튜닝)

In [None]:
!pip install pycaret

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycaret
  Downloading pycaret-3.0.0-py3-none-any.whl (481 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.8/481.8 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyod>=1.0.8
  Downloading pyod-1.0.9.tar.gz (149 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.0/150.0 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting scikit-plot>=0.3.7
  Downloading scikit_plot-0.3.7-py3-none-any.whl (33 kB)
Collecting tbats>=1.1.0
  Downloading tbats-1.1.2-py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting pmdarima!=1.8.1,<3.0.0,>=1.8.0
  Downloading pmdarima-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (1.9 MB)
[2K     [90m━━

In [None]:
# Split the full frame (target column still included) with the same ratio and
# seed as before; `train` is later handed to pycaret's setup() as one DataFrame.
train, val = train_test_split(data, test_size=0.25, random_state=1)

In [None]:
# pop() removes `Activity` from `val` in place and returns it, so after this
# line `val` holds only the feature columns. X_val is the same object as val.
y_val = val.pop('Activity')
X_val = val

In [None]:
# NOTE(review): this wildcard import adds many names to the notebook namespace
# and may shadow earlier ones — confirm nothing needed later is overwritten.
from pycaret.classification import *

# Create and configure the pycaret experiment on the training frame,
# with `Activity` as the target column.
clf = setup(data = train, target = 'Activity', session_id = 123, use_gpu=True)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Activity
2,Target type,Multiclass
3,Target mapping,"LAYING: 0, SITTING: 1, STANDING: 2, WALKING: 3, WALKING_DOWNSTAIRS: 4, WALKING_UPSTAIRS: 5"
4,Original data shape,"(4410, 562)"
5,Transformed data shape,"(4410, 562)"
6,Transformed train set shape,"(3087, 562)"
7,Transformed test set shape,"(1323, 562)"
8,Numeric features,561
9,Preprocess,True


In [None]:
# Show the table of estimators pycaret can train (ID, name, reference, turbo flag).
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Logistic Regression,sklearn.linear_model._logistic.LogisticRegression,True
knn,K Neighbors Classifier,sklearn.neighbors._classification.KNeighborsCl...,True
nb,Naive Bayes,sklearn.naive_bayes.GaussianNB,True
dt,Decision Tree Classifier,sklearn.tree._classes.DecisionTreeClassifier,True
svm,SVM - Linear Kernel,sklearn.linear_model._stochastic_gradient.SGDC...,True
rbfsvm,SVM - Radial Kernel,sklearn.svm._classes.SVC,False
gpc,Gaussian Process Classifier,sklearn.gaussian_process._gpc.GaussianProcessC...,False
mlp,MLP Classifier,sklearn.neural_network._multilayer_perceptron....,False
ridge,Ridge Classifier,sklearn.linear_model._ridge.RidgeClassifier,True
rf,Random Forest Classifier,sklearn.ensemble._forest.RandomForestClassifier,True


In [None]:
# Train and compare the listed candidate models, ranked by accuracy.
# NOTE(review): n_select=8 exceeds the six IDs given in `include`; with
# n_select > 1 the result is presumably a list of models rather than a single
# estimator, despite the name `best_model` — confirm before using it.
best_model = compare_models(n_select=8, sort='Accuracy', include=['lr','svm','rbfsvm','et','xgboost','lightgbm'])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9835,0.9997,0.9835,0.9837,0.9835,0.9801,0.9802,0.781
xgboost,Extreme Gradient Boosting,0.9809,0.9995,0.9809,0.9811,0.9809,0.977,0.9771,0.363
lr,Logistic Regression,0.9806,0.9992,0.9806,0.9808,0.9805,0.9766,0.9767,0.373
et,Extra Trees Classifier,0.9757,0.9991,0.9757,0.9762,0.9757,0.9708,0.9709,0.471
svm,SVM - Linear Kernel,0.9715,0.0,0.9715,0.9732,0.9714,0.9657,0.9661,0.338
rbfsvm,SVM - Radial Kernel,0.9281,0.9947,0.9281,0.9288,0.9277,0.9134,0.9137,2.076


Processing:   0%|          | 0/34 [00:00<?, ?it/s]

In [None]:
# Tune LightGBM inside the pycaret experiment.
# tune_model is documented to take a trained model object, so the model is
# built with create_model() instead of passing a bare, unfitted LGBMClassifier.
from lightgbm import LGBMClassifier
LGBM = create_model('lightgbm')
tune_LGBM = tune_model(LGBM)

# Evaluate on the held-out validation rows.
# predict_model returns a DataFrame (the input features plus prediction
# columns), not a label vector, so select the label column before scoring.
# NOTE(review): assumes pycaret 3.x column naming (`prediction_label`) and
# that labels come back as the original activity names — confirm.
val_predictions = predict_model(tune_LGBM, data = X_val)
y_pred = val_predictions['prediction_label']
print(classification_report(y_val, y_pred))

In [None]:
# NOTE(review): the pandas module has no top-level `sort_values`; this call
# raises AttributeError. sort_values is a DataFrame/Series method, so this was
# presumably meant to sort a results table (e.g. the model comparison scores)
# — confirm the intended frame and sort column.
pd.sort_values()

### (3) 알고리즘3 : RandomForest, LightGBM

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Rebuild the predictor/target split; X_val and y_val were reassigned in the
# pycaret section above, so the scaled arrays are recreated here.
y = data['Activity']
x = data.drop(columns=['Activity'])

# Same 75/25 split and seed as the first split, so results stay comparable.
X_train, X_val, y_train, y_val = train_test_split(x, y, random_state=1, test_size=0.25)

# Fit the min-max scaler on the training rows only, then scale both sets.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: 200 trees, depth capped at 50, fixed seed for reproducibility.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=50,
    random_state=1,
).fit(X_train, y_train)

# Per-class precision/recall/F1 on the validation rows.
y_pred = rf.predict(X_val)
print(classification_report(y_val, y_pred))

                    precision    recall  f1-score   support

            LAYING       1.00      1.00      1.00       272
           SITTING       0.96      0.96      0.96       247
          STANDING       0.96      0.96      0.96       279
           WALKING       0.99      0.98      0.99       249
WALKING_DOWNSTAIRS       0.97      0.98      0.98       197
  WALKING_UPSTAIRS       0.98      0.99      0.98       227

          accuracy                           0.98      1471
         macro avg       0.98      0.98      0.98      1471
      weighted avg       0.98      0.98      0.98      1471



In [None]:
from lightgbm import LGBMClassifier

# LightGBM with library-default hyper-parameters and a fixed seed.
LGBM = LGBMClassifier(random_state=1).fit(X_train, y_train)

# Per-class precision/recall/F1 on the validation rows.
y_pred = LGBM.predict(X_val)
print(classification_report(y_val, y_pred))

                    precision    recall  f1-score   support

            LAYING       1.00      1.00      1.00       272
           SITTING       0.98      0.98      0.98       247
          STANDING       0.98      0.98      0.98       279
           WALKING       1.00      0.99      1.00       249
WALKING_DOWNSTAIRS       0.99      1.00      1.00       197
  WALKING_UPSTAIRS       1.00      1.00      1.00       227

          accuracy                           0.99      1471
         macro avg       0.99      0.99      0.99      1471
      weighted avg       0.99      0.99      0.99      1471



### (4) 알고리즘4 : DNN (Keras)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Conv1D, MaxPool1D, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Check the training feature/label shapes before sizing the network input.
X_train.shape, y_train.shape

((4410, 561), (4410,))

In [None]:
# Fully connected classifier: 561 inputs -> 3 x (Dense 63 + Dropout 0.2) -> 6-way softmax.
il = Input(shape=(561,))

# Build the three identical hidden stages in a loop.
hl = il
for _ in range(3):
    hl = Dense(63, activation='relu')(hl)
    hl = Dropout(0.2)(hl)

ol = Dense(6, activation='softmax')(hl)

model = Model(il, ol)

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
es = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=5,
    verbose=1,
    restore_best_weights=True,
)


In [None]:
# sparse_categorical_crossentropy expects integer class ids, but y_train holds
# the activity names as strings; the UnimplementedError recorded for this cell
# is consistent with that mismatch. Encode the labels to 0..n_classes-1 first.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# 20% of the training rows are held out by Keras to drive early stopping.
model.fit(X_train, y_train_encoded, validation_split=0.2, epochs=100, callbacks=[es])


Epoch 1/100


UnimplementedError: ignored

In [None]:
# predict() returns one softmax probability per class for every validation row.
pred = model.predict(X_val)

# Turn the probabilities into activity names so the network can be scored with
# the same report as the other models.
# np.unique yields the class names in sorted order; this assumes the network
# was trained on labels integer-encoded in that same sorted order (the order
# sklearn's LabelEncoder produces) — confirm against the training cell.
class_names = np.unique(y_train)
y_pred = class_names[pred.argmax(axis=1)]
print(classification_report(y_val, y_pred))