# プロジェクト

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import warnings

from sklearn.decomposition import PCA
from lightgbm import LGBMClassifier
from sklearn.manifold import TSNE
from sklearn.svm import SVC,LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifier,SGDClassifier,PassiveAggressiveClassifier,LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier,StackingClassifier,ExtraTreesClassifier
from sklearn.feature_selection import RFECV,chi2,SelectKBest, SelectPercentile
from sklearn.gaussian_process import GaussianProcessClassifier
from xgboost.sklearn import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier,RadiusNeighborsClassifier
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,RandomizedSearchCV,StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler

* Data

In [None]:
# Directory prefix used for the CSV inputs and for the saved prediction files.
root_dir= "data/"
nsample = 3000  # NOTE(review): not referenced by any cell visible here - confirm it is still needed
# Each non-empty line of features.txt is split on whitespace and its second
# token is kept as the feature name. Blank lines (e.g. a trailing newline at
# the end of the file) are skipped so they cannot raise IndexError.
# NOTE(review): features.txt is opened relative to the working directory,
# not root_dir - confirm that is intended.
with open('features.txt') as f:
    features = [line.split()[1] for line in f if line.strip()]
print('No of Features: {}'.format(len(features)))

In [None]:
# Load the training features, the training labels and the test features from
# header-less CSV files under root_dir.
X_train = pd.read_csv(root_dir+"X_train.csv",header=None)
# X_train.columns=features
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0.
# DataFrame.squeeze("columns") is the documented replacement: when the frame
# has a single column it returns that column as a Series (named 'Activity').
y_train = pd.read_csv(root_dir+"y_train.csv",names=['Activity'],header=None).squeeze("columns")
X_test = pd.read_csv(root_dir+"X_test.csv",header=None)
# X_test.columns=features

# train = X_train.copy()
# y_train_labels = y_train.map({1: 'WALKING', 2:'WALKING_UPSTAIRS',3:'WALKING_DOWNSTAIRS',\
#                 4:'SITTING', 5:'STANDING',6:'LAYING'})
# train['subject'] = pd.read_csv('subject_train.csv',header=None)
# train['Activity'] = y_train
# train['ActivityName'] = y_train_labels



In [None]:
# X_train.hist(figsize=(50,50))

In [None]:
# X_train.skew().sort_values(ascending=False).head(10)

In [None]:
# X_train.std().sort_values(ascending=False).head(10)

In [None]:
# print('No of duplicates in train: {}'.format(sum(X_train.duplicated())))
# print('No of duplicates in test : {}'.format(sum(X_test.duplicated())))
# print('We have {} NaN/Null values in train'.format(X_train.isnull().values.sum()))
# print('We have {} NaN/Null values in test'.format(X_test.isnull().values.sum()))


In [None]:
# y_train.hist(figsize=(5,5))

In [None]:
# sns.set_style('whitegrid')
# plt.rcParams['font.family'] = 'Dejavu Sans'

# plt.figure(figsize=(16,8))
# plt.title('Data provided by each user', fontsize=20)
# sns.countplot(x='subject',hue='ActivityName', data = train)
# plt.show()

In [None]:
# X_train.corr()['Activity'].sort_values().head(20)

* Feature Engineering

In [None]:
# Correlation-based feature filter: a column is dropped when its correlation
# with ANY column positioned before it is at or above the threshold.
# NOTE(review): only positive correlation is tested; strongly negative pairs
# are kept. Use the absolute value if those should count as redundant too.
CORR_THRESHOLD = 0.85

corr = X_train.corr()
# Restrict the comparison to the strict upper triangle (row i < column j) and
# evaluate it in one vectorised pass instead of ~n^2/2 scalar .iloc lookups.
# NaN correlations compare as False, exactly as in a scalar `>=` test.
upper_pairs = np.triu(np.ones(corr.shape, dtype=bool), k=1)
too_correlated = ((corr.values >= CORR_THRESHOLD) & upper_pairs).any(axis=0)
flag = ~too_correlated  # True for the columns that are kept

select = X_train.columns[flag].tolist()
# Apply the same column subset to both sets so their features stay aligned.
X_train = X_train[select]
X_test = X_test[select]
print(X_train.columns)

* Outlier detection

In [None]:
# X_train['Activity'] = y_train
# for i in X_train.columns.values:
#     u = np.median(X_train[i])
#     s = np.std(X_train[i])
#     idx = (X_train[i]<u+5*s)&(X_train[i]>u-5*s)
#     X_train = X_train[idx]
#     y_train = y_train[idx]
# X_train.shape

* Train/validation split

In [None]:
# Standard scaling is currently disabled; the features are used as they are.
# standard_scaler = StandardScaler()
# standard_scaler.fit(X_train)
# X = standard_scaler.transform(X_train)
X, y = X_train, y_train

# Hold out 20% of the labelled rows for validation (fixed seed so the split
# is repeatable).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Convert every piece to a NumPy array; label arrays are flattened to 1-D.
X_train = np.array(X_train)
y_train = np.array(y_train).flatten()

X_val = np.array(X_val)
y_val = np.array(y_val).flatten()

X_test = np.array(X_test)
# X_test = standard_scaler.transform(X_test)

# Displayed as the cell output: shapes of the resulting arrays.
(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

* Models after hyperparameter tuning

In [None]:
# Registry of candidate classifiers, all constructed with default arguments.
# Keys are space-padded to a common width (presumably so printed results line
# up); insertion order is preserved.
models = dict([
    ('DecisionTreeClassifier    ', DecisionTreeClassifier()),
    ('SVC                       ', SVC()),
    ('LogisticRegression        ', LogisticRegression()),
    ('GradientBoostingClassifier', GradientBoostingClassifier()),
    ('Ridge                     ', RidgeClassifier()),
    ('LGBMClassifier            ', LGBMClassifier()),
    ('RandomForestClassifier    ', RandomForestClassifier()),
    ('XGBClassifier             ', XGBClassifier()),
])

{'DecisionTreeClassifier    ': 'Train: 1.0000, Val: 0.9331',
 'SVC                       ': 'Train: 0.9870, Val: 0.9823',
 'LogisticRegression        ': 'Train: 0.9983, Val: 0.9803',
 'GradientBoostingClassifier': 'Train: 1.0000, Val: 0.9921',
 'LGBMClassifier            ': 'Train: 1.0000, Val: 0.9931',
 'RandomForestClassifier    ': 'Train: 1.0000, Val: 0.9754',
 'XGBClassifier             ': 'Train: 1.0000, Val: 0.9931'}

In [None]:
# 分類モデル
# scorelist = {}
# for key, model in models.items():
#     model.fit(X_train, y_train) # 訓練データで学習
#     yHatTrain = model.predict(X_train)
#     yHatVal = model.predict(X_val)
#     scorelist[key] = 'Train: {:.4f}, Val: {:.4f}'.format(len((np.where(yHatTrain == y_train))[0])*1.0/X_train.shape[0],
#                         len((np.where(yHatVal == y_val))[0])*1.0/X_val.shape[0])
#     yHatTest = model.predict(X_test)
#     np.savetxt(root_dir+'fs0.84result_'+key+'.txt', yHatTest)
# scorelist

In [None]:
# penalty='l1',dual=False
# parameters = {
#               'max_depth': [1, 5, 10, 15, 20, 25, 30, 35],
#               'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],
#               'feature_fraction': [0.6, 0.7, 0.8, 0.9, 0.95],
#               'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 0.95],
#               'bagging_freq': [2, 4, 5, 6, 8],
#               'lambda_l1': [0, 0.1, 0.4, 0.5, 0.6],
#               'lambda_l2': [0, 10, 15, 35, 40],
#               'cat_smooth': [1, 10, 15, 20, 35]
# }7,min_child_weight=2,reg_alpha=0.6,scale_pos_weight=1
# model = XGBClassifier()
# model.fit(X_train, y_train)
# yHatTrain = model.predict(X_train)
# yHatVal = model.predict(X_val)

# print('Train: {:.6f}, Val: {:.6f}'.format(len((np.where(yHatTrain == y_train))[0])*1.0/X_train.shape[0],
#                         len((np.where(yHatVal == y_val))[0])*1.0/X_val.shape[0]))
# yHatTest = model.predict(X_test)
# np.savetxt(root_dir+'fs0.85ssresultXGBClassifier.txt', yHatTest)

In [None]:
# paralist = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
# Sweep LightGBM's min_child_samples over 25..30 with max_depth fixed at 18.
# For each setting: fit on the training split, print train/validation
# accuracy, and write the test-set predictions to their own text file.
for i in range(25, 31):
    model = LGBMClassifier(max_depth=18, min_child_samples=i)
    model.fit(X_train, y_train)

    yHatTrain, yHatVal = model.predict(X_train), model.predict(X_val)

    # Accuracy = number of predictions equal to the true label / row count.
    train_acc = np.count_nonzero(yHatTrain == y_train) * 1.0 / X_train.shape[0]
    val_acc = np.count_nonzero(yHatVal == y_val) * 1.0 / X_val.shape[0]
    print('mcs: {}, Train: {:.6f}, Val: {:.6f}'.format(i, train_acc, val_acc))

    yHatTest = model.predict(X_test)
    np.savetxt(root_dir+'result_LGBM_mcs'+ str(i) + '.txt', yHatTest)
