In [1]:
import pandas as pd

def get_new_feature_name_df(old_feature_name_df):
    feature_dup_df = pd.DataFrame(data=old_feature_name_df.groupby('column_name').cumcount(), columns=['dup_cnt'])
    feature_dup_df = feature_dup_df.reset_index()
    new_feature_name_df = pd.merge(old_feature_name_df.reset_index(), feature_dup_df, how='outer')
    new_feature_name_df['column_name'] = new_feature_name_df[['column_name', 'dup_cnt']].apply(lambda x : x[0]+'_'+str(x[1]) 
                                                                                           if x[1] > 0 else x[0] ,  axis=1)
    new_feature_name_df = new_feature_name_df.drop(['index'], axis=1)
    return new_feature_name_df


def get_human_dataset():
    feature_name_df = pd.read_csv('../data/UCI HAR Dataset/features.txt', sep='\s+', 
                              header=None, names=['column_index', 'column_name'])
    new_feature_name_df = get_new_feature_name_df(feature_name_df)
    feature_name = new_feature_name_df.iloc[:, 1].values.tolist()
    
    X_train = pd.read_csv('../data/UCI HAR Dataset/train/X_train.txt', sep='\s+', names=feature_name)
    X_test = pd.read_csv('../data/UCI HAR Dataset/test/X_test.txt', sep='\s+', names=feature_name)
    
    y_train = pd.read_csv('../data/UCI HAR Dataset/train/y_train.txt', sep='\s+', header=None, names=['action'])
    y_test = pd.read_csv('../data/UCI HAR Dataset/test/y_test.txt', sep='\s+', header=None, names=['action'])
    
    return X_train, X_test, y_train, y_test

In [2]:
X_train, X_test, y_train, y_test = get_human_dataset()

In [3]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import time
import warnings
warnings.filterwarnings('ignore')

start_time = time.time()
gb_clf = GradientBoostingClassifier(random_state=0)
gb_clf.fit(X_train, y_train)
pred = gb_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)
print(f'accuracy : {accuracy:.4f}')
print(f'time : {time.time() - start_time:.4f} sec')

accuracy : 0.9389
time : 571.8220 sec


In [4]:
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators': [100, 500],
    'learning_rate': [0.05, 0.1]
}

grid_cv = GridSearchCV(gb_clf, param_grid=params, cv=2, verbose=1)
grid_cv.fit(X_train, y_train)
print(f'best parameter : {grid_cv.best_params_}')
print(f'best accuracy : {grid_cv.best_score_:.4f}')

Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 106.5min finished


best parameter : {'learning_rate': 0.1, 'n_estimators': 500}
best accuracy : 0.9011


In [5]:
scores_df = pd.DataFrame(grid_cv.cv_results_)
scores_df[
    ['params', 'mean_test_score', 'rank_test_score', 'split0_test_score', 'split1_test_score']
]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score
0,"{'learning_rate': 0.05, 'n_estimators': 100}",0.899619,3,0.886834,0.912405
1,"{'learning_rate': 0.05, 'n_estimators': 500}",0.900299,2,0.890098,0.910501
2,"{'learning_rate': 0.1, 'n_estimators': 100}",0.898939,4,0.886561,0.911317
3,"{'learning_rate': 0.1, 'n_estimators': 500}",0.901115,1,0.889826,0.912405


In [6]:
pred = grid_cv.best_estimator_.predict(X_test)
accuracy = accuracy_score(y_test, pred)
print(f'accuracy : {accuracy:.4f}')

accuracy : 0.9420
