In [2]:
# import package
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [3]:
path_root = os.path.dirname(os.path.abspath('')) # get path root folder
path_dataset = os.path.join(path_root, 'dataset', 'dataset.xlsx')
path_result_file = os.path.join(path_root, 'result', 'customer')
file_save_result = os.path.join(path_result_file, 'result.csv')

In [None]:
def load_sensor_data(path):
    return pd.read_excel(path, skiprows = range(0, 2))

def drop_column(df):
    x = [0, 1, 2, 3, 4] #  0: Unnamed-0, Unnamed-1, Unnamed-2,  データＩＤ, Time
    df_droped = df.drop(df.columns[x], axis=1)
    return df_droped


# Task 1: Loading data from file excel 
data = load_sensor_data(path=path_dataset)

# Task 2: change name column
data_copy = data.copy()
data_copy.rename(columns={'Unnamed: 0': 'datecheck-ID', 
                          'Unnamed: 1': 'shoes-ID',
                          'Unnamed: 2': 'balance-status',
                          'データＩＤ':'set-ID',
                         }, inplace = True)

# Task 3: removing feature which relate to train
data_copy = drop_column(df=data_copy)

In [None]:
# checking skew, kurt 
from scipy.special import boxcox1p
from scipy.stats import norm, skew, kurtosis #for some statistics

def tranform_data(value):
    return np.sign(value) * np.log(np.abs(value) + 1)

def show_skew_kurt(data_show):
    numeric_feats = data_show.dtypes[data_show.dtypes != "object"].index
    # Check the skew of all numerical features
    skewed_feats = data_show[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
    kurt_feats = data_show[numeric_feats].apply(lambda x: kurtosis(x.dropna())).sort_values(ascending=False)
    print("\nSkew in numerical features: \n")
    skewness = pd.DataFrame({'Skew' :skewed_feats, 'Kurt': kurt_feats})
    return skewness
    
def standardiziation(dataframe, skewness):
    skewness = skewness[abs(skewness) > 0.75]
    print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

    skewed_features = skewness.index
    lam = 0.15
    for feat in skewed_features:
#         dataframe[feat] = boxcox1p(dataframe[feat], lam)
        dataframe[feat] = tranform_data(dataframe[feat])
    return dataframe
    

In [None]:
skewness = show_skew_kurt(data_copy)

In [None]:
data_frame = standardiziation(data_copy.copy(), skewness)

In [None]:
show_skew_kurt(data_frame)

In [None]:
data_frame.info()

In [None]:
############################

In [10]:
# Saving ... 
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(data_frame, test_size=0.2, random_state=42)

label_columns = ["L-FX", "L-FY", "L-FZ", "R-FX", "R-FY", "R-FZ"]
data_training = train_set.drop(label_columns, axis=1)
data_labels = train_set[label_columns].copy()

data_test = test_set.drop(label_columns, axis=1)
data_test_labels = test_set[label_columns].copy()

In [11]:
print("the amount of data train: %s - the amount of data test: %s" % (data_training.shape, data_test.shape))
print("the amount of labels train: %s - the amount of labels test: %s" % (data_labels.shape, data_test_labels.shape))

the amount of data train: (189110, 48) - the amount of data test: (47278, 48)
the amount of labels train: (189110, 6) - the amount of labels test: (47278, 6)


In [12]:
# building Model 
n_folds = 5

def rmsle_cv(model, train, label):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
    mse = -cross_val_score(model, train.values, label.values, scoring="neg_mean_squared_error", cv = kf)
    return (mse)


In [13]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC, LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler

# init model 
linearmodel = LinearRegression()
lasso = make_pipeline(StandardScaler(), Lasso(alpha =0.0005, random_state=1))
ENet = make_pipeline(StandardScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)



In [14]:
score = rmsle_cv(lasso, data_training, data_labels)
print(score)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

[3.16113352 3.15686199 3.1758528  3.17151857 3.16170399]

Lasso score: 3.1654 (0.0071)



In [15]:
score = rmsle_cv(linearmodel, data_training, data_labels)
print(score)
print("\nLinear model score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

[3.16117154 3.15687129 3.17582545 3.17149486 3.16167333]

Linear model score: 3.1654 (0.0071)



In [16]:
score = rmsle_cv(ENet, data_training, data_labels)
print(score)
print("\nElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

[3.16113547 3.15686098 3.1758485  3.1715155  3.16169985]

ElasticNet score: 3.1654 (0.0071)



In [1]:
score = rmsle_cv(KRR, data_training, data_labels)
print(score)
print("\nKernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

NameError: name 'rmsle_cv' is not defined

In [None]:
score = rmsle_cv(GBoost)
print(score)
print("\nGradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))