In [1]:
import time
from IPython.display import clear_output
import numpy    as np
import pandas   as pd

import optuna
from random import shuffle
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.pipeline import Pipeline ,make_pipeline
from sklearn import impute
from sklearn import compose
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import accuracy_score, balanced_accuracy_score, plot_confusion_matrix,roc_auc_score
from sklearn import set_config
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import MinMaxScaler,Normalizer,PowerTransformer,QuantileTransformer, RobustScaler,StandardScaler,LabelEncoder, OneHotEncoder,OrdinalEncoder
from sklearn.tree          import DecisionTreeClassifier
from sklearn.linear_model  import LogisticRegression    
from xgboost               import XGBClassifier
from lightgbm              import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import sklearn
# Render estimators/pipelines as interactive diagrams in notebook output.
set_config(display='diagram')

# Record the library versions this notebook ran with (scikit-learn 0.24 is
# the version the original author recommends).
for label, module in (("Pandas  ", pd), ("Sklearn ", sklearn)):
    print(label, module.__version__)



Pandas   1.2.4
Sklearn  0.24.2


In [11]:
# Load the raw dataset from a path relative to the notebook.
# NOTE(review): the file name contains a space before ".csv" — confirm this
# matches the file on disk and is not a typo.
data = pd.read_csv('./data/data_5secondWindow .csv')

In [None]:
# Summary statistics of the numeric columns (displayed as the cell output).
data.describe()

In [None]:
# Number of rows per value of the 'target' column, to check class balance.
data.groupby('target').size()

In [12]:
def change_name(df):
    """Replace '.' and '#' in every column name of ``df`` with '_'.

    The frame is modified in place (its column labels are reassigned in a
    single step) and the same object is returned, so both
    ``change_name(df)`` and ``df = change_name(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with sanitised column names.
    """
    df.columns = [
        label.replace('.', '_').replace("#", '_') for label in df.columns
    ]
    return df
# Sanitise the column names ('.' and '#' become '_') before further processing.
data=change_name(data)

In [19]:
# Keep only the rows whose label is neither 'Bus' nor 'Train'.
excluded_targets = ['Bus', 'Train']
data = data[~data['target'].isin(excluded_targets)]

In [None]:
# Hand-picked subset of columns, plus the label ('target') and the 'user' id.
# In the visible notebook it is only referenced by a commented-out selection.
features = [
    'android_sensor_gyroscope_mean',
    'android_sensor_accelerometer_std',
    'android_sensor_gyroscope_uncalibrated_mean',
    'android_sensor_linear_acceleration_mean',
    'android_sensor_gyroscope_uncalibrated_max',
    'android_sensor_gyroscope_max',
    'speed_max',
    'android_sensor_rotation_vector_mean',
    'android_sensor_accelerometer_min',
    'android_sensor_magnetic_field_uncalibrated_min',
    'sound_min',
    'target',
    'user',
]

In [None]:
# data=data[features]

In [20]:
def split_data(df, test_users=('IvanHeibi',)):
    """Split ``df`` into train and test frames by the ``user`` column.

    All rows belonging to ``test_users`` form the test set; every other row
    goes to the training set. Holding out whole users keeps one person's
    recordings from appearing on both sides of the split.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``user`` column.
    test_users : str or iterable of str, optional
        User id(s) to hold out. Defaults to the single user that the original
        notebook hard-coded.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``, both row subsets of ``df``.
    """
    # Accept a bare string as a convenience; isin() needs a list-like.
    if isinstance(test_users, str):
        test_users = [test_users]
    is_test = df['user'].isin(list(test_users))
    train = df[~is_test]
    test = df[is_test]
    return train, test

In [21]:
# Partition the rows into a training frame and a held-out test frame by user.
df_train,df_test=split_data(data)


In [22]:
# Columns whose share of missing values exceeds this are dropped later.
MISSING_THRESHOLD = 0.75


def _mostly_missing(frame, threshold=MISSING_THRESHOLD):
    """Return the column labels of ``frame`` whose null fraction exceeds ``threshold``."""
    missing_share = frame.isnull().mean()
    return missing_share.index[missing_share > threshold]


train_missing = _mostly_missing(df_train)
test_missing = _mostly_missing(df_test)

# A column is flagged if it is mostly empty in either partition. Sorting makes
# the resulting list deterministic (plain set ordering is not).
all_missing = sorted(set(train_missing) | set(test_missing))
print('There are %d columns with more than %d%% missing values'
      % (len(all_missing), round(MISSING_THRESHOLD * 100)))

There are 8 columns with more than 75% missing values


In [23]:
# Remove the mostly-empty columns from both partitions so they keep the
# same set of columns.
df_train, df_test = (
    frame.drop(columns=all_missing) for frame in (df_train, df_test)
)


In [24]:
# Columns that are dropped from both partitions before modelling.
unimp = ['user', 'id', 'time']
df_train = df_train.drop(columns=unimp)
df_test = df_test.drop(columns=unimp)

In [25]:
# Sanity check: (rows, columns) of the train frame, then of the test frame.
for frame in (df_train, df_test):
    print(frame.shape)

(3265, 59)
(271, 59)


In [26]:
# Separate the label column ('target') from the feature columns in both
# the training and the held-out test partition.
X, y = df_train.drop(columns='target'), df_train['target']
X_test, y_test = df_test.drop(columns='target'), df_test['target']

In [27]:
# Preprocessing pipeline with PCA
# pca=PCA(n_components=15)
# scaler=StandardScaler()
# encoder = LabelEncoder()
# onecoder=OneHotEncoder()
# Every column of X goes through the same numeric preprocessing.
num_var = list(X.columns)

# Numeric branch: fill gaps with the column mean, rescale each feature to
# [0, 20], then reduce dimensionality with PCA (component count chosen by
# the 'mle' option).
num_pip = Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='mean')),
    ('scalar', MinMaxScaler(feature_range=(0, 20))),
    ('PCA', PCA(n_components='mle', svd_solver='full')),
])

preprocessor = compose.ColumnTransformer(
    transformers=[('num', num_pip, num_var)]
)

In [None]:
# X_sc=preprocessor.fit_transform(X)
# X_test_sc=preprocessor.fit_transform(X_test)
# X_sc = pd.DataFrame(X_sc, columns=X.columns)
# X_test_sc = pd.DataFrame(X_test_sc, columns=X_test.columns)

In [None]:
# y_train = np.reshape(y_train, (y_train.shape[0]))
# y_test = np.reshape(y_test, (y_test.shape[0]))

In [28]:
# Candidate classifiers. All use library defaults except k-NN, which uses
# 6 neighbours.
models = {
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=6),
    'LogisticRegression': LogisticRegression(),
    "RandomForestClassifier": RandomForestClassifier(),
    "GradientBoostingClassifier": GradientBoostingClassifier(),
    "XGBClassifier": XGBClassifier(),
    "LGBMClassifier": LGBMClassifier(),
    'GaussianNB': GaussianNB(),
}

# Wrap every model in a pipeline that applies the preprocessing first.
# Each pipeline receives its own unfitted copy of the preprocessor: a
# Pipeline fits the step objects it was given, so sharing one instance would
# make every fit overwrite the preprocessing state used by all the others.
classifiers = {
    name: make_pipeline(sklearn.base.clone(preprocessor), model)
    for name, model in models.items()
}
classifiers["GradientBoostingClassifier"]

In [None]:
# Hold out 20% of the training frame (stratified by label) to compare models.
x_train, x_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=10
)

# Collect one record per model and build the table once at the end;
# growing a DataFrame row by row is quadratic and DataFrame.append no longer
# exists in pandas 2.x.
records = []
for name, model in classifiers.items():
    start_time = time.perf_counter()
    model.fit(x_train, y_train)
    total_time = time.perf_counter() - start_time  # fit time only, in seconds
    valid_pred = model.predict(x_val)
    records.append({
        "Model": name,
        "Accuracy": accuracy_score(y_val, valid_pred) * 100,
        "Bal Acc.": balanced_accuracy_score(y_val, valid_pred) * 100,
        "Time": total_time,
    })
results = pd.DataFrame(records, columns=['Model', 'Accuracy', 'Bal Acc.', 'Time'])

# Rank by accuracy, best first, with a 1-based rank as the index.
results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')

In [None]:
# LightGBM candidate with hand-set hyperparameters.
name = 'LGBMClassifier'
best_model = LGBMClassifier(
    max_depth=28,
    n_estimators=300,
    num_leaves=70,
    min_data_in_leaf=110,
    learning_rate=0.069,
)

# Same model behind the shared preprocessing steps.
# NOTE(review): the evaluation cell that follows fits `best_model` directly,
# not `clf` — confirm which of the two is meant to be evaluated.
clf = Pipeline(steps=[('pre', preprocessor), ('classification', best_model)])

# Hyperparameters from an earlier tuning run, kept for reference:
# 'boosting_type': 'gbdt', 'max_depth': 28, 'n_estimators': 388, 'learning_rate': 0.03432830269529964, 'min_data_in_leaf': 104

In [None]:
# Fit on the full training frame and score on the held-out user's rows.
# NOTE(review): this fits the bare `best_model` on the raw feature frame, so
# the imputer/scaler/PCA steps of `clf` are not applied here — confirm that
# evaluating without preprocessing is intended.
start_time = time.perf_counter()
best_model.fit(X, y)
total_time = time.perf_counter() - start_time  # fit time only, in seconds
pred = best_model.predict(X_test)

# Build the one-row result table directly (DataFrame.append no longer exists
# in pandas 2.x).
test_result = pd.DataFrame(
    [{
        "Model": name,
        "Accuracy": accuracy_score(y_test, pred) * 100,
        "Bal Acc.": balanced_accuracy_score(y_test, pred) * 100,
        "Time": total_time,
    }],
    columns=['Model', 'Accuracy', 'Bal Acc.', 'Time'],
)
test_result_ord = test_result.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
test_result_ord.index += 1
test_result_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')


In [None]:
# Per-class precision/recall/F1 on the held-out rows, then a blank line,
# then the raw confusion matrix.
report_text = classification_report(y_test, pred)
matrix = confusion_matrix(y_test, pred)
print(report_text, matrix, sep='\n\n')

# Optuna: find the best hyperparameters for the selected model

In [None]:

def objective(trial):
    """Optuna objective: mean 2-fold cross-validated score of an LGBMClassifier.

    Samples ``boosting_type``, ``max_depth``, ``n_estimators``,
    ``learning_rate`` and ``min_data_in_leaf`` from the trial, evaluates the
    resulting classifier on the notebook-level ``X`` / ``y`` with
    ``cross_val_score`` and returns the mean of the fold scores.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean cross-validation score (to be maximised by the study).
    """
    boosting_type = trial.suggest_categorical("boosting_type", ["gbdt", "rf"])
    params = {
        "boosting_type": boosting_type,
        "max_depth": trial.suggest_int('max_depth', 1, 32),
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 500),
    }

    if boosting_type == "rf":
        # LightGBM's random-forest mode refuses to train unless row/feature
        # sub-sampling is enabled. Without these settings every "rf" trial
        # fails inside cross_val_score and yields NaN instead of a score.
        params.update(subsample=0.8, subsample_freq=1, colsample_bytree=0.8)

    lg_lgbm = LGBMClassifier(**params)

    score = cross_val_score(lg_lgbm, X, y, n_jobs=-1, cv=2)
    accuracy = score.mean()
    return accuracy

# Run 100 trials, maximising the value returned by `objective`.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

# Report the best trial's score and the hyperparameters that produced it.
trial = study.best_trial
summary_lines = (
    ('Accuracy: {}', trial.value),
    ("Best hyperparameters: {}", trial.params),
)
for template, value in summary_lines:
    print(template.format(value))
