In [1]:
import time
from IPython.display import clear_output
import numpy    as np
import pandas   as pd
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.pipeline import Pipeline ,make_pipeline
from sklearn import impute
from sklearn import compose
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import accuracy_score, balanced_accuracy_score, plot_confusion_matrix,roc_auc_score
from sklearn import set_config
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import MinMaxScaler,Normalizer,PowerTransformer,QuantileTransformer, RobustScaler,StandardScaler,LabelEncoder, OneHotEncoder,OrdinalEncoder
from sklearn.tree          import DecisionTreeClassifier
from sklearn.linear_model  import LogisticRegression    
from xgboost               import XGBClassifier
from lightgbm              import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import sklearn
set_config(display='diagram') # Useful for display the pipeline
print("Pandas  ", pd.__version__)
print("Sklearn ", sklearn.__version__) # Try to use 0.24



Pandas   1.2.4
Sklearn  0.24.2


In [3]:
data = pd.read_csv('./data/data_5secondWindow .csv')
data.head(5)

Unnamed: 0,id,time,activityrecognition#0,activityrecognition#1,android.sensor.accelerometer#mean,android.sensor.accelerometer#min,android.sensor.accelerometer#max,android.sensor.accelerometer#std,android.sensor.game_rotation_vector#mean,android.sensor.game_rotation_vector#min,...,sound#mean,sound#min,sound#max,sound#std,speed#mean,speed#min,speed#max,speed#std,target,user
0,16170,78.0,,100.0,9.811476,9.758895,9.849411,0.014626,0.02934,0.029014,...,,,,,0.0,0.0,0.0,0.0,Still,Luca
1,15871,145.0,,100.0,9.939207,7.707437,17.146631,1.775944,0.999925,0.999903,...,89.20021,89.065143,89.335277,0.191013,16.539349,16.539349,16.539349,0.628595,Car,Luca
2,16811,150.0,,100.0,9.827178,9.804817,9.849262,0.011199,0.665215,0.665213,...,,,,,0.0,0.0,0.0,0.0,Still,Luca
3,15831,105.0,,77.0,9.673039,7.659674,12.304298,0.862553,0.996221,0.993781,...,87.470377,87.470377,87.470377,2.284186,17.739895,17.739895,17.739895,0.628595,Car,Luca
4,876,77.0,,100.0,9.993466,8.965621,10.891645,0.504117,0.563792,0.521799,...,89.770732,89.770732,89.770732,0.006389,9.0,9.0,9.0,,Car,andrea


In [6]:
# change name of columns 
def change_name(df):
    column_names=[]
    for i in df.columns:
        k=i.replace('.','_').replace("#",'_')
        df.rename(columns = {i:k}, inplace = True)
    return df
data=change_name(data)

In [7]:
features=[
'id',    
'user',
'android_sensor_gyroscope_mean',
'android_sensor_accelerometer_std',
'android_sensor_gyroscope_std',
'android_sensor_gyroscope_uncalibrated_mean',
'android_sensor_accelerometer_max',
'android_sensor_linear_acceleration_mean',
'speed_mean',
'android_sensor_rotation_vector_mean',
'android_sensor_rotation_vector_max',
'android_sensor_accelerometer_min',
'android_sensor_magnetic_field_uncalibrated_min',
'sound_min',
'target'
]

In [8]:
data=data[features]

# Create test data 

In [9]:
# create test data from some user 
df=data.copy()
user1=df[df['user']=='Pierpaolo']
user2=df[df['user']=='IvanHeibi']
user3=df[df['user']=='AndreaCarpineti']
user4=df[df['user']=='Elena']
users=[user1,user2,user3,user4]
df_test=pd.concat(users)
df_test.head(5)

Unnamed: 0,id,user,android_sensor_gyroscope_mean,android_sensor_accelerometer_std,android_sensor_gyroscope_std,android_sensor_gyroscope_uncalibrated_mean,android_sensor_accelerometer_max,android_sensor_linear_acceleration_mean,speed_mean,android_sensor_rotation_vector_mean,android_sensor_rotation_vector_max,android_sensor_accelerometer_min,android_sensor_magnetic_field_uncalibrated_min,sound_min,target
33,19919,Pierpaolo,0.615399,1.283204,0.655962,0.778458,14.028126,1.721314,1.19105,0.955202,0.955202,9.764513,36.378866,77.064227,Walking
54,19494,Pierpaolo,0.447802,0.361961,0.116564,0.334574,9.963264,0.9582,0.0,0.983561,0.989576,8.712177,77.496043,74.178611,Bus
102,20104,Pierpaolo,0.069503,0.070605,0.04814,0.078626,9.18734,0.868719,0.0,0.78757,0.802811,9.012243,91.951478,6.0206,Walking
121,19436,Pierpaolo,0.045041,0.451773,0.027367,0.038644,9.563967,0.958079,7.800006,0.993876,0.996117,8.24622,70.436446,72.5596,Bus
172,20245,Pierpaolo,1.734294,4.272022,1.400249,1.331566,17.743662,8.823918,1.25,0.985658,0.993053,3.332163,90.118737,73.702289,Walking


## Subtract test data from data 

In [10]:
df_train = df.merge(df_test, how='left', indicator=True)
df_train = df_train[df_train['_merge'] == 'left_only']
df_train=df_train.drop('_merge',axis=1)
df_train.head(5)

Unnamed: 0,id,user,android_sensor_gyroscope_mean,android_sensor_accelerometer_std,android_sensor_gyroscope_std,android_sensor_gyroscope_uncalibrated_mean,android_sensor_accelerometer_max,android_sensor_linear_acceleration_mean,speed_mean,android_sensor_rotation_vector_mean,android_sensor_rotation_vector_max,android_sensor_accelerometer_min,android_sensor_magnetic_field_uncalibrated_min,sound_min,target
0,16170,Luca,0.001651,0.014626,0.000737,0.016221,9.849411,0.020978,0.0,0.050413,0.056351,9.758895,51.199707,,Still
1,15871,Luca,0.036326,1.775944,0.02029,0.039023,17.146631,0.87922,16.539349,0.999981,0.999999,7.707437,82.40989,89.065143,Car
2,16811,Luca,0.001525,0.011199,0.000713,0.016302,9.849262,0.03411,0.0,0.610456,0.610456,9.804817,55.501802,,Still
3,15831,Luca,0.03644,0.862553,0.010553,0.050759,12.304298,1.488361,17.739895,0.998112,0.998112,7.659674,95.664309,87.470377,Car
4,876,andrea,0.183202,0.504117,0.098819,0.265652,10.891645,1.658308,9.0,0.322242,0.378193,8.965621,156.795909,89.770732,Car


In [11]:
print(df_train.shape)
print(df_test.shape)

(4985, 15)
(908, 15)


## drop id ,time, user columns

In [12]:
unimp    = ['id','user']
df_train = df_train.drop(unimp,axis=1)
df_test  = df_test.drop(unimp,axis=1)

In [13]:
df_train=df_train.drop('speed_mean',axis=1)
df_test=df_test.drop('speed_mean',axis=1)

In [14]:
print(df_train.shape)
print(df_test.shape)

(4985, 12)
(908, 12)


In [15]:
X=df_train.drop('target',axis=1)
y=df_train.target
X_test=df_test.drop('target',axis=1)
y_test=df_test.target

# Pipeline 

In [16]:
num_attribs=df_train.drop('target',axis=1).columns.to_list()
num_pip=Pipeline([('imputer',impute.SimpleImputer(strategy='median')),
                  ('scalar',MinMaxScaler()),
               ])  # ('PCA',PCA(n_components=12))
preprocessor=compose.ColumnTransformer([
    ('num',num_pip,num_attribs)
])

In [17]:
# pipeline for model
classifier_models = {
"DecisionTreeClassifier": DecisionTreeClassifier(),
'KNeighborsClassifier':KNeighborsClassifier(6),
'LogisticRegression':LogisticRegression(),
"RandomForestClassifier":RandomForestClassifier(),
"GradientBoostingClassifier":GradientBoostingClassifier(),
"XGBClassifier":XGBClassifier(),
"LGBMClassifier":LGBMClassifier(),
'GaussianNB':GaussianNB(),

}
# make pipline with  preprocessing 
classifier_models = {name: make_pipeline(preprocessor, model) for name, model in classifier_models.items()}
classifier_models["GradientBoostingClassifier"]

In [18]:


x_train, x_val, y_train, y_val = train_test_split(X, y,test_size=0.4,stratify = y,random_state=10 )
results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [],'Time': []})
for model_name, model in classifier_models.items():
    start_time = time.time()
    model.fit(x_train, y_train)
    total_time = time.time() - start_time
    valid_pred = model.predict(x_val)
    results = results.append({"Model":    model_name,
                              "Accuracy": accuracy_score(y_val, valid_pred)*100,
                              "Bal Acc.": balanced_accuracy_score(y_val, valid_pred)*100,
                    
                              "Time":     total_time},
                              ignore_index=True)
results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')




Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,LGBMClassifier,91.925777,91.752167,0.982526
2,XGBClassifier,91.825476,91.684729,2.609591
3,RandomForestClassifier,89.869609,89.936961,0.8031
4,GradientBoostingClassifier,89.167503,89.181145,5.651685
5,KNeighborsClassifier,84.052156,84.61007,0.016791
6,DecisionTreeClassifier,83.400201,83.75687,0.062501
7,GaussianNB,57.622869,60.048789,0.016094
8,LogisticRegression,57.372116,59.365775,0.186431


# Best model for test data  

In [19]:
best_model=LGBMClassifier()
name='LGBMClassifier'

In [20]:
clf=Pipeline([('preprocess',preprocessor),('classification',best_model)])
test_result = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [],'Time': []})
start_time = time.time()
clf.fit(X, y)
total_time = time.time() - start_time
pred = clf.predict(X_test)
test_result = test_result.append({"Model":    name,
                          "Accuracy": accuracy_score(y_test, pred)*100,
                          "Bal Acc.": balanced_accuracy_score(y_test, pred)*100,
                          "Time":     total_time},
                          ignore_index=True)
test_result_ord = test_result.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
test_result_ord.index += 1 
test_result_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')



Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,LGBMClassifier,42.400881,49.092232,1.608223


In [21]:
print(classification_report(y_test, pred))
print()
print(confusion_matrix(y_test, pred))

              precision    recall  f1-score   support

         Bus       0.40      0.10      0.17       164
         Car       0.07      0.82      0.13        34
       Still       0.46      0.80      0.59        76
       Train       0.31      0.26      0.28        86
     Walking       0.96      0.47      0.63       548

    accuracy                           0.42       908
   macro avg       0.44      0.49      0.36       908
weighted avg       0.72      0.42      0.49       908


[[ 17 117   7  11  12]
 [  3  28   0   3   0]
 [  1  11  61   3   0]
 [  3  61   0  22   0]
 [ 18 177  64  32 257]]


# with optuna paramater

In [22]:

m_=LGBMClassifier()
model_m=Pipeline([('preprocess',preprocessor),('classification',m_)])

In [23]:
model_m.fit(X,y)

In [51]:
pred_=model_m.predict(X_test)
print(f"Accuracy {accuracy_score(y_test, pred_)*100}")
print(classification_report(y_test, pred_))
print()
print(confusion_matrix(y_test, pred_))

Accuracy 83.81057268722468
              precision    recall  f1-score   support

         Bus       0.73      0.92      0.81       164
         Car       0.44      1.00      0.61        34
       Still       0.65      0.99      0.79        76
       Train       1.00      0.99      0.99        86
     Walking       0.99      0.76      0.86       548

    accuracy                           0.84       908
   macro avg       0.76      0.93      0.81       908
weighted avg       0.89      0.84      0.85       908


[[151   8   0   0   5]
 [  0  34   0   0   0]
 [  0   0  75   0   1]
 [  0   1   0  85   0]
 [ 57  35  40   0 416]]


#  Drop some feature from target  beacuse of confusion materix result 

## Drop car  category 

In [24]:
df_test_1   =df_test[df_test['target']!='Car']
df_train_1  =df_train[df_train['target']!='Car']

In [25]:
print(df_test.shape)
print(df_train.shape)

(908, 12)
(4985, 12)


# How much info reduced 

In [26]:
print(f" After drop Car category we  reduce \n test data {100-df_test_1.shape[0]/df_test.shape[0]*100} % \n train data {100-df_train_1.shape[0]/df_train.shape[0]*100} %")

 After drop Car category we  reduce 
 test data 3.7444933920704813 % 
 train data 22.988966900702096 %


In [27]:
print(df_train_1.shape)
print(df_test_1.shape)

(3839, 12)
(874, 12)


In [28]:
X_=df_train_1.drop('target',axis=1)
y_=df_train_1.target
X_test_=df_test_1.drop('target',axis=1)
y_test_=df_test_1.target

# train data 

In [29]:
x_train_, x_val_, y_train_, y_val_ = train_test_split(X_, y_,test_size=0.4,stratify = y_,random_state=10 )
result_new = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [],'Time': []})
for model_name, model in classifier_models.items():
    start_time_ = time.time()
    model.fit(x_train_, y_train_)
    total_time_= time.time() - start_time_
    valid_pred_new = model.predict(x_val_)
    result_new = result_new.append({"Model":    model_name,
                              "Accuracy": accuracy_score(y_val_, valid_pred_new)*100,
                              "Bal Acc.": balanced_accuracy_score(y_val_, valid_pred_new)*100,
                              "Time":     total_time_},
                              ignore_index=True)
result_new_ord = result_new.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
result_new_ord.index += 1 
result_new_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')



Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,XGBClassifier,95.182292,95.386062,2.29711
2,LGBMClassifier,94.921875,94.959824,1.037382
3,RandomForestClassifier,94.335938,94.442748,0.704721
4,GradientBoostingClassifier,93.684896,93.825856,3.760938
5,KNeighborsClassifier,91.536458,91.761774,0.014168
6,DecisionTreeClassifier,90.364583,90.680303,0.04142
7,GaussianNB,67.578125,69.688606,0.016146
8,LogisticRegression,63.867188,66.205581,0.177928


#  The highest accuracy model selected  manual tuning 

In [30]:

best_=LGBMClassifier(boosting_type='gbdt', num_leaves=60, max_depth=6, learning_rate=0.07,
                          n_estimators=300, subsample_for_bin=200000, objective=None, class_weight=None,
                          min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20, subsample=1.0, 
                          subsample_freq=0, colsample_bytree=1.0, reg_alpha=0.0, reg_lambda=0.0, 
                          random_state=None, n_jobs=- 1, silent=True, importance_type='split')
model_=Pipeline([('preprocess',preprocessor),('classification',best_)])

In [31]:
model_.fit(X_,y_)
pred_test_1=model_.predict(X_test_)

In [32]:
accuracy_score(y_test_, pred_test_1)*100,

(62.81464530892449,)

In [33]:
print(classification_report(y_test_, pred_test_1))
print()
print(confusion_matrix(y_test_, pred_test_1))

              precision    recall  f1-score   support

         Bus       0.38      0.24      0.30       164
       Still       0.38      0.89      0.54        76
       Train       0.40      0.97      0.56        86
     Walking       0.94      0.65      0.77       548

    accuracy                           0.63       874
   macro avg       0.52      0.69      0.54       874
weighted avg       0.73      0.63      0.64       874


[[ 40  24  77  23]
 [  2  68   5   1]
 [  3   0  83   0]
 [ 59  86  45 358]]


# Optuna find best parameter for selected model

In [34]:
from sklearn.calibration import CalibratedClassifierCV
import optuna

In [35]:
oth=Pipeline([('preprocess',preprocessor),('classification',GaussianNB())])

In [37]:
import optuna

def objective(trial):

    boosting_type = trial.suggest_categorical("boosting_type", ["gbdt", "rf"])
    max_depth = trial.suggest_int('max_depth', 1, 32)
    n_estimators = trial.suggest_int("n_estimators", 100,500)
    learning_rate=trial.suggest_float("learning_rate", 0.001,0.1)
    min_data_in_leaf= trial.suggest_int("min_data_in_leaf", 10,500)
    
    lg_lgbm = LGBMClassifier( boosting_type=boosting_type, max_depth=max_depth, 
            n_estimators=n_estimators,learning_rate=learning_rate,min_data_in_leaf=min_data_in_leaf
        )

    score = cross_val_score(lg_lgbm, X_,y_, n_jobs=-1, cv=3)
    accuracy = score.mean()
    return accuracy

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

trial = study.best_trial
print('Accuracy: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))


[32m[I 2021-05-19 19:12:42,533][0m A new study created in memory with name: no-name-23beef8b-5d32-4059-ba0c-0db580fde41c[0m
[32m[I 2021-05-19 19:12:46,079][0m Trial 0 finished with value: 0.9348771012509772 and parameters: {'boosting_type': 'gbdt', 'max_depth': 4, 'n_estimators': 404, 'learning_rate': 0.0921684692319195, 'min_data_in_leaf': 426}. Best is trial 0 with value: 0.9348771012509772.[0m
[32m[I 2021-05-19 19:12:49,555][0m Trial 1 finished with value: 0.9450374234427938 and parameters: {'boosting_type': 'gbdt', 'max_depth': 28, 'n_estimators': 494, 'learning_rate': 0.08797995436452907, 'min_data_in_leaf': 295}. Best is trial 1 with value: 0.9450374234427938.[0m
[33m[W 2021-05-19 19:12:49,645][0m Trial 2 failed, because the objective function returned nan.[0m
[33m[W 2021-05-19 19:12:49,768][0m Trial 3 failed, because the objective function returned nan.[0m
[32m[I 2021-05-19 19:12:51,443][0m Trial 4 finished with value: 0.9502473856528538 and parameters: {'boosti

[33m[W 2021-05-19 19:13:16,518][0m Trial 62 failed, because the objective function returned nan.[0m
[33m[W 2021-05-19 19:13:16,578][0m Trial 63 failed, because the objective function returned nan.[0m
[33m[W 2021-05-19 19:13:16,643][0m Trial 64 failed, because the objective function returned nan.[0m
[33m[W 2021-05-19 19:13:16,709][0m Trial 65 failed, because the objective function returned nan.[0m
[33m[W 2021-05-19 19:13:16,777][0m Trial 66 failed, because the objective function returned nan.[0m
[33m[W 2021-05-19 19:13:16,836][0m Trial 67 failed, because the objective function returned nan.[0m
[33m[W 2021-05-19 19:13:16,896][0m Trial 68 failed, because the objective function returned nan.[0m
[33m[W 2021-05-19 19:13:16,958][0m Trial 69 failed, because the objective function returned nan.[0m
[33m[W 2021-05-19 19:13:17,032][0m Trial 70 failed, because the objective function returned nan.[0m
[33m[W 2021-05-19 19:13:17,096][0m Trial 71 failed, because the objecti

Accuracy: 0.9549365145295804
Best hyperparameters: {'boosting_type': 'gbdt', 'max_depth': 9, 'n_estimators': 448, 'learning_rate': 0.07213909648670565, 'min_data_in_leaf': 12}


# With optuna best parameter

 the best accuracy  **91.18993135011442**  in max_depth= 5, n_estimators= 300, num_leaves=70,min_data_in_leaf=60,learning_rate= 0.069

In [38]:

lgb_af_=LGBMClassifier( max_depth= 5
, n_estimators= 300, num_leaves=70,min_data_in_leaf=60,learning_rate= 0.069)
model_end=Pipeline([('preprocess',preprocessor),('classification',lgb_af_)])

In [39]:
model_end.fit(X_,y_)
pred_test_end=model_end.predict(X_test_)



In [40]:
print(f"Accuracy {accuracy_score(y_test_, pred_test_end)*100}")
print(classification_report(y_test_, pred_test_end))
print()
print(confusion_matrix(y_test_, pred_test_end))

Accuracy 64.07322654462243
              precision    recall  f1-score   support

         Bus       0.41      0.21      0.28       164
       Still       0.39      0.91      0.55        76
       Train       0.38      0.99      0.55        86
     Walking       0.95      0.68      0.79       548

    accuracy                           0.64       874
   macro avg       0.53      0.70      0.54       874
weighted avg       0.75      0.64      0.65       874


[[ 35  26  86  17]
 [  2  69   4   1]
 [  1   0  85   0]
 [ 48  81  48 371]]


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=3e7089f3-11a5-48ad-89c4-39a166311a14' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>