# Create Pipeline

In [27]:
import numpy as np
import pandas as pd
import time 
import sklearn
from sklearn import pipeline  ,ensemble 
from sklearn.pipeline import Pipeline 
from sklearn import impute
from sklearn import compose
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, plot_confusion_matrix,roc_auc_score
from sklearn import set_config
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import Normalizer,PowerTransformer,QuantileTransformer, RobustScaler,StandardScaler,LabelEncoder, OneHotEncoder,OrdinalEncoder
from sklearn.tree          import DecisionTreeClassifier
from sklearn.linear_model  import LogisticRegression    
from sklearn.experimental  import enable_hist_gradient_boosting # Necesary for HistGradientBoostingClassifier
from xgboost               import XGBClassifier
from lightgbm              import LGBMClassifier
# from catboost              import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
set_config(display='diagram') # Useful for display the pipeline
print("Pandas  ", pd.__version__)
print("Sklearn ", sklearn.__version__) # Try to use 0.24



Pandas   1.2.4
Sklearn  0.24.2


In [43]:
df=pd.read_csv('data_selected_feature.csv')

In [44]:
df.isna().sum()

id                                                  0
user                                                0
android_sensor_gyroscope_mean                     720
android_sensor_accelerometer_std                    0
android_sensor_gyroscope_std                      721
android_sensor_gyroscope_uncalibrated_mean        799
android_sensor_accelerometer_max                    0
android_sensor_linear_acceleration_mean           494
speed_mean                                          0
android_sensor_rotation_vector_mean               721
android_sensor_rotation_vector_max                721
android_sensor_accelerometer_min                    0
android_sensor_magnetic_field_uncalibrated_min    799
sound_min                                         994
target                                              0
dtype: int64

In [35]:
len(df)

5893

In [34]:
#drop bus, train from the data because we are only classifying the rest for now



In [42]:
df.shape

(5893, 13)

In [21]:
test=df.iloc[]
X=test.drop('target',axis=1)
y=test.target


In [32]:
y

0         Still
1           Car
2         Still
3           Car
4           Car
         ...   
5888    Walking
5889      Train
5890      Still
5891      Still
5892    Walking
Name: target, Length: 5893, dtype: object

In [23]:
num_features=X.columns

In [24]:
num_4_Models = pipeline.Pipeline(steps=[
    ('imputer',impute.SimpleImputer(strategy='most_frequent', fill_value='missing')),
    ('scalar',StandardScaler()),
    ('Normalizer',RobustScaler())
])


preprocessor = compose.ColumnTransformer(transformers=[
    ('num', num_4_Models, num_features),
    # ('cat', cat_4_Models, cat_vars),
], remainder='passthrough') # Drop other vars not specified in num_vars or cat_vars
preprocessor


In [25]:
classifier_models = {
"DecisionTreeClassifier": DecisionTreeClassifier(),
'KNeighborsClassifier':KNeighborsClassifier(6),
'LogisticRegression':LogisticRegression(),
"RandomForestClassifier":ensemble.RandomForestClassifier(),
"AdaBoostClassifier":ensemble.AdaBoostClassifier(),
"GradientBoostingClassifier":ensemble.GradientBoostingClassifier(),
"XGBClassifier":XGBClassifier(),
"LGBMClassifier":LGBMClassifier(),
# "CatBoostClassifier":CatBoostClassifier(),

}
# make pipline with  preprocessing 
classifier_models = {name: pipeline.make_pipeline(preprocessor, model) for name, model in classifier_models.items()}
classifier_models["GradientBoostingClassifier"]

In [30]:
# # x=df_train_m.drop('Survived',axis=1)
# # y=df_train_m['Survived']
# x=imputed_x
# y=df_train['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,stratify = y,random_state=10 )
results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [],'Time': []})
for model_name, model in classifier_models.items():
    start_time = time.time()
    model.fit(x_train, y_train)
    total_time = time.time() - start_time
    valid_pred = model.predict(x_val)
    results = results.append({"Model":    model_name,
                              "Accuracy": accuracy_score(y_val, valid_pred)*100,
                              "Bal Acc.": balanced_accuracy_score(y_val, valid_pred)*100,
                    
                              "Time":     total_time},
                              ignore_index=True)
results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')



Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,DecisionTreeClassifier,100.0,100.0,0.032273
2,RandomForestClassifier,100.0,100.0,0.775448
3,GradientBoostingClassifier,100.0,100.0,7.218642
4,XGBClassifier,100.0,100.0,1.076977
5,LGBMClassifier,100.0,100.0,0.885108
6,LogisticRegression,97.540288,97.532997,0.500671
7,KNeighborsClassifier,96.352841,96.350523,0.024224
8,AdaBoostClassifier,80.067854,80.0,0.649859


In [31]:
#confusion matrix

from sklearn.metrics import classification_report,confusion_matrix
#with the best  paramaters
classifier=DecisionTreeClassifier()
tuning_model=Pipeline(steps=[('preprocessor', preprocessor),('classifier',classifier )])
tuning_model.fit(X_train,y_train)
pred=tuning_model.predict(X_test)
print("model score: %.3f" % tuning_model.score(X_test, y_test))
print(classification_report(y_test, pred))
print()
print(confusion_matrix(y_test, pred))

model score: 1.000
              precision    recall  f1-score   support

         Bus       1.00      1.00      1.00       236
         Car       1.00      1.00      1.00       236
       Still       1.00      1.00      1.00       236
       Train       1.00      1.00      1.00       236
     Walking       1.00      1.00      1.00       235

    accuracy                           1.00      1179
   macro avg       1.00      1.00      1.00      1179
weighted avg       1.00      1.00      1.00      1179


[[236   0   0   0   0]
 [  0 236   0   0   0]
 [  0   0 236   0   0]
 [  0   0   0 236   0]
 [  0   0   0   0 235]]


In [40]:

from sklearn.model_selection import cross_val_score
print(cross_val_score(tuning_model, X, y, cv=5))

[1. 1. 1. 1. 1.]


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=3e7089f3-11a5-48ad-89c4-39a166311a14' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>