In [1]:
import time
import warnings

# Installed before the third-party imports so import-time warnings are silenced as well.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import sklearn
from sklearn import pipeline, ensemble
from sklearn import impute
from sklearn import compose
from sklearn import set_config
from sklearn.base import clone
from sklearn.experimental import enable_hist_gradient_boosting  # Necessary for HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score, plot_confusion_matrix, roc_auc_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, PowerTransformer, QuantileTransformer, RobustScaler, StandardScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
# from catboost import CatBoostClassifier
# Render estimators and pipelines as HTML diagrams when they are the last expression of a cell.
set_config(display='diagram')

# Print the library versions in use; this notebook was written against sklearn 0.24.
for label, module in (("Pandas  ", pd), ("Sklearn ", sklearn)):
    print(label, module.__version__)



Pandas   1.2.4
Sklearn  0.24.2


In [4]:
# Load the dataset; the CSV is expected in the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='pca_dataset.csv')

In [5]:
# Preview the first rows to sanity-check the loaded columns before splitting features and label.
data.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,target
0,-1.873084,1.610167,-1.300423,-2.529103,0.607226,0.28336,Still
1,0.426116,-1.246284,-1.366538,0.882866,-0.061855,-1.273449,Car
2,-1.355057,-0.639898,-0.71617,-1.412142,0.263994,-0.002448,Still
3,-0.048081,-1.339383,-1.248335,1.04499,0.076059,-0.758765,Car
4,-0.691588,1.253835,-1.020233,-0.6336,-0.093897,0.55604,Car


In [8]:
# Split the frame into the class label (`target`) and the feature matrix (every other column).
y = data['target']
X = data.drop(columns=['target'])

# Every remaining column is routed through the numeric preprocessing branch.
num_features = X.columns

In [16]:
# Numeric branch: replace missing values with the column mean, then standardise each column.
# `fill_value` is only consulted when strategy='constant', so it is deliberately not passed here.
num_4_Models = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='mean')),
    ('scalar', StandardScaler()),
])

# Route every feature column through the numeric branch.
# remainder='passthrough' forwards any column NOT listed in a transformer unchanged
# (use remainder='drop' to discard such columns instead). Since `num_features` is taken
# from all of X's columns, no column is left over for the remainder here.
preprocessor = compose.ColumnTransformer(
    transformers=[
        ('num', num_4_Models, num_features),
    ],
    remainder='passthrough',
)
preprocessor

In [17]:
# Seed for every stochastic estimator so the comparison table is reproducible run-to-run.
SEED = 10  # same value as the random_state used for the train/test split

# Candidate estimators keyed by the name shown in the results table.
# Hyper-parameters are library defaults except where stated.
base_estimators = {
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=SEED),
    "KNeighborsClassifier": KNeighborsClassifier(n_neighbors=6),
    "LogisticRegression": LogisticRegression(),
    "RandomForestClassifier": ensemble.RandomForestClassifier(random_state=SEED),
    "AdaBoostClassifier": ensemble.AdaBoostClassifier(random_state=SEED),
    "GradientBoostingClassifier": ensemble.GradientBoostingClassifier(random_state=SEED),
    "XGBClassifier": XGBClassifier(random_state=SEED),
    "LGBMClassifier": LGBMClassifier(random_state=SEED),
}

# Wrap each estimator in a pipeline with its OWN copy of the preprocessor.
# make_pipeline does not clone its steps, so passing `preprocessor` directly would make
# every pipeline share (and refit) one transformer object.
classifier_models = {
    name: pipeline.make_pipeline(clone(preprocessor), estimator)
    for name, estimator in base_estimators.items()
}
classifier_models["GradientBoostingClassifier"]

In [18]:

# Stratified 80/20 hold-out split; the fixed seed keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=10
)

# Fit every candidate pipeline and score it on the held-out split.
# Rows are collected in a list and turned into a DataFrame once: DataFrame.append in a loop
# copies the whole frame each iteration and was removed in pandas 2.0.
records = []
for model_name, model in classifier_models.items():
    # perf_counter is monotonic, so the measurement is not affected by system clock changes.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    fit_seconds = time.perf_counter() - start_time  # training time only; prediction is excluded

    test_pred = model.predict(X_test)
    records.append({
        "Model": model_name,
        "Accuracy": accuracy_score(y_test, test_pred) * 100,
        "Bal Acc.": balanced_accuracy_score(y_test, test_pred) * 100,
        "Time": fit_seconds,
    })

# Passing `columns` keeps the expected schema even if no model was evaluated.
results = pd.DataFrame(records, columns=['Model', 'Accuracy', 'Bal Acc.', 'Time'])

# Rank by accuracy (best first) and number the rows from 1 for display.
results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')



Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,RandomForestClassifier,91.178965,91.181753,0.939646
2,XGBClassifier,90.670059,90.672196,3.917173
3,LGBMClassifier,90.585242,90.586008,0.800398
4,KNeighborsClassifier,88.719254,88.717634,0.01239
5,GradientBoostingClassifier,86.259542,86.26181,5.330233
6,DecisionTreeClassifier,85.411366,85.41255,0.048832
7,AdaBoostClassifier,65.394402,65.404616,0.513449
8,LogisticRegression,62.765055,62.78471,0.119329


In [20]:
# Refit the top-ranked model family from the comparison (random forest) and report per-class metrics.
# NOTE: no hyper-parameter search is performed here; the forest uses library defaults.
# The seed makes the reported numbers reproducible.
classifier = ensemble.RandomForestClassifier(random_state=10)

# Use a fresh copy of the preprocessor so fitting this pipeline does not refit the
# transformer object used elsewhere in the notebook.
tuning_model = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', classifier),
])
tuning_model.fit(X_train, y_train)
pred = tuning_model.predict(X_test)

# Pipeline.score for a classifier is plain accuracy; reuse `pred` instead of predicting twice.
print("model score: %.3f" % accuracy_score(y_test, pred))
print(classification_report(y_test, pred))
print()
# Rows are true classes and columns are predicted classes, in sorted label order.
print(confusion_matrix(y_test, pred))

model score: 0.911
              precision    recall  f1-score   support

         Bus       0.90      0.88      0.89       236
         Car       0.89      0.89      0.89       236
       Still       0.93      0.94      0.93       236
       Train       0.91      0.91      0.91       236
     Walking       0.92      0.93      0.93       235

    accuracy                           0.91      1179
   macro avg       0.91      0.91      0.91      1179
weighted avg       0.91      0.91      0.91      1179


[[208  12   2   4  10]
 [ 12 210   5   6   3]
 [  0   3 222  10   1]
 [  5   6   6 215   4]
 [  7   5   4   0 219]]


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=3e7089f3-11a5-48ad-89c4-39a166311a14' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>