In [1]:
import time
from IPython.display import clear_output
import numpy    as np
import pandas   as pd
import joblib
import optuna
from random import shuffle
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.pipeline import Pipeline ,make_pipeline
from sklearn import impute, compose,set_config
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import accuracy_score, balanced_accuracy_score, plot_confusion_matrix,roc_auc_score
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import MinMaxScaler,Normalizer,PowerTransformer,QuantileTransformer, RobustScaler,StandardScaler,LabelEncoder, OneHotEncoder,OrdinalEncoder
from sklearn.tree          import DecisionTreeClassifier
from sklearn.linear_model  import LogisticRegression    
from xgboost               import XGBClassifier
from lightgbm              import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import sklearn
set_config(display='diagram') # Display estimators and pipelines as diagrams in the notebook
print("Pandas  ", pd.__version__)
print("Sklearn ", sklearn.__version__) # Prefer scikit-learn 0.24.x, the version this notebook was run with



Pandas   1.2.4
Sklearn  0.24.2


In [None]:
# Load the dataset from the local data directory and preview its first five rows.
# NOTE(review): the file name contains a space before ".csv" — confirm it matches the file on disk.
data = pd.read_csv('./data/data_5secondWindow .csv')
data.head(5)

In [None]:
# Summary statistics of the loaded frame.
data.describe()

In [None]:
# Number of rows for each value of the `target` column.
data.groupby('target').size()

In [None]:
# Keep every row whose target is neither 'Bus' nor 'Train'.
data = data[~data['target'].isin(['Bus', 'Train'])]

In [None]:
def change_name(df):
    """Return ``df`` with '.' and '#' in every column name replaced by '_'.

    A single ``rename`` call builds the new frame, so the caller's frame is
    left untouched (the previous version renamed one column at a time with
    ``inplace=True``, mutating its argument).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names are strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same data and sanitized column names.
    """
    return df.rename(columns=lambda col: col.replace('.', '_').replace('#', '_'))
# Replace '.' and '#' in the column names with '_'.
data=change_name(data)

In [None]:
def split_data(df, test_user='IvanHeibi'):
    """Split ``df`` into train and test frames by holding out whole users.

    Splitting by user (rather than by row) keeps all rows of a held-out user
    out of the training set.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``user`` column.
    test_user : str or iterable of str, default 'IvanHeibi'
        User name, or collection of user names, whose rows form the test set.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        ``test`` holds the rows of the held-out user(s); ``train`` holds all
        remaining rows.
    """
    held_out = [test_user] if isinstance(test_user, str) else list(test_user)
    is_test = df['user'].isin(held_out)
    return df[~is_test], df[is_test]

In [None]:
# Hold out one user's rows as the test set; all remaining rows form the training set.
df_train,df_test=split_data(data)


In [None]:
def clean_data(df_train, df_test, drop_cols=('id', 'user', 'time'), max_missing=0.75):
    """Drop identifier columns and columns that are mostly missing.

    A column is removed from BOTH frames when its fraction of missing values
    is strictly greater than ``max_missing`` in either frame, so the two
    outputs always keep the same columns.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames with identical columns, including every name in ``drop_cols``.
    drop_cols : iterable of str, default ('id', 'user', 'time')
        Columns removed unconditionally.
    max_missing : float, default 0.75
        Missing-value fraction above which a column is dropped.

    Returns
    -------
    (df_train, df_test) : tuple of pandas.DataFrame
        New frames; the inputs are not modified.

    Raises
    ------
    KeyError
        If a name in ``drop_cols`` is absent from either frame.
    """
    drop_cols = list(drop_cols)
    df_train = df_train.drop(columns=drop_cols)
    df_test = df_test.drop(columns=drop_cols)

    # The mean of the boolean null mask is the per-column fraction of missing values.
    train_frac = df_train.isnull().mean()
    test_frac = df_test.isnull().mean()

    sparse_cols = list(
        set(train_frac.index[train_frac > max_missing])
        | set(test_frac.index[test_frac > max_missing])
    )
    return df_train.drop(columns=sparse_cols), df_test.drop(columns=sparse_cols)

In [None]:
 # Rebinds df_train/df_test to their cleaned versions. Re-running this cell raises
 # KeyError, because 'id', 'user' and 'time' have already been dropped.
 # NOTE(review): the statement starts with a stray leading space; IPython tolerates it,
 # but it is presumably an IndentationError if the notebook is exported to a .py script.
 df_train,df_test=clean_data(df_train,df_test)

In [None]:
# Build a small sample of the test data: 5 random rows from each class.
d1=df_test[df_test['target']=='Still']
d2=df_test[df_test['target']=='Car']
d3=df_test[df_test['target']=='Walking']
# A fixed random_state makes Restart & Run All draw the same rows every time.
d1_=d1.sample(5, random_state=10)
d2_=d2.sample(5, random_state=10)
d3_=d3.sample(5, random_state=10)
# Concatenate the three sampled subsets. The previous version passed the full
# `d3` frame here instead of the 5-row `d3_`.
sample=pd.concat([d1_,d2_,d3_])

In [None]:
# Write the cleaned test frame to disk without the index column.
# NOTE(review): the file is named 'sample1.csv' but the full `df_test` is written, not the
# `sample` frame built in the previous cell — confirm which one is intended.
df_test.to_csv('sample1.csv',index=False)

In [None]:
# Show the (rows, columns) shape of the train and test frames, one per line.
print(df_train.shape, df_test.shape, sep='\n')

In [None]:
# Separate the feature columns from the `target` label for both splits.
X, y = df_train.drop(columns='target'), df_train['target']
X_test, y_test = df_test.drop(columns='target'), df_test['target']

# Create pipeline

In [None]:
# Every feature column of X goes through the same numeric pipeline:
# median imputation followed by min-max scaling to the range [0, 20].
num_var = list(X.columns)
num_pip = Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='median')),
    ('scalar', MinMaxScaler(feature_range=(0, 20))),
    # Optional extra step, currently disabled: ('PCA', PCA(n_components=20))
])
preprocessor = compose.ColumnTransformer(
    transformers=[('num', num_pip, num_var)],
    remainder='drop',
)
preprocessor

In [None]:
# Candidate estimators, keyed by the name shown in the results table.
models = {
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "KNeighborsClassifier": KNeighborsClassifier(n_neighbors=6),
    "LogisticRegression": LogisticRegression(),
    "RandomForestClassifier": RandomForestClassifier(),
    "GradientBoostingClassifier": GradientBoostingClassifier(),
    "XGBClassifier": XGBClassifier(),
    "LGBMClassifier": LGBMClassifier(),
    "GaussianNB": GaussianNB(),
}

# Put the preprocessing in front of each estimator. Every pipeline receives the
# same `preprocessor` object, not a copy of it.
classifiers = dict(
    (label, make_pipeline(preprocessor, estimator))
    for label, estimator in models.items()
)
classifiers["LGBMClassifier"]

# Compare candidate models on a validation split

In [None]:
# Hold out 20% of the training rows, stratified by class, to compare the candidate models.
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=10)

# Collect one record per model and build the frame once. DataFrame.append is
# deprecated since pandas 1.4 and removed in 2.0, and it copies the whole frame on every call.
records = []
for name, model in classifiers.items():
    start_time = time.time()
    model.fit(x_train, y_train)
    total_time = time.time() - start_time  # fit time only; prediction is not timed
    valid_pred = model.predict(x_val)
    records.append({"Model":    name,
                    "Accuracy": accuracy_score(y_val, valid_pred)*100,
                    "Bal Acc.": balanced_accuracy_score(y_val, valid_pred)*100,
                    "Time":     total_time})
results = pd.DataFrame(records, columns=['Model', 'Accuracy', 'Bal Acc.', 'Time'])

# Rank by accuracy, number the rows from 1, and draw the scores as in-cell bars.
results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')


# Best model with Optuna-tuned parameters

In [None]:
# Fixed hyperparameters for the final model.
# NOTE(review): presumably copied from a previous run of the Optuna search in this notebook — confirm.
best_model = LGBMClassifier(max_depth=5, n_estimators=270, num_leaves=20, min_data_in_leaf=80, learning_rate=0.069)
name = 'LGBMClassifier'
clf = Pipeline([('pre', preprocessor), ('classification', best_model)])

# Fit on the full training data and time the fit only.
start_time = time.time()
clf.fit(X, y)
total_time = time.time() - start_time
pred = clf.predict(X_test)

# Build the one-row result frame directly. DataFrame.append is deprecated since
# pandas 1.4 and removed in 2.0.
test_result = pd.DataFrame([{"Model":    name,
                             "Accuracy": accuracy_score(y_test, pred)*100,
                             "Bal Acc.": balanced_accuracy_score(y_test, pred)*100,
                             "Time":     total_time}],
                           columns=['Model', 'Accuracy', 'Bal Acc.', 'Time'])
test_result_ord = test_result.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
test_result_ord.index += 1
test_result_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')


In [None]:
# Classification report for the test predictions, a blank line, then the confusion matrix.
print(classification_report(y_test, pred), confusion_matrix(y_test, pred), sep='\n\n')

# Use Optuna to find the best hyperparameters for the selected model

In [None]:
def objective(trial):
    """Optuna objective: mean 2-fold cross-validated accuracy of a LightGBM pipeline.

    Reads the notebook-level ``X``, ``y`` and ``preprocessor``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean accuracy over the 2 cross-validation folds.
    """
    params = {
        "boosting_type": trial.suggest_categorical("boosting_type", ["gbdt", "rf"]),
        "max_depth": trial.suggest_int('max_depth', 1, 32),
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 500),
    }

    # LightGBM's random-forest mode refuses to train unless bagging is enabled
    # (bagging_fraction < 1 and bagging_freq > 0). Without these two settings every
    # "rf" trial fails inside cross_val_score and scores NaN.
    if params["boosting_type"] == "rf":
        params["subsample"] = trial.suggest_float("subsample", 0.5, 0.95)
        params["subsample_freq"] = trial.suggest_int("subsample_freq", 1, 5)

    # Score the same preprocessing + classifier pipeline that is used for the final
    # model, so the tuned parameters match what is deployed. cross_val_score clones the
    # pipeline per fold, so the shared `preprocessor` object is not refitted in place.
    candidate = make_pipeline(preprocessor, LGBMClassifier(**params))
    scores = cross_val_score(candidate, X, y, n_jobs=-1, cv=2)
    return scores.mean()

# Seed the sampler so that Restart & Run All proposes the same sequence of trials.
# TPESampler is Optuna's default sampler, so only the seed changes.
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=10))
study.optimize(objective, n_trials=100)

# Report the best trial found by the search.
trial = study.best_trial
print('Accuracy: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

# Save model and preprocessor 

In [38]:
# `X_` was never defined anywhere in the notebook, so this cell raised NameError on a
# fresh kernel. Define it as the preprocessed training features, so the saved model is
# trained on exactly what the saved preprocessor produces.
# NOTE(review): assumes `X_` was meant to be the preprocessed `X` — confirm.
X_ = preprocessor.fit_transform(X)
best_model.fit(X_,y)
# The model and the preprocessor are saved separately; consumers must apply the
# preprocessor before calling the model.
joblib.dump(best_model, 'model1.x')
joblib.dump(preprocessor, 'preprocessor1.x')



['preprocessor1.x']