In [6]:
from pathlib import Path
import os

# Per-user configuration. STUDENT is the single place the account name is set;
# the data root and the home directory below are both derived from it.
STUDENT = 'mmr497'
DATA_PATH = Path('/local/data') / STUDENT
# Relative path: it resolves against whatever the working directory is when used.
OUTLIERS_PATH = Path('./outliers2/')
INTERMEDIATE_PATH = DATA_PATH / 'intermediate_datafiles'
# Change the working directory before the project-local imports that follow
# (presumably so those modules resolve from the home directory — confirm).
os.chdir(f'/home/{STUDENT}/')
from util.VisualizeDataset import VisualizeDataset
from Visualiser import Visualiser as Viz
from util.util import ignore_actual_time, read_parquet, write_parquet
from FeatureCreator import FeatureCreatorUpdated
from DataLearningLoader import DataLearningLoader
import pandas as pd
from Chapter7.FeatureSelection import FeatureSelectionClassification
from Chapter7.LearningAlgorithms import ClassificationAlgorithms
from AlteredAlgorithms import AlteredAlgorithmsClassifier
from Chapter7.Evaluation import ClassificationEvaluation
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [7]:
# Testing parameters
# feature_selection: when True, the forward feature-selection cell further down runs.
feature_selection = True

In [8]:
# Plotting helpers: the project Visualiser (imported as Viz) and VisualizeDataset,
# which is constructed with the string 'ML2.ipynb' (presumably this notebook's name — confirm).
EasyViz = Viz()
DataViz = VisualizeDataset('ML2.ipynb')

In [9]:
# Load 'ML4QS_imputed_results.parquet' from the intermediate data directory.
cleaned_dataset = read_parquet(INTERMEDIATE_PATH / 'ML4QS_imputed_results.parquet')

In [10]:
# Count the missing values in each column of the loaded dataset.
cleaned_dataset.isna().sum()

id                                        0
acc_phone_X                               0
acc_phone_Y                               0
acc_phone_Z                               0
lin_acc_phone_X                           0
lin_acc_phone_Y                           0
lin_acc_phone_Z                           0
gyr_phone_X                               0
gyr_phone_Y                               0
gyr_phone_Z                               0
location_phone_Latitude                   0
location_phone_Longitude                  0
location_phone_Height                  5754
location_phone_Velocity               14431
location_phone_Direction              15709
location_phone_Horizontal Accuracy        0
location_phone_Vertical Accuracy          0
mag_phone_X                               0
mag_phone_Y                               0
mag_phone_Z                               0
proximity_phone_Distance                  0
labeltram                                 0
labeltrain                      

In [13]:
# Inspect missing values in 'non_fourier_features.parquet'.
test_df = read_parquet(INTERMEDIATE_PATH / 'non_fourier_features.parquet')
nan_counts = test_df.isna().sum()
# Keep only the columns that contain at least one NaN.
nan_cols = nan_counts[nan_counts > 0]
# Print the Series built above (the previous name `non_cols` is not defined
# anywhere in this notebook and fails on a fresh kernel).
print(nan_cols)

location_phone_Latitude                      45012
location_phone_Longitude                     45012
location_phone_Height                        45090
location_phone_Velocity                      45345
location_phone_Direction                     45555
location_phone_Horizontal Accuracy           45012
location_phone_Vertical Accuracy             45012
mag_phone_X                                   1631
mag_phone_Y                                   1631
mag_phone_Z                                   1631
proximity_phone_Distance                     52950
location_phone_Velocity_mean                 20205
location_phone_Velocity_median               20205
location_phone_Velocity_min                  20205
location_phone_Velocity_max                  20205
location_phone_Direction_mean                20465
location_phone_Direction_median              20465
location_phone_Direction_min                 20465
location_phone_Direction_max                 20465
location_phone_Horizontal Accur

In [15]:
# Feature creator and train/test data loader, both pointed at the intermediate data directory.
fcr = FeatureCreatorUpdated(INTERMEDIATE_PATH)
DLL = DataLearningLoader(df_path=INTERMEDIATE_PATH, output_dir=INTERMEDIATE_PATH, verbose=False)

In [16]:
# Build the feature table. With overwrite=False an existing combined_features.parquet
# is loaded instead of being recomputed (see the message in this cell's output).
feature_df = fcr.create_features(cleaned_dataset, overwrite=False)

Combined features already exist at /local/data/mmr497/intermediate_datafiles/combined_features.parquet
Loaded combined features from /local/data/mmr497/intermediate_datafiles/combined_features.parquet
If this was not intended, rerun create_features with overwrite=True


In [18]:
# List the feature-table columns that still contain NaNs, with their NaN counts.
nan_counts = feature_df.isna().sum()
nan_cols = nan_counts[nan_counts > 0]
print(nan_cols)

location_phone_Height               5754
location_phone_Velocity            14431
location_phone_Direction           15709
location_phone_Velocity_mean       14431
location_phone_Velocity_median     14431
location_phone_Velocity_min        14431
location_phone_Velocity_max        14431
location_phone_Direction_mean      15709
location_phone_Direction_median    15709
location_phone_Direction_min       15709
location_phone_Direction_max       15709
session_avg_velocity               14431
session_velocity_std               14431
dtype: int64


In [24]:
# TODO: temporary workaround only — must be removed.
def remove_nan_cols(df):
    """Return a copy of ``df`` without the columns that contain any NaN.

    The names of the dropped columns are printed in their original order.
    The input frame itself is not modified.
    """
    working = df.copy()
    original_columns = working.columns
    missing_per_column = working.isna().sum()
    # Labels of every column holding at least one missing value.
    to_drop = missing_per_column.loc[missing_per_column.gt(0)].index
    working = working.drop(columns=to_drop)
    remaining = working.columns
    removed = []
    for name in original_columns:
        if name not in remaining:
            removed.append(name)
    print('Removed columns: ', removed)
    return working

# Temporary workaround: drop every feature column that contains NaN.
filtered_feature_df = remove_nan_cols(feature_df)

Removed columns:  ['location_phone_Height', 'location_phone_Velocity', 'location_phone_Direction', 'location_phone_Velocity_mean', 'location_phone_Velocity_median', 'location_phone_Velocity_min', 'location_phone_Velocity_max', 'location_phone_Direction_mean', 'location_phone_Direction_median', 'location_phone_Direction_min', 'location_phone_Direction_max', 'session_avg_velocity', 'session_velocity_std']


In [25]:
# Turn the NaN-free feature table into train/test features and labels
# (the loader reports cleaning, label creation and splitting in its output).
X_train, X_test, y_train, y_test = DLL.prepare_data(filtered_feature_df)

Cleaning data...
Creating labels...
Cleaning features...
Splitting data...
bike: Only 2 sessions - 1 train, 1 test


In [28]:
# Optional for testing

def remove_problem_cols(df):
    """Return a copy of ``df`` restricted to numeric, non-timedelta columns.

    The names of the columns that were left out are printed in their
    original order. The input frame itself is not modified.
    """
    # Testing-only workaround, not a proper fix.
    numeric_only = df.copy().select_dtypes(include=[np.number], exclude=['timedelta64[ns]'])
    kept = numeric_only.columns
    dropped = [name for name in df.columns if name not in kept]
    print('Removed columns: ', dropped)
    return numeric_only

# Forward feature selection; only runs when the feature_selection flag is True.
if feature_selection:
    fs = FeatureSelectionClassification()

    # Passed as the first argument to forward_selection (presumably the maximum
    # number of features to select — confirm) and used for the plot's x-axis.
    N_FORWARD_SELECTION = 50

    # Only numeric, non-timedelta columns are handed to the selector.
    X_train_fs = remove_problem_cols(X_train)
    X_test_fs = remove_problem_cols(X_test)

    # NOTE(review): both the train and the test split are passed in — confirm the
    # test split is not what drives feature choice (possible leakage).
    # NOTE(review): `features` is not referenced again in the visible cells.
    features, ordered_features, ordered_scores = fs.forward_selection(N_FORWARD_SELECTION,
                                                                      X_train_fs,
                                                                      X_test_fs,
                                                                      y_train,
                                                                      y_test,
                                                                      gridsearch=True)

    # Score per number of selected features.
    DataViz.plot_xy(x=[range(1, N_FORWARD_SELECTION+1)], y=[ordered_scores],
                xlabel='number of features', ylabel='accuracy')

Removed columns:  ['time_diff', 'shifted_time']
Removed columns:  ['time_diff', 'shifted_time']
Added feature0


KeyboardInterrupt: 

In [29]:
# Setup model
# NOTE(review): CA is created but only alt_CA is used in the visible cells.
CA = ClassificationAlgorithms()
alt_CA = AlteredAlgorithmsClassifier()

# Recomputed here because the feature-selection cell only defines these
# frames when feature_selection is True.
X_train_fs = remove_problem_cols(X_train)
X_test_fs = remove_problem_cols(X_test)

# First MLP
# Returns predictions and class probabilities for the train and test splits.

mlp_train_y, mlp_test_y, mlp_train_prob_y, mlp_test_prob_y = alt_CA.feedforward_neural_network(
            X_train_fs, y_train, X_test_fs, gridsearch=True
)

# Train RF
# Same inputs and same four outputs as the MLP call above.

rf_train_y, rf_test_y, rf_train_prob_y, rf_test_prob_y = alt_CA.random_forest(
    X_train_fs, y_train, X_test_fs, gridsearch=True
)


Removed columns:  ['time_diff', 'shifted_time']
Removed columns:  ['time_diff', 'shifted_time']



KeyboardInterrupt



In [None]:
# Evaluate models

CEVAL = ClassificationEvaluation()

# F1 of the MLP predictions against the training labels.
# NOTE(review): only the training split is scored here; the test-split
# predictions and the random-forest outputs are not evaluated in the visible cells.
CEVAL.f1(y_train, mlp_train_y)