# PAMAP2 DataSet - Classification Problem

#### About The Data Set:


The PAMAP2 Physical Activity Monitoring dataset contains data of 18 different physical activities (such as walking, cycling, playing soccer, etc.), performed by 9 subjects wearing 3 inertial measurement units and a heart rate monitor. 

## Imports 

In [189]:
# basic
import numpy as np
import pandas as pd
import csv
import nltk
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()



import seaborn as sns
from matplotlib import pyplot as plt
from IPython.display import display
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder

# split data
from sklearn.preprocessing import MinMaxScaler

## Load the data from the data set

In [190]:
# load_activity_map - get the map of human activities described in the data set

def load_activity_map():
    """Return the mapping from activity id to activity name.

    Returns:
        dict: integer activity id -> activity name string. Id 0 is the
        'transient' label; ids without an entry (8, 14, 15, 21, 22, 23)
        are simply absent. A fresh dict is built on every call.
    """
    return {
        0: 'transient',
        1: 'lying',
        2: 'sitting',
        3: 'standing',
        4: 'walking',
        5: 'running',
        6: 'cycling',
        7: 'Nordic_walking',
        9: 'watching_TV',
        10: 'computer_work',
        11: 'car driving',
        12: 'ascending_stairs',
        13: 'descending_stairs',
        16: 'vacuum_cleaning',
        17: 'ironing',
        18: 'folding_laundry',
        19: 'house_cleaning',
        20: 'playing_soccer',
        24: 'rope_jumping',
    }

In [191]:
# generate_three_IMU - generate the 3 column names (x, y, z) of a 3D sensor

def generate_three_IMU(name):
    """Build the per-axis column names of a three-axis sensor.

    Args:
        name: string prefix shared by the three columns.

    Returns:
        list: [name + '_x', name + '_y', name + '_z'].
    """
    return [name + '_' + axis for axis in ('x', 'y', 'z')]

In [192]:
# generate_four_IMU - generate the 4 column names (x, y, z, w) of a 4D sensor

def generate_four_IMU(name):
    """Build the per-component column names of a four-component sensor.

    Args:
        name: string prefix shared by the four columns.

    Returns:
        list: [name + '_x', name + '_y', name + '_z', name + '_w'].
    """
    return [name + '_' + component for component in ('x', 'y', 'z', 'w')]

In [193]:
# generate_cols_IMU - for a given name, generate the 17 column names of one IMU as described in the data set (temperature, acceleration 16, acceleration 6, gyroscope, magnetometer and orientation)

def generate_cols_IMU(name):
    """Build the ordered column names of a single IMU.

    The layout is: one temperature column, then four 3-axis sensors
    (acceleration 16, acceleration 6, gyroscope, magnetometer) and finally
    the 4-component orientation, i.e. 1 + 4 * 3 + 4 = 17 names.

    Args:
        name: string prefix identifying the IMU (for example 'hand').

    Returns:
        list: the 17 column names, all prefixed with `name`.
    """
    columns = [name + '_temperature']

    # Sensors with x/y/z components, in the order they appear in the file.
    three_axis_suffixes = (
        '_3D_acceleration_16',
        '_3D_acceleration_6',
        '_3D_gyroscope',
        '_3D_magnetometer',
    )
    for suffix in three_axis_suffixes:
        columns.extend(generate_three_IMU(name + suffix))

    # Orientation carries four components (x, y, z, w).
    columns.extend(generate_four_IMU(name + '_4D_orientation'))
    return columns

In [194]:
# load_IMU - build the names of all 54 columns of a subject file in the data set (3 general columns + 17 per IMU for hand, chest and ankle)

def load_IMU():
    """Build the full ordered list of column names of a subject file.

    Returns:
        list: 'time_stamp', 'activity_id', 'heart_rate', followed by the
        column names produced by generate_cols_IMU for 'hand', 'chest'
        and 'ankle', in that order.
    """
    columns = ['time_stamp', 'activity_id', 'heart_rate']
    for location in ('hand', 'chest', 'ankle'):
        columns.extend(generate_cols_IMU(location))
    return columns

In [195]:
#load_subjects - load all subjects from a given root

def load_subjects(root='C:/Users/moria/Desktop/Project-PAMAP2/Data/Protocol/subject',
                  subject_ids=range(101, 110)):
    """Load the recordings of several subjects into one DataFrame.

    Each file is expected at ``root + str(subject_id) + '.dat'`` and is
    parsed as whitespace-separated values without a header row.

    Args:
        root: path prefix of the subject files (directory plus the common
            file-name prefix).
        subject_ids: iterable of integer subject ids to load. Defaults to
            101..109, the range used by the original notebook.

    Returns:
        pandas.DataFrame: all rows of all subjects with the column names
        from load_IMU() plus an 'id' column holding the subject id, indexed
        0..n-1. An empty DataFrame is returned when `subject_ids` is empty.

    Raises:
        FileNotFoundError: if one of the subject files does not exist.
        ValueError: if a file does not have as many columns as load_IMU()
            returns names.
    """
    cols = load_IMU()

    # Collect the per-subject frames and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and re-copies the whole
    # accumulated frame on every iteration.
    frames = []
    for subject_id in subject_ids:
        path = root + str(subject_id) + '.dat'
        # Raw string so that '\s' reaches the regex engine unchanged.
        subject = pd.read_table(path, header=None, sep=r'\s+')
        subject.columns = cols
        subject['id'] = subject_id
        frames.append(subject)

    if not frames:
        # pd.concat raises on an empty list; keep the old "empty frame" result.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [None]:
# Load every subject file from the default root path into one DataFrame.
data = load_subjects()

In [None]:
# Bare expression: the notebook displays the loaded DataFrame.
data

#### To optimize our data:

1. We will fill every NaN value by interpolation, and replace any NaN that remains with the mean value of its column.

2. We will note from the map that activity_id = 0 is not a valid activity, so those rows are removed.

In [None]:
def fix_data(data):
    """Remove invalid rows and fill missing values.

    Steps, in order:
      1. drop every row whose 'activity_id' is 0 (not a valid activity);
      2. fill NaN values by interpolation along each column;
      3. replace any NaN that is still left (for example leading NaNs that
         interpolation cannot reach) with the mean of its column.

    Args:
        data: pandas.DataFrame with an 'activity_id' column.

    Returns:
        pandas.DataFrame: a new, cleaned frame that keeps the original
        index labels of the surviving rows. The input frame is not modified.
    """
    # A boolean mask selects exactly the rows to keep, independent of the
    # index labels.
    cleaned = data[data['activity_id'] != 0]
    cleaned = cleaned.interpolate()

    # One vectorised fill; fillna aligns the per-column means by column name.
    cleaned = cleaned.fillna(cleaned.mean(numeric_only=True))
    return cleaned

In [None]:
# Replace the raw frame with its cleaned version.
data = fix_data(data)

In [None]:
# Summary statistics of the cleaned frame, displayed by the notebook.
data.describe()

### Split data to Train & Test

In [None]:
def split_train_test(data, test_ids=(107, 108)):
    """Split the data by subject into scaled train and test sets.

    All rows whose 'id' is in `test_ids` form the test set; every other row
    forms the train set, so no subject appears on both sides.

    Args:
        data: pandas.DataFrame with 'id', 'activity_id' and 'time_stamp'
            columns; every remaining column is used as a feature.
        test_ids: subject ids held out for testing. Defaults to (107, 108).

    Returns:
        tuple: (X_train, X_test, y_train, y_test) as numpy arrays. The X
        arrays are min-max scaled with statistics of the train set only;
        the y arrays hold the raw 'activity_id' values.
    """
    # One mask for both sides guarantees the two sets are disjoint.
    is_test = data['id'].isin(test_ids)
    test = data[is_test]
    train = data[~is_test]

    # The subject id, the label and the time stamp are not features.
    non_feature_cols = ['id', 'activity_id', 'time_stamp']
    X_train = train.drop(non_feature_cols, axis=1).values
    X_test = test.drop(non_feature_cols, axis=1).values

    # Fit the scaler on the train set ONLY and reuse it for the test set;
    # fitting on the test set would leak test statistics into the model.
    # Scaled test values may therefore fall slightly outside [0, 1].
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    y_train = train['activity_id'].values
    y_test = test['activity_id'].values
    return X_train, X_test, y_train, y_test

In [None]:
# Build the train/test arrays and report their shapes as a sanity check.
X_train, X_test, y_train, y_test = split_train_test(data)
print('Train shape X :',X_train.shape,' y ', y_train.shape)
print('Test shape X :',X_test.shape,' y ', y_test.shape)

### Naive solution

In [None]:
def base_line_preprocess(data):
    """Build the baseline data set: one mean feature row per activity.

    Args:
        data: pandas.DataFrame with 'activity_id', 'id' and 'time_stamp'
            columns; every remaining column is treated as a feature.

    Returns:
        tuple: (X_base, y_base). X_base holds the per-activity column means
        scaled to [0, 1] with a MinMaxScaler; y_base holds the matching
        activity ids as float64, in the same (sorted) group order.
    """
    # Grouping makes 'activity_id' the index of the mean frame, so only the
    # two remaining non-feature columns have to be removed.
    activity_means = data.groupby('activity_id').mean()
    feature_means = activity_means.drop(['id', 'time_stamp'], axis=1).values

    X_base = MinMaxScaler().fit(feature_means).transform(feature_means)
    y_base = activity_means.index.values.astype('float64')
    return X_base, y_base

In [None]:
# Build the per-activity baseline arrays and report their shapes.
X_base, y_base = base_line_preprocess(data)
print('X base shape: ', X_base.shape)
print('y base shape: ', y_base.shape)

# Model

In [None]:
# Model hyper-parameters.

# Number of input features = number of COLUMNS of the design matrix.
# shape[0] is the number of rows (samples) and would make the placeholder
# and the weight matrix incompatible with the data.
features = X_test.shape[1]
# Number of output classes; matches the 18 entries of load_activity_map()
# other than id 0.
labels = 18
learning_rate = 1/1000
training_epoch = 1000
batch_size = 5
# Print progress every `display_step` epochs (the name the training loop reads).
display_step = 1
# Original, misspelled name kept as an alias for backward compatibility.
displey_set = display_step

### SoftMax

In [None]:
# Graph inputs: a batch of feature rows and a batch of label rows.
# The leading None leaves the batch size open.
x = tf.placeholder(dtype=tf.float32, shape=[None, features])
y = tf.placeholder(dtype=tf.float32, shape=[None, labels])

# Trainable weight matrix (features x labels), initialised to zero.
w = tf.Variable(initial_value=tf.zeros(shape=[features, labels]))

# Trainable bias vector (one entry per label), initialised to zero.
b = tf.Variable(initial_value=tf.zeros(shape=[labels]))

In [None]:
# Linear scores of every class; softmax turns them into probabilities.
logits = tf.matmul(x, w) + b
pred = tf.nn.softmax(logits)

# loss function - mean cross-entropy over the batch.
# Computing it from the logits with the fused op avoids log(0) = -inf / NaN
# when a predicted probability underflows, which -y * log(softmax(...)) is
# prone to. The per-sample entropy is summed over classes and then averaged
# over samples, so the value is `labels` times larger than the former mean
# over all matrix elements.
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=logits))

# One gradient-descent step on the loss.
update = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

### Train

In [None]:
def accuracy(pred, labels):
    """Return the classification accuracy in percent.

    Args:
        pred: 2-D array of per-class scores, one row per sample.
        labels: 2-D one-hot (or score) array of the same shape as `pred`.

    Returns:
        float: percentage (0..100) of rows whose arg-max class in `pred`
        equals the arg-max class in `labels`; 0.0 for an empty batch.
    """
    n_samples = pred.shape[0]
    if n_samples == 0:
        # Avoid a division by zero on an empty batch.
        return 0.0
    correct_pred = np.sum(np.argmax(pred, 1) == np.argmax(labels, 1))
    # Divide the number of hits (not the prediction matrix) by the row count.
    return float(100.0 * correct_pred / n_samples)

In [None]:
# Full-batch gradient descent (BGD) on the TRAINING data.

# The label placeholder expects one-hot rows, while y_train holds raw
# activity ids. Map every id to its position among the valid activity ids
# (the activity map without id 0) and pick the matching identity row.
# Assumes every id in y_train is a key of load_activity_map() and that
# `labels` equals the number of valid ids.
class_ids = np.array(sorted(k for k in load_activity_map() if k != 0))
y_train_one_hot = np.eye(labels, dtype=np.float32)[
    np.searchsorted(class_ids, y_train)]

# The context manager releases the session's resources when the cell ends.
with tf.Session() as sess:

    # init
    init = tf.global_variables_initializer()
    sess.run(init)

    for i in range(training_epoch):
        # Train on the train split; fitting on the test split would make
        # every later test metric meaningless.
        sess.run(update, feed_dict={x: X_train, y: y_train_one_hot})  # BGD

In [None]:
# Mini-batch training of the softmax model, with periodic test evaluation.

# The label placeholder expects one-hot rows, while y_train / y_test hold
# raw activity ids. Valid ids are the keys of the activity map except 0.
# Assumes every id in the data is one of them and that `labels` equals
# their count.
class_ids = np.array(sorted(k for k in load_activity_map() if k != 0))


def to_one_hot(activity_ids):
    """Return a (len(activity_ids), labels) float32 one-hot matrix."""
    return np.eye(labels, dtype=np.float32)[
        np.searchsorted(class_ids, activity_ids)]


y_train_one_hot = to_one_hot(y_train)
y_test_one_hot = to_one_hot(y_test)

# Build the evaluation ops ONCE, outside the loops; creating them inside
# the epoch loop would add new nodes to the graph on every epoch.
correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

with tf.Session() as sess:

    # run to init
    init = tf.global_variables_initializer()
    sess.run(init)

    # Trailing rows that do not fill a whole batch are skipped.
    total_batch = len(X_train)//batch_size

    # training
    for epoch in range(training_epoch):
        avg_cost = 0.0
        avg_acc = 0.0

        for i in range(total_batch):
            # Consecutive, non-overlapping windows of `batch_size` rows.
            start = i * batch_size
            end = start + batch_size
            batch_x = X_train[start:end]
            batch_y = y_train_one_hot[start:end]  # labels, not features

            _, c, batch_acc = sess.run([update, loss, acc],
                                       feed_dict={x: batch_x, y: batch_y})
            avg_cost += c/total_batch
            # `acc` is a fraction; report percentages in the epoch log.
            avg_acc += 100.0 * batch_acc/total_batch

        # `displey_set` is the (misspelled) name the hyper-parameter cell
        # defines for the display interval.
        if (epoch+1)%displey_set==0:
            tc, ta = sess.run([loss, acc],
                              feed_dict={x: X_test, y: y_test_one_hot})

            print("Epoch: {:2.0f} - Cost: {:1.5f} - Acc: {:0.5f} - TestCost: {:0.5f} - TestAcc{:0.5f}".format(epoch+1, avg_cost,avg_acc, tc, 100.0 * ta))

    # Reached only after the last epoch.
    print("Optimization Finnished")

    # test model: fraction of correctly classified test rows
    print("Test Accuracy: ", acc.eval({x: X_test, y: y_test_one_hot}))