# Machine Learning Methods

In [1]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
import argparse

## 1) Simple examples

### 1. KNN
<font size=2>

Code reference:
    
https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
    
</font>

In [2]:
# Toy data set: three points close to the origin (label 0) and
# three points close to (20, 20, 20) (label 1).
X = np.array([
    [0, 0, 0],
    [0, 0, 1],
    [0, 1, 0],
    [20, 20, 20],
    [19, 19, 19],
    [20, 21, 18],
])
y = np.array([0, 0, 0, 1, 1, 1])

# With a single neighbour, each query simply takes the label of its
# closest training point.
neigh = KNeighborsClassifier(n_neighbors=1)
neigh.fit(X, y)

In [3]:
# Query points located between the two clusters.
knn_queries = [[9, 9, 9], [13, 13, 13], [18, 18, 18]]
print(neigh.predict(knn_queries))        # predicted class labels
print(neigh.predict_proba(knn_queries))  # one row of class probabilities per query

[0 1 1]
[[1. 0.]
 [0. 1.]
 [0. 1.]]


### 2. Random Forest
<font size=2>

Random Forest is a popular machine learning algorithm used for classification tasks. It is an ensemble learning method that combines the predictions of multiple decision trees to make more accurate predictions.

1. Building the forest:
    
The algorithm starts by creating an ensemble of decision trees. The number of trees is a user-defined parameter.

2. Random feature selection:
    
At each node of every decision tree, a random subset of features is considered for splitting. This helps to introduce randomness and reduce the correlation between trees.

3. Growing decision trees:
    
Each decision tree is grown by recursively partitioning the data based on the selected features. The splitting is done based on certain criteria, typically using measures like Gini impurity or entropy, to find the feature and split point that maximize the separation of classes.

4. Voting for predictions:
    
Once all the decision trees are constructed, predictions are made by each tree individually. For classification, the class label that receives the majority of votes from the trees is chosen as the final prediction.

<br>
    
Random Forest has several **advantages**:

- It is robust against overfitting because the randomness introduced during feature selection and tree construction helps to reduce variance.
- It can handle large datasets with a high number of features.
- It provides estimates of feature importance, allowing for insights into the relative significance of different features in the classification task.
    
<br>
    
Code reference:
    
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    
</font>

In [4]:
# Same toy data set as in the KNN example: one cluster near the origin
# (label 0) and one near (20, 20, 20) (label 1).
X = np.array([
    [0, 0, 0],
    [0, 0, 1],
    [0, 1, 0],
    [20, 20, 20],
    [19, 19, 19],
    [20, 21, 18],
])
y = np.array([0, 0, 0, 1, 1, 1])

# Shallow trees (depth <= 2); the fixed random_state makes the run repeatable.
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X, y)

In [5]:
# Query points located between the two clusters.
forest_queries = [[9, 9, 9], [10, 10, 10], [13, 13, 13]]
print(clf.predict(forest_queries))        # predicted class labels
print(clf.predict_proba(forest_queries))  # one row of class probabilities per query

[0 0 1]
[[0.98 0.02]
 [0.52 0.48]
 [0.   1.  ]]


### 3. SVM
<font size=2>

Code reference:
    
https://scikit-learn.org/stable/modules/svm.html
    
</font>

In [6]:
# Same toy data set as in the previous examples: one cluster near the
# origin (label 0) and one near (20, 20, 20) (label 1).
X = np.array([
    [0, 0, 0],
    [0, 0, 1],
    [0, 1, 0],
    [20, 20, 20],
    [19, 19, 19],
    [20, 21, 18],
])
y = np.array([0, 0, 0, 1, 1, 1])

from sklearn import svm

# Support vector classifier with scikit-learn's default settings.
clf = svm.SVC()
clf.fit(X, y)

In [7]:
# Query points located between the two clusters.
svm_queries = [[9, 9, 9], [10, 10, 10], [13, 13, 13]]
print(clf.predict(svm_queries))
# Left disabled: SVC only offers predict_proba when it is constructed
# with probability=True, which is not the case for svm.SVC() above.
# print(clf.predict_proba(svm_queries))

[0 1 1]


## 2) Sample from Captury Live

In [8]:
class Classification():
    """Base class that loads a data set and draws random train/test subsets.

    After construction the instance exposes ``x_train``/``y_train`` and
    ``x_test``/``y_test``; subclasses add a model and ``train``/``test``
    methods that work on these arrays.
    """

    def __init__(self,Dataset_Path,Train_Len,Test_Len):
        """Load the data and build the train/test subsets.

        Args:
            Dataset_Path: sequence whose first item is the path of the
                feature ``.npy`` file and whose second item is the path
                of the label ``.npy`` file.
            Train_Len: number of training samples to draw.
            Test_Len: number of test samples to draw.
        """
        self.get_data(dataset_path=Dataset_Path,train_len=Train_Len,test_len=Test_Len)

    def get_data(self,dataset_path,train_len=10000,test_len=200):
        """Load features/labels from disk and draw disjoint random subsets.

        Samples are drawn WITHOUT replacement from a single random
        permutation of the row indices, so no row appears twice and the
        train and test subsets never share a row.

        Args:
            dataset_path: sequence ``(x_path, y_path)`` of ``.npy`` files.
            train_len: number of training samples to draw.
            test_len: number of test samples to draw.

        Raises:
            ValueError: if the two files hold a different number of
                samples, if a length is negative, or if
                ``train_len + test_len`` exceeds the number of samples.
        """
        # load data
        x_data_path, y_data_path = dataset_path[0], dataset_path[1]
        with open(x_data_path, 'rb') as xf:
            x_data = np.load(xf)
        with open(y_data_path, 'rb') as yf:
            y_data = np.load(yf)

        n_samples = x_data.shape[0]
        if y_data.shape[0] != n_samples:
            raise ValueError(
                f'x data has {n_samples} samples but y data has {y_data.shape[0]}')
        if train_len < 0 or test_len < 0:
            raise ValueError('train_len and test_len must be non-negative')
        if train_len + test_len > n_samples:
            raise ValueError(
                f'train_len + test_len = {train_len + test_len} exceeds the '
                f'{n_samples} available samples')

        # define length for trainset and testset
        self.Train_Len = train_len
        self.Test_Len = test_len

        # One shuffle of all row indices; consecutive slices of it are
        # guaranteed to be duplicate-free and mutually disjoint.
        shuffled = np.random.permutation(n_samples)
        choices_train = shuffled[:train_len]
        choices_test = shuffled[train_len:train_len + test_len]

        # randomly selected train samples
        self.x_train = x_data[choices_train]
        self.y_train = y_data[choices_train]
        print(f'x_train shape: {self.x_train.shape}')
        print(f'y_train shape: {self.y_train.shape}')

        # randomly selected test samples (never overlapping the train samples)
        self.x_test = x_data[choices_test]
        self.y_test = y_data[choices_test]
        print(f'x_test shape: {self.x_test.shape}')
        print(f'y_test shape: {self.y_test.shape}')
        print()
        
class KNN(Classification):
    """k-nearest-neighbours classifier on the subsets prepared by ``Classification``."""

    def __init__(self, N_neighbor, Dataset_Path, Train_Len, Test_Len):
        """Prepare the data subsets, then create the (unfitted) classifier.

        Args:
            N_neighbor: number of neighbours used by the classifier.
            Dataset_Path: forwarded to ``Classification``.
            Train_Len: forwarded to ``Classification``.
            Test_Len: forwarded to ``Classification``.
        """
        super().__init__(Dataset_Path, Train_Len, Test_Len)
        self.neigh = KNeighborsClassifier(n_neighbors=N_neighbor)

    def train(self):
        """Fit the classifier on the training subset."""
        features, labels = self.x_train, self.y_train
        self.neigh.fit(features, labels)

    def test(self):
        """Store class probabilities in ``P_pred`` and labels in ``T_pred`` for the test subset."""
        model, samples = self.neigh, self.x_test
        self.P_pred = model.predict_proba(samples)
        self.T_pred = model.predict(samples)
        
class RandomForest(Classification):
    """Random-forest classifier on the subsets prepared by ``Classification``."""

    def __init__(self, Max_Depth, Random_State, Dataset_Path, Train_Len, Test_Len):
        """Prepare the data subsets, then create the (unfitted) forest.

        Args:
            Max_Depth: passed to the forest as ``max_depth``.
            Random_State: passed to the forest as ``random_state``.
            Dataset_Path: forwarded to ``Classification``.
            Train_Len: forwarded to ``Classification``.
            Test_Len: forwarded to ``Classification``.
        """
        super().__init__(Dataset_Path, Train_Len, Test_Len)
        self.random_forest = RandomForestClassifier(
            max_depth=Max_Depth,
            random_state=Random_State,
        )

    def train(self):
        """Fit the forest on the training subset."""
        features, labels = self.x_train, self.y_train
        self.random_forest.fit(features, labels)

    def test(self):
        """Store class probabilities in ``P_pred`` and labels in ``T_pred`` for the test subset."""
        model, samples = self.random_forest, self.x_test
        self.P_pred = model.predict_proba(samples)
        self.T_pred = model.predict(samples)
        
class SVM(Classification):
    """Support-vector classifier on the subsets prepared by ``Classification``."""

    def __init__(self, Dataset_Path, Train_Len, Test_Len):
        """Prepare the data subsets, then create the (unfitted) ``svm.SVC``.

        Args:
            Dataset_Path: forwarded to ``Classification``.
            Train_Len: forwarded to ``Classification``.
            Test_Len: forwarded to ``Classification``.
        """
        super().__init__(Dataset_Path, Train_Len, Test_Len)
        self.svm = svm.SVC()

    def train(self):
        """Fit the classifier on the training subset."""
        features, labels = self.x_train, self.y_train
        self.svm.fit(features, labels)

    def test(self):
        """Store predicted labels for the test subset in ``T_pred`` (no probabilities)."""
        samples = self.x_test
        self.T_pred = self.svm.predict(samples)

In [77]:
if __name__ == '__main__':
    # Entry point: parse the settings, build the selected classifier, train it
    # on a random subset of the data set and report the result on a test subset.

    # Known (features file, labels file) pairs.
    dataset_choices = [
        ['x_data_UpperLowerBody.npy','y_data_UpperLowerBody.npy'],
        ['x_data_UpperBody.npy','y_data_UpperBody.npy'],
    ]

    ###### get parameters ######
    parser = argparse.ArgumentParser(description='Machine learning method on classification of human activities from skeleton data')
    # general arguments
    # NOTE: `type=list` would split a command-line string into single
    # characters, so the option could never match one of the allowed pairs.
    # Two string values are read instead and the pair is validated below.
    parser.add_argument('--dataset_path', type=str, nargs=2, metavar=('X_PATH', 'Y_PATH'),
                        default=dataset_choices[0], help='path of dataset')
    parser.add_argument('--model', type=str, default='SVM', choices=['KNN','RandomForest','SVM'])
    parser.add_argument('--train_len', type=int, default=10, help='length of train set')
    parser.add_argument('--test_len', type=int, default=100, help='length of test set')
    # for KNN
    parser.add_argument('--n_neighbor', type=int, default=1, help='number of neighbours, only for KNN')
    # for RandomForest
    parser.add_argument('--max_depth', type=int, default=2, help='max depth for random forest')
    parser.add_argument('--random_state', type=int, default=0, help='random state for random forest')

    # An empty argument list is parsed on purpose so that only the defaults
    # above are used (the notebook kernel's own argv is ignored).
    args = parser.parse_args([])

    # Keep the original restriction to the known file pairs.
    if list(args.dataset_path) not in dataset_choices:
        parser.error(f'--dataset_path must be one of {dataset_choices}')

    if args.model == 'KNN':
        cls_model = KNN(N_neighbor=args.n_neighbor,
                        Dataset_Path=args.dataset_path,
                        Train_Len=args.train_len,Test_Len=args.test_len)

    elif args.model == 'RandomForest':
        cls_model = RandomForest(Max_Depth=args.max_depth,Random_State=args.random_state,
                                 Dataset_Path=args.dataset_path,
                                 Train_Len=args.train_len,Test_Len=args.test_len)

    elif args.model == 'SVM':
        cls_model = SVM(Dataset_Path=args.dataset_path,
                        Train_Len=args.train_len,Test_Len=args.test_len)

    cls_model.train()
    cls_model.test()

    # Element-wise comparison of predictions and ground truth.
    is_correct = cls_model.T_pred == cls_model.y_test

    print(f'Result on {args.model}:')
    print(f'predicted target: {cls_model.T_pred}')
    # Only KNN and RandomForest store class probabilities in P_pred.
    if args.model in ('KNN', 'RandomForest'):
        print(f'probability of predicted target: {cls_model.P_pred}')
    print(f'true target: {cls_model.y_test}')
    print(f'Accuracy = {np.sum(is_correct) / len(cls_model.T_pred)}')
    print(f'Result: {is_correct}')

x_train shape: (10, 50)
y_train shape: (10,)
x_test shape: (100, 50)
y_test shape: (100,)

Result on SVM:
predicted target: [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
true target: [2 4 2 3 4 3 4 1 5 5 5 5 4 4 3 5 2 5 2 5 1 2 5 3 4 1 3 1 1 2 1 5 4 2 3 4 2
 5 3 3 3 2 1 1 4 3 3 5 3 4 4 3 5 4 4 5 3 5 4 1 5 4 5 5 4 3 2 5 3 1 2 2 1 2
 1 3 4 2 2 3 1 5 1 5 2 2 5 4 3 1 3 2 4 3 3 3 1 5 2 1]
Accuracy = 0.19
Result: [ True False  True False False False False False False False False False
 False False False False  True False  True False False  True False False
 False False False False False  True False False False  True False False
  True False False False False  True False False False False False False
 False False False False False False False False False False False False
 False False False False False False  True False False False  True