In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import numpy as np
from pandas.core.frame import DataFrame
from sklearn.model_selection import train_test_split
import random

In [2]:
df = pd.read_csv('the_whole_data.csv') #read the previous dataframe

In [3]:
df = df[['mean_hori', 'mean_ver', 'median_hori', 'median_ver', 'state']]

In [4]:
df #check the whole data which will be used in classification

Unnamed: 0,mean_hori,mean_ver,median_hori,median_ver,state
0,0.006363,0.001671,0.003,0.014,0
1,-0.009000,0.006691,-0.013,0.012,0
2,-0.006220,-0.008295,-0.011,0.005,0
3,-0.005819,-0.001751,0.002,0.009,0
4,-0.002016,0.006631,-0.009,0.002,0
...,...,...,...,...,...
13020,-0.039199,0.003162,0.008,-0.008,2
13021,-0.029962,-0.002062,0.022,-0.023,2
13022,-0.056482,0.050883,-0.036,-0.032,2
13023,0.013096,0.028659,0.059,-0.050,2


In [5]:
df['state'].value_counts() #check the distribution of data

1    9116
0    1959
2    1950
Name: state, dtype: int64

In [6]:
df.loc[df['state'] == 0, 'state'] = 1 #let label 0 be equal to label 1
df['state'].value_counts()

In [8]:
df['state'] = df['state'].values - 1 #transform the label(1,2) to label(0,1), that will be suitable for classifying.

In [9]:
df #check the final dataframe

Unnamed: 0,mean_hori,mean_ver,median_hori,median_ver,state
0,0.006363,0.001671,0.003,0.014,0
1,-0.009000,0.006691,-0.013,0.012,0
2,-0.006220,-0.008295,-0.011,0.005,0
3,-0.005819,-0.001751,0.002,0.009,0
4,-0.002016,0.006631,-0.009,0.002,0
...,...,...,...,...,...
13020,-0.039199,0.003162,0.008,-0.008,1
13021,-0.029962,-0.002062,0.022,-0.023,1
13022,-0.056482,0.050883,-0.036,-0.032,1
13023,0.013096,0.028659,0.059,-0.050,1


In [10]:
X = df[['mean_hori', 'mean_ver', 'median_hori', 'median_ver']]  #define x as features, y as label
Y = df['state']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=2, shuffle = True) 
#split the data into two set, one is for training, other is for validation or testing.

In [12]:
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import learning_curve
from sklearn.svm import SVC  
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, BatchNormalization,Dropout
from tensorflow.keras.models import Model, Sequential

In [13]:
def hyperparameter_svm(parameters):  #use gridsearch to find the optimal hyperparameters of svm model
    svc = svm.SVC()
    model = GridSearchCV(svc,parameters,cv=5,scoring='accuracy')
    model.fit(X_train,y_train)
    test_score = model.score(X_test,y_test)
    train_score = model.score(X_train,y_train)
    print('train_score', train_score, '\n', 'test_score', test_score)
    return model.best_params_

In [15]:
parameters={'kernel':['rbf','sigmoid','linear']} # find optimal hyperparameter 'kernel'
best_params_svm = hyperparameter_svm(parameters)

train_score 0.860172744721689 
 test_score 0.8694817658349329


In [16]:
parameters={'kernel':[best_params_svm['kernel']], 'gamma':np.linspace(1,100,10)} #find optimal 'gamma'
best_params_svm = hyperparameter_svm(parameters)

train_score 0.8585412667946257 
 test_score 0.8690978886756238


In [17]:
parameters={'kernel':[best_params_svm['kernel']], 'gamma':[best_params_svm['gamma']], 'C':np.linspace(0.1,20,10)}
#find optimal 'C'
best_params_svm = hyperparameter_svm(parameters)

train_score 0.8589251439539347 
 test_score 0.8690978886756238


In [18]:
def svm_scores(best_params):  #print the result(accuracy, recall, precision,f1score) of the optimized model
    C = best_params['C']
    gamma = best_params['gamma']
    kernel = best_params['kernel']
    clf = svm.SVC(C = C, gamma = gamma, kernel = kernel)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = metrics.accuracy_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    F1 = metrics.f1_score(y_test, y_pred)
    print("accuracy:",  acc, '\n', "precision:", precision, '\n', "recall:", recall, '\n', "F1 :",  F1)

In [19]:
svm_scores(best_params_svm)

accuracy: 0.8690978886756238 
 precision: 0.8292682926829268 
 recall: 0.09239130434782608 
 F1 : 0.16625916870415647


In [20]:
def hyperparameter_rf(parameters):  #use gridsearch to find the optimal hyperparameters of rf model
    rfc = RandomForestClassifier()
    model1 = GridSearchCV(rfc,parameters,cv = 5, scoring='accuracy')
    model1.fit(X_train,y_train)
    test_score = model1.score(X_test,y_test)
    train_score = model1.score(X_train,y_train)
    print('train_score', train_score, '\n', 'test_score', test_score)
    return model1.best_params_

In [21]:
parameters  = {'n_estimators': range(1,101,10)}  #find optimal 'n_estimators', but there exists overfitting
best_params_rf = hyperparameter_rf(parameters)

train_score 0.9999040307101728 
 test_score 0.8679462571976967


In [22]:
parameters  = {'n_estimators': [best_params_rf['n_estimators']], 'min_samples_leaf': range(1,51,10)}
#because of overfitting, optimize 'min_samples_leaf' to reduce overfitting.
best_params_rf = hyperparameter_rf(parameters)

train_score 0.8603646833013435 
 test_score 0.8694817658349329


In [23]:
def rfc_scores(best_params):  #print the result(accuracy, recall, precision,f1score) of the optimized model
    n_estimators = best_params['n_estimators']
    min_samples_leaf = best_params['min_samples_leaf']
    rfc = RandomForestClassifier(n_estimators = n_estimators,min_samples_leaf = min_samples_leaf )
    rfc.fit(X_train, y_train)
    y_pred = rfc.predict(X_test)
    acc = metrics.accuracy_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    F1 = metrics.f1_score(y_test, y_pred)
    print("accuracy:",  acc, '\n', "precision:", precision, '\n', "recall:", recall, '\n', "F1 :",  F1)

In [24]:
rfc_scores(best_params_rf)

accuracy: 0.8706333973128599 
 precision: 0.8297872340425532 
 recall: 0.10597826086956522 
 F1 : 0.18795180722891566


In [25]:
def hyperparameter_gbc(parameters):   #use gridsearch to find the optimal hyperparameters of gbc model
    gbc = GradientBoostingClassifier()
    model = GridSearchCV(gbc,parameters,cv=5,scoring='accuracy')
    model.fit(X_train,y_train)
    test_score = model.score(X_test,y_test)
    train_score = model.score(X_train,y_train)
    print('train_score', train_score, '\n', 'test_score', test_score)
    return model.best_params_

In [26]:
parameters = {'learning_rate':[0.01, 0.1], 'n_estimators': range(1,101,10)} #find optimal 'learning_rate' and 'n_estimators'
best_params_gbc = hyperparameter_gbc(parameters)

train_score 0.8629558541266794 
 test_score 0.8714011516314779


In [27]:
parameters = {'learning_rate':[best_params_gbc['learning_rate']],  #find optimal 'min_samples_leaf'
              'n_estimators': [best_params_gbc['n_estimators']],
              'min_samples_leaf': range(1,51,10)}
best_params_gbc = hyperparameter_gbc(parameters)

train_score 0.863147792706334 
 test_score 0.8694817658349329


In [34]:
def gbc_scores(best_params):  #print the result(accuracy, recall, precision,f1score) of the optimized model
    n_estimators = best_params['n_estimators']
    learning_rate = best_params['learning_rate']
    min_samples_leaf = best_params['min_samples_leaf']
    gbc = GradientBoostingClassifier(n_estimators = n_estimators, learning_rate = learning_rate, 
                                     min_samples_leaf = min_samples_leaf)
    gbc.fit(X_train, y_train)
    y_pred = gbc.predict(X_test)
    acc = metrics.accuracy_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    F1 = metrics.f1_score(y_test, y_pred)
    print("accuracy:",  acc, '\n', "precision:", precision, '\n', "recall:", recall, '\n', "F1 :",  F1)

In [35]:
gbc_scores(best_params_gbc)

accuracy: 0.8694817658349329 
 precision: 0.75 
 recall: 0.11413043478260869 
 F1 : 0.1981132075471698


In [30]:
def nn_model(X_train, y_train, X_test, y_test):  #define the structure of nn model using keras model
    input_shape = X_train.shape[1]
    input = Input(shape = (input_shape, ))
    x = Dense(1024, activation = 'relu')(input)
    x = Dense(512, activation = 'relu')(x)
    x = Dense(256, activation = 'relu')(x)
    x = Dense(128, activation = 'relu')(x)
    x = Dense(64, activation = 'relu')(x)
    x = Dense(32, activation = 'relu')(x)
    x = Dense(16, activation = 'relu')(x)
    x = Dense(8, activation = 'tanh')(x)
    output = Dense(2, activation = 'softmax')(x)
    
    nn = Model(input,output)
    nn.compile(optimizer = tf.keras.optimizers.Adam(lr = 5e-4), 
               loss= 'sparse_categorical_crossentropy', metrics = ['acc'])
    nn.fit(X_train, y_train, epochs = 30, batch_size = 24, shuffle =True,validation_data = (X_test, y_test))
    return nn

In [31]:
nn = nn_model(X_train, y_train, X_test, y_test)

Metal device set to: Apple M1 Pro


2023-01-30 22:10:42.651480: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-01-30 22:10:42.651603: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
  super().__init__(name, **kwargs)


Epoch 1/30


2023-01-30 22:10:42.909456: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-01-30 22:10:43.189240: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-01-30 22:10:49.266040: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [32]:
def nn_scores(nn):  #print the result(accuracy, recall, precision,f1score) of the optimized model
    y_pred = nn.predict(X_test)
    y_pred = np.argmax(y_pred, axis = 1)
    acc = metrics.accuracy_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    F1 = metrics.f1_score(y_test, y_pred)
    print("accuracy:",  acc, '\n', "precision:", precision, '\n', "recall:", recall, '\n', "F1 :",  F1)

In [33]:
nn_scores(nn)



2023-01-30 22:12:50.014173: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


accuracy: 0.8698656429942418 
 precision: 0.6933333333333334 
 recall: 0.14130434782608695 
 F1 : 0.23476297968397292


Nachdem wir die Hyperparameter abgestimmt hatten, waren alle Modelle hat keine overfitting und gaben accuracy, recall, precision und f1_score aus, aber ein erhebliches Problem ist, dass die Testgenauigkeit bei etwa 87 % liegt,  schwer zu erhöhen ist und f1_score auf einem sehr niedrigen Niveau war, was wird durch zu geringen Recall verursacht, da die Daten extrem unausgeglichen sind (label_0:11000, label_1:2000).