In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import math, random
import numpy as np
from sklearn.linear_model import LogisticRegression

In [2]:
def SplitData(df, xcol, ycol, percentage):
    allIndex = np.unique(df['pat'].tolist()) # Get all unique patients id's
    random.seed(2) # Set random seeed so the answer is the same
    
    PercentageIndex = []
    for i in np.unique(np.floor(allIndex / 1000)): # Cycle through each categorie (only cycles through the ones that are present)
        CatPatients = allIndex[allIndex<((i+1)*1000)] # Filters out patients who are bigger then our max
        CatPatients = CatPatients[CatPatients>=((i)*1000)] # Filters out patients that are smaller
        
        AmountItems = len(CatPatients)
        AmountRandom = math.floor(AmountItems*percentage)
        PercentageIndex.extend(random.sample(list(CatPatients), AmountRandom))        
        
    AmountItems = len(allIndex)
    AmountRandom = math.floor(AmountItems*percentage)
    
    PercentageIndex = random.sample(list(allIndex), AmountRandom)
    
    Percentagedf = df[xcol][df['pat'].isin(PercentageIndex)]
    Percentagey = df[ycol][df['pat'].isin(PercentageIndex)]
    
    Testdf = df[xcol][~df['pat'].isin(PercentageIndex)]
    Testy = df[ycol][~df['pat'].isin(PercentageIndex)]
    
    return (Percentagedf, Testdf, Percentagey, Testy)



In [3]:
df_cleaned = pd.read_csv('../Supercleaned_alpha.csv',  sep= ';')

df_cleaned['clavicula_x_dif'] = np.absolute(df_cleaned['clavicula_l_x'] - df_cleaned['clavicula_r_x'])
df_cleaned['clavicula_y_dif'] = np.absolute(df_cleaned['clavicula_l_y'] - df_cleaned['clavicula_r_y'])
df_cleaned['clavicula_z_dif'] = np.absolute(df_cleaned['clavicula_l_z'] - df_cleaned['clavicula_r_z'])

df_cleaned['scapula_x_dif'] = np.absolute(df_cleaned['scapula_l_x'] - df_cleaned['scapula_r_x'])
df_cleaned['scapula_y_dif'] = np.absolute(df_cleaned['scapula_l_y'] - df_cleaned['scapula_r_y'])
df_cleaned['scapula_z_dif'] = np.absolute(df_cleaned['scapula_l_z'] - df_cleaned['scapula_r_z'])

df_cleaned['humerus_x_dif'] = np.absolute(df_cleaned['humerus_l_x'] - df_cleaned['humerus_r_x'])
df_cleaned['humerus_y_dif'] = np.absolute(df_cleaned['humerus_l_y'] - df_cleaned['humerus_r_y'])
df_cleaned['humerus_z_dif'] = np.absolute(df_cleaned['humerus_l_z'] - df_cleaned['humerus_r_z'])


param = [ \
          'humerus_l_x', 'humerus_l_y', 'humerus_l_z', 'humerus_r_x', 'humerus_r_y', 'humerus_r_z', \
          'clavicula_l_x', 'clavicula_l_y', 'clavicula_l_z', 'clavicula_r_x', 'clavicula_r_y', 'clavicula_r_z', \
          'scapula_l_x', 'scapula_l_y', 'scapula_l_z', 'scapula_r_x', 'scapula_r_y', 'scapula_r_z', \
          'clavicula_x_dif','clavicula_y_dif','clavicula_z_dif', \
          'scapula_x_dif','scapula_y_dif','scapula_z_dif', \
          'humerus_x_dif', 'humerus_y_dif', 'humerus_z_dif'
         ]
#param = ['clavicula_x_dif','clavicula_y_dif']

df_cleaned['bias'] = 1

# split oorsprong kolom in onderdelen
x,y = df_cleaned['Oorsprong'].str.split(".").str
df_cleaned['cat'],df_cleaned['pat'],df_cleaned['meting'],df_cleaned['oef'],df_cleaned['split'] = x.str.split("_").str
df_cleaned['cat'] = [ int(x[3:]) for x in df_cleaned['cat']] 
df_cleaned['meting'] = [ int(x[6:]) for x in df_cleaned['meting']] 
df_cleaned['oef'] = [ int(x[3:]) for x in df_cleaned['oef']] 
df_cleaned['pat'] = [ int(x[3:]) for x in df_cleaned['pat']] 
df_cleaned['split'] = [ int(x[5:]) for x in df_cleaned['split']] 
# geef elke patient een uniek nummer
df_cleaned['pat'] = df_cleaned['cat']*1000+df_cleaned['pat']

df_cleaned['c4'] = ['Cat4' in vincent for vincent in df_cleaned['Oorsprong']]
df_cleaned['c3'] = ['Cat3' in vincent for vincent in df_cleaned['Oorsprong']]
df_cleaned['c2'] = ['Cat2' in vincent for vincent in df_cleaned['Oorsprong']]
df_cleaned['c1'] = ['Cat1' in vincent for vincent in df_cleaned['Oorsprong']]

In [4]:
df_cleaned

Unnamed: 0.1,Unnamed: 0,thorax_r_x,thorax_r_y,thorax_r_z,clavicula_r_x,clavicula_r_y,clavicula_r_z,scapula_r_x,scapula_r_y,scapula_r_z,...,bias,cat,pat,meting,oef,split,c4,c3,c2,c1
0,0,-1.670244,5.160307,3.399796,-24.761791,4.793458,0.136570,22.033994,11.738020,-8.107169,...,1,4,4023,1,2,1,True,False,False,False
1,1,-1.625348,5.298733,3.495786,-25.025731,4.589693,0.751133,22.139938,10.988492,-7.151301,...,1,4,4023,1,2,1,True,False,False,False
2,2,-1.626983,5.009631,3.303653,-27.166588,5.247723,-1.242757,20.752557,10.558282,-7.879249,...,1,4,4023,1,2,1,True,False,False,False
3,3,-0.798010,2.231620,2.114770,-30.140047,8.996802,2.477904,19.401640,14.281254,-8.895588,...,1,4,4023,1,2,1,True,False,False,False
4,4,0.867083,-0.768741,0.623773,-31.655351,12.895365,4.353386,19.204324,19.348325,-10.242779,...,1,4,4023,1,2,1,True,False,False,False
5,5,2.920107,-4.430999,-1.689196,-32.607549,16.471632,9.751704,20.614240,24.774745,-10.936538,...,1,4,4023,1,2,1,True,False,False,False
6,6,4.959135,-7.345564,-2.984804,-32.852461,18.265495,13.480280,22.304939,30.070429,-9.749052,...,1,4,4023,1,2,1,True,False,False,False
7,7,7.403613,-8.677631,-3.412571,-32.342528,20.480134,17.080758,23.426613,33.715433,-9.103281,...,1,4,4023,1,2,1,True,False,False,False
8,8,9.148620,-9.237103,-4.517333,-32.260772,22.703448,19.759501,24.086373,36.149158,-9.028078,...,1,4,4023,1,2,1,True,False,False,False
9,9,9.947461,-9.433538,-5.078960,-33.000658,23.330957,23.146260,24.258375,38.084418,-8.019844,...,1,4,4023,1,2,1,True,False,False,False


In [10]:
from sklearn.model_selection import train_test_split

Xcolumns = ['bias']
Xcolumns.extend(['thorax_r_x', 'thorax_r_y', 'thorax_r_z', 'clavicula_r_x', 'clavicula_r_y',
 'clavicula_r_z', 'scapula_r_x', 'scapula_r_y', 'scapula_r_z', 'humerus_r_x',
 'humerus_r_y', 'humerus_r_z', 'ellebooghoek_r', 'thorax_l_x',
 'thorax_l_y', 'thorax_l_z', 'clavicula_l_x', 'clavicula_l_y', 'clavicula_l_z',
 'scapula_l_x', 'scapula_l_y', 'scapula_l_z', 'humerus_l_x', 'humerus_l_y',
 'humerus_l_z', 'ellebooghoek_l'])


# Split training set
X_train, X_test, y_train, y_test = SplitData(df_cleaned, Xcolumns, 'c4', 0.8)

# Learn model
lr = LogisticRegression()
lr.fit(X_train, y_train)

#Predict test set
y_pred = lr.predict(X_test)




In [8]:
from sklearn.metrics import mean_squared_error, \
recall_score, precision_score, accuracy_score, confusion_matrix

y_true = y_test
# y_pred = Data['c1']['Model'].predict(Data['c1']['X_test'])

TN, FP, FN, TP = confusion_matrix(y_true, y_pred).ravel()

# tabel printen

print('Informationi matrix')
tab = [["pred pos", TP, FP], ["pred neg", FN, TN]]
print(pd.DataFrame(tab, columns=["", "pos", "neg"]))
print()
print("recall: ", recall_score(y_true, y_pred))
print("precision: ", precision_score(y_true, y_pred))
print("accuracy: ", accuracy_score(y_true, y_pred))
print('\n\n')

Informationi matrix
              pos   neg
0  pred pos   629   391
1  pred neg  4197  2262

recall:  0.13033568172399504
precision:  0.6166666666666667
accuracy:  0.3865490038775237



