In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import normalize
import random
import math
from sklearn.neural_network import MLPClassifier

In [2]:
# df_patients = pd.read_csv('Data Alpha Bravo Charlie Delta Echo - raw cleaned.csv', sep=';')  # cleaned data
df_patients = pd.read_csv('Data Alpha Bravo Charlie Delta Echo - supercleaned.csv', sep=';')  # super cleaned data
df_patients['Bias'] = 0

In [3]:
def SplitData(df, xcol, ycol, percentage):
    allIndex = np.unique(df['pat'].tolist()) # Get all unique patients id's
    random.seed(2) # Set random seeed so the answer is the same
    
    PercentageIndex = []
    for i in np.unique(np.floor(allIndex / 1000)): # Cycle through each categorie (only cycles through the ones that are present)
        CatPatients = allIndex[allIndex<((i+1)*1000)] # Filters out patients who are bigger then our max
        CatPatients = CatPatients[CatPatients>=((i)*1000)] # Filters out patients that are smaller
        
        AmountItems = len(CatPatients)
        AmountRandom = math.floor(AmountItems*percentage)
        PercentageIndex.extend(random.sample(list(CatPatients), AmountRandom))        
        
    AmountItems = len(allIndex)
    AmountRandom = math.floor(AmountItems*percentage)
    
    PercentageIndex = random.sample(list(allIndex), AmountRandom)
    
    # Normalize data
    xcoldf = df[xcol]
#     xcoldf = normalize(xcoldf)
    
    Percentagedf = xcoldf[df['pat'].isin(PercentageIndex)]
    Percentagey = df[ycol][df['pat'].isin(PercentageIndex)]
    
    Testdf = xcoldf[~df['pat'].isin(PercentageIndex)]
    Testy = df[ycol][~df['pat'].isin(PercentageIndex)]
    
    testpatlist = df['pat'][~df['pat'].isin(PercentageIndex)]
    return (Percentagedf, Testdf, Percentagey, Testy, testpatlist)

In [4]:
# Get all columns except three as X
columns = list(df_patients.columns)
# columns.remove('Unnamed: 0')
columns.remove('cat')
columns.remove('pat')
param = columns

Xcolumns = []
Xcolumns.extend(param)

X = df_patients[Xcolumns]
y = df_patients[['cat']]

# Split training set
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

X_train, X_test, y_train, y_test, tmptestpatlist = SplitData(df_patients, Xcolumns, 'cat', 0.8)

In [5]:
# Learn model
lr = LogisticRegression(random_state=21)
# lr = MLPClassifier(random_state=21,tol=0.00001,learning_rate='adaptive',verbose=10)
lr.fit(X_train, y_train)

#Predict test set
y_pred = lr.predict(X_test)



In [6]:
from sklearn.metrics import classification_report, accuracy_score
print(classification_report(y_test, y_pred, digits=3))
print(accuracy_score(y_test, y_pred, normalize=False, sample_weight=None))

              precision    recall  f1-score   support

           1      1.000     0.891     0.942       128
           2      0.940     1.000     0.969      2016
           3      0.745     1.000     0.854      1064
           4      1.000     0.793     0.884      2304

   micro avg      0.911     0.911     0.911      5512
   macro avg      0.921     0.921     0.912      5512
weighted avg      0.929     0.911     0.911      5512

5019


In [7]:
tmpdf = pd.DataFrame()
tmpdf['pat'] = tmptestpatlist
tmpdf['pred'] = y_pred
tmpdf.groupby('pat')['pred'].mean()

pat
1014    1.000000
1016    1.437500
1020    1.000000
1029    1.000000
2003    2.000000
2012    2.000000
2017    2.000000
2020    2.125000
2027    2.000000
2028    2.000000
2037    2.000000
2038    2.000000
3011    3.000000
3013    3.000000
3014    3.000000
3016    3.000000
3029    3.000000
3032    3.000000
3033    3.000000
4012    3.524414
4024    3.917187
Name: pred, dtype: float64

In [8]:
for num, i in enumerate(lr.coef_):
    print(num)
    ParamCheck = pd.DataFrame()
    ParamCheck['Params'] = Xcolumns
    ParamCheck['Theta'] = list(i)
    ParamCheck['Absolute'] = np.absolute(list(ParamCheck['Theta']))
    ParamCheck = ParamCheck.sort_values(by=['Absolute'], ascending=False)
    print(ParamCheck.head())

0
                  Params     Theta  Absolute
0             Unnamed: 0 -0.013271  0.013271
263     Charlie_EnergyYL -0.004250  0.004250
389       Delta_EnergyZR -0.003396  0.003396
195  Bravo_humerus_r_y_2  0.003309  0.003309
7         Alpha_EnergyYL -0.003139  0.003139
1
                  Params     Theta  Absolute
404  Delta_humerus_r_z_0  0.051867  0.051867
416  Delta_humerus_l_z_0  0.048303  0.048303
452  Delta_humerus_r_z_2 -0.045593  0.045593
464  Delta_humerus_l_z_2 -0.044193  0.044193
488  Delta_humerus_l_z_3 -0.031663  0.031663
2
                  Params     Theta  Absolute
452  Delta_humerus_r_z_2  0.042820  0.042820
416  Delta_humerus_l_z_0 -0.042365  0.042365
464  Delta_humerus_l_z_2  0.031846  0.031846
404  Delta_humerus_r_z_0 -0.030082  0.030082
438  Delta_humerus_l_x_1  0.029019  0.029019
3
                    Params     Theta  Absolute
67     Alpha_humerus_r_y_2 -0.007754  0.007754
195    Bravo_humerus_r_y_2 -0.007538  0.007538
274  Charlie_humerus_r_x_0  0.006622  0.0

# Test with testset

In [11]:
df_testset = pd.read_csv('Data Alpha Bravo Charlie Delta Echo - Super cleaned testset.csv', sep=';')  # super cleaned testset

testset_X = df_patients[Xcolumns]
testset_y = df_patients[['cat']]

y_pred_testset = lr.predict(testset_X)

print(classification_report(testset_y, y_pred_testset, digits=3))

              precision    recall  f1-score   support

           1      1.000     0.983     0.992       832
           2      0.993     1.000     0.996     17676
           3      0.965     1.000     0.982      9956
           4      1.000     0.977     0.989     21046

   micro avg      0.990     0.990     0.990     49510
   macro avg      0.989     0.990     0.990     49510
weighted avg      0.990     0.990     0.990     49510

