In [27]:
import pandas as pd
import numpy as np
from scipy.stats import mode

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

import warnings
warnings.filterwarnings("ignore")

In [28]:
# Load the raw dataset from the CSV file.
dataset = pd.read_csv('./CasusData.csv')  # path is relative to the notebook's working directory

In [29]:
# Handle missing values: every NaN is replaced by the mean of its column.
# NOTE(review): the imputer is fitted on the whole frame, so identifier-like
# columns (e.g. FLOOR, BUILDINGID, USERID) are mean-imputed too — confirm
# that this is intended.
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(dataset)
dataset_imputed = pd.DataFrame(imputed_values, columns=dataset.columns)

# Print the first rows as a quick sanity check.
print("Dataset na filtering en imputatie:")
print(dataset_imputed.head())

Dataset na filtering en imputatie:
   WAP001  WAP002  WAP003  WAP004  WAP005  WAP006  WAP007  WAP008  WAP009  \
0   100.0   100.0   100.0   100.0   100.0   100.0   100.0   100.0   100.0   
1   100.0   100.0   100.0   100.0   100.0   100.0   100.0   100.0   100.0   
2   100.0   100.0   100.0   100.0   100.0   100.0   100.0   -97.0   100.0   
3   100.0   100.0   100.0   100.0   100.0   100.0   100.0   100.0   100.0   
4   100.0   100.0   100.0   100.0   100.0   100.0   100.0   100.0   100.0   

   WAP010  ...  WAP520  LONGITUDE      LATITUDE  FLOOR  BUILDINGID  SPACEID  \
0   100.0  ...   100.0 -7541.2643  4.864921e+06    2.0         1.0    106.0   
1   100.0  ...   100.0 -7536.6212  4.864934e+06    2.0         1.0    106.0   
2   100.0  ...   100.0 -7519.1524  4.864950e+06    2.0         1.0    103.0   
3   100.0  ...   100.0 -7524.5704  4.864934e+06    2.0         1.0    102.0   
4   100.0  ...   100.0 -7632.1436  4.864982e+06    0.0         0.0    122.0   

   RELATIVEPOSITION  USERID

In [30]:
# Feature scaling: standardise the first 520 columns with StandardScaler.
# In the printed frame these are the WAP001 ... WAP520 columns.
scaler = StandardScaler()
wap_signals = dataset_imputed.iloc[:, :520]
features_scaled = scaler.fit_transform(wap_signals)

In [31]:
# K-means clustering to group the samples into three location clusters.
# `features_scaled` is reused from the scaling cell above rather than
# refitting the scaler on the same 520 columns a second time.
# n_init is pinned explicitly because scikit-learn's default for it differs
# between releases; a fixed value keeps the labels reproducible.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
dataset_imputed['LOCATION_CLUSTER'] = kmeans.fit_predict(features_scaled)

# Print the first rows to confirm the new LOCATION_CLUSTER column.
print("Dataset na clustering:")
print(dataset_imputed.head())

Dataset na clustering:
   WAP001  WAP002  WAP003  WAP004  WAP005  WAP006  WAP007  WAP008  WAP009  \
0   100.0   100.0   100.0   100.0   100.0   100.0   100.0   100.0   100.0   
1   100.0   100.0   100.0   100.0   100.0   100.0   100.0   100.0   100.0   
2   100.0   100.0   100.0   100.0   100.0   100.0   100.0   -97.0   100.0   
3   100.0   100.0   100.0   100.0   100.0   100.0   100.0   100.0   100.0   
4   100.0   100.0   100.0   100.0   100.0   100.0   100.0   100.0   100.0   

   WAP010  ...  LONGITUDE      LATITUDE  FLOOR  BUILDINGID  SPACEID  \
0   100.0  ... -7541.2643  4.864921e+06    2.0         1.0    106.0   
1   100.0  ... -7536.6212  4.864934e+06    2.0         1.0    106.0   
2   100.0  ... -7519.1524  4.864950e+06    2.0         1.0    103.0   
3   100.0  ... -7524.5704  4.864934e+06    2.0         1.0    102.0   
4   100.0  ... -7632.1436  4.864982e+06    0.0         0.0    122.0   

   RELATIVEPOSITION  USERID  PHONEID     TIMESTAMP  LOCATION_CLUSTER  
0               

In [32]:
# New column WALKING_PATTERN: 1 when LATITUDE or LONGITUDE differs from the
# same user's previous sample, 0 otherwise.
#
# The differences are taken per USERID in TIMESTAMP order, so a switch from
# one user's rows to another's is not counted as movement. The stable sort
# keeps the existing row order for equal timestamps.
# A user's first sample has no predecessor; diff() yields NaN there, and since
# `NaN != 0` is True it would be flagged as movement, so NaN is mapped to 0.
position_delta = (
    dataset_imputed
    .sort_values('TIMESTAMP', kind='stable')
    .groupby('USERID')[['LATITUDE', 'LONGITUDE']]
    .diff()
    .fillna(0)
)
# Assignment aligns on the index, so the sorted order does not matter here.
dataset_imputed['WALKING_PATTERN'] = position_delta.ne(0).any(axis=1).astype(int)

In [33]:
# Inspect the column index to check which columns the frame now holds.
print(dataset_imputed.columns)

Index(['WAP001', 'WAP002', 'WAP003', 'WAP004', 'WAP005', 'WAP006', 'WAP007',
       'WAP008', 'WAP009', 'WAP010',
       ...
       'LATITUDE', 'FLOOR', 'BUILDINGID', 'SPACEID', 'RELATIVEPOSITION',
       'USERID', 'PHONEID', 'TIMESTAMP', 'LOCATION_CLUSTER',
       'WALKING_PATTERN'],
      dtype='object', length=531)


In [34]:
# Feature engineering: walking direction derived from the change in latitude.
# WALKING_DIRECTION is 1 when LATITUDE increased relative to the same user's
# previous sample and 0 otherwise (including each user's first sample, whose
# difference is NaN). The difference is taken per USERID in TIMESTAMP order so
# that moving from one user's rows to another's is not read as a step.
latitude_step = (
    dataset_imputed
    .sort_values('TIMESTAMP', kind='stable')
    .groupby('USERID')['LATITUDE']
    .diff()
)
# Vectorised comparison instead of a row-wise apply; NaN > 0 is False -> 0.
dataset_imputed['WALKING_DIRECTION'] = latitude_step.gt(0).astype(int)

# Aggregate to one row per user.
# NOTE(review): LOCATION_CLUSTER is a K-means label, so 'max' returns the
# highest label a user ever received rather than a typical cluster — confirm
# this is the intended summary.
grouped_data = dataset_imputed.groupby('USERID').agg({
    'WALKING_DIRECTION': 'max',  # 1 if the user ever stepped towards a higher latitude
    'LOCATION_CLUSTER': 'max',   # largest cluster label seen for the user
}).reset_index()

print("Groeperen op USERID:")
print(grouped_data.head())

Groeperen op USERID:
   USERID  WALKING_DIRECTION  LOCATION_CLUSTER
0     1.0                  1                 2
1     2.0                  1                 2
2     3.0                  1                 0
3     4.0                  1                 2
4     5.0                  1                 2


In [35]:
# Custom aggregation function: most frequent value of one group's values.
def mode_groupby(series_groupby):
    """Return the mode (most frequent value) of a one-dimensional collection.

    Intended as an aggregation callable for ``groupby(...).agg``, which passes
    the values of a single group.

    Parameters
    ----------
    series_groupby : array-like
        One-dimensional values (e.g. a pandas Series, list or NumPy array).

    Returns
    -------
    scalar
        The most frequent value; when several values tie, the first one
        reported by ``scipy.stats.mode``. ``numpy.nan`` for empty input.
    """
    values = np.asarray(series_groupby)
    if values.size == 0:
        # Nothing to count; avoid indexing into an empty result.
        return np.nan
    # Depending on the SciPy version, mode() returns either a length-1 array
    # or a bare scalar for 1-D input. atleast_1d normalises both shapes so the
    # indexing below works on every version.
    modes = np.atleast_1d(mode(values)[0])
    return modes.ravel()[0]

In [36]:
# Inspect the column index again to check which columns the frame now holds.
print(dataset_imputed.columns)

Index(['WAP001', 'WAP002', 'WAP003', 'WAP004', 'WAP005', 'WAP006', 'WAP007',
       'WAP008', 'WAP009', 'WAP010',
       ...
       'FLOOR', 'BUILDINGID', 'SPACEID', 'RELATIVEPOSITION', 'USERID',
       'PHONEID', 'TIMESTAMP', 'LOCATION_CLUSTER', 'WALKING_PATTERN',
       'WALKING_DIRECTION'],
      dtype='object', length=532)


In [37]:
# Groeperen op individuele gebruikers (opnieuw)
grouped_data = dataset_imputed.groupby('USERID').agg({
    'WALKING_PATTERN': mode_groupby,  # Gebruik aangepaste functie voor modus
    'LOCATION_CLUSTER': 'max'
}).reset_index()

In [38]:
# Verwijder eventuele NaN-waarden die zijn ontstaan na groeperen
grouped_data = grouped_data.dropna()

In [39]:
# Features en doelvariabele
features = grouped_data[['LOCATION_CLUSTER']]
target = grouped_data['WALKING_PATTERN']

In [40]:
# Split de data opnieuw in trainings- en testsets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [41]:
# List the distinct WALKING_PATTERN values present in the row-level data.
print(dataset_imputed['WALKING_PATTERN'].unique())


[1 0]


In [42]:
# Doelvariabele
target = dataset_imputed['WALKING_PATTERN']

# Features voor voorspelling (bijvoorbeeld locatiecluster)
features = dataset_imputed[['LOCATION_CLUSTER']]

# Split de data in trainings- en testsets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [43]:
# Train een Random Forest Classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [44]:
# Voorspel de WALKING_PATTERN op de testset
predictions = classifier.predict(X_test)

In [45]:
# Group by USERID
grouped_data = dataset_imputed.groupby('USERID').agg({
    'WALKING_PATTERN': 'max',
    'LOCATION_CLUSTER': 'max',
}).reset_index()

In [46]:
# Define features and target
target = grouped_data['WALKING_PATTERN']
features_grouped = grouped_data[['LOCATION_CLUSTER']]

In [47]:
# Show which classes are present in the current training target.
unique_classes = np.unique(y_train)
print(unique_classes)


[0 1]


In [52]:
# Instantiate the logistic regression model with a different solver
classifier = LogisticRegression(solver='liblinear')  # You can try 'lbfgs', 'newton-cg', etc.

# Fit the model and make predictions
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1

In [50]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_grouped, target, test_size=0.2, random_state=42)

# Train RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)
rf_predictions = rf_classifier.predict(X_test)

# Train DecisionTreeClassifier
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)
dt_predictions = dt_classifier.predict(X_test)

# Train LogisticRegression
lr_classifier = LogisticRegression(random_state=42)
lr_classifier.fit(X_train, y_train)
lr_predictions = lr_classifier.predict(X_test)

# Train Neural Network
nn_classifier = Sequential([
    Dense(64, activation='relu', input_dim=1),
    Dense(1, activation='sigmoid')
])
nn_classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
nn_classifier.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)
nn_predictions = (nn_classifier.predict(X_test) > 0.5).astype(int)

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1