In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the training data
data = pd.read_csv('measures.csv', delimiter=';', encoding='utf-8')

# Preprocess the training data
for col in data.select_dtypes(include=['object']).columns:
    if col != 'activity':
        data[col] = data[col].str.replace(',', '.').astype(float)

print(data.shape)
# Split data into training and testing sets based on the subjects
train_subjects = [1.0, 3.0, 5.0, 6.0]
test_subjects = [27.0, 28.0, 29.0, 30.0, 31.0]

train_data = data[data['subject'].isin(train_subjects)]
test_data = data[data['subject'].isin(test_subjects)]

print(train_data.shape, test_data.shape)

# Extract features and labels
X_train = train_data.drop(columns=['subject', 'activity'])
y_train = train_data['activity']

X_test = test_data.drop(columns=['subject', 'activity'])
y_test = test_data['activity']

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=32)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

## Testing

# Add the predictions to the test data
test_data['predicted_activity'] = y_pred
print(f"Number of predicted columns: {test_data.shape[0]}")

# Calculate the number of correct predictions

correct_predictions = (test_data['activity'] == test_data['predicted_activity']).sum()
print(f"Number of correct predictions: {correct_predictions}")

(7352, 563)
(1315, 563) (1485, 563)
Accuracy: 92.79%
Number of predicted columns: 1485
Number of correct predictions: 1378


  test_data['predicted_activity'] = y_pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_activity'] = y_pred


In [38]:
# Load the new data
new_data = pd.read_csv('to_predict.csv', delimiter=';', encoding='utf-8')

# Preprocess the new data
for col in new_data.select_dtypes(include=['object']).columns:
    if col != 'activity':
        new_data[col] = new_data[col].str.replace(',', '.').astype(float)

# Extract features
X_new = new_data.drop(columns=['subject'])

# Standardize the features
X_new = scaler.transform(X_new)

# Make predictions
y_new_pred = model.predict(X_new)

# Add the predictions to the new data
new_data['predicted_activity'] = y_new_pred


# Count the number of each activity
activity_counts = data['activity'].value_counts()
print(activity_counts)

# Save the new data with the predicted activity
new_data.to_csv('predicted.csv', index=False)

  new_data['predicted_activity'] = y_new_pred


activity
LAYING                1407
STANDING              1374
SITTING               1286
WALKING               1226
WALKING_UPSTAIRS      1073
WALKING_DOWNSTAIRS     986
Name: count, dtype: int64
