In [108]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [110]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.stats import skew, kurtosis
from scipy.signal import find_peaks, welch
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from scipy.fftpack import fft

In [2]:
data_path = "../data"

In [70]:
imu_df=pd.read_csv(f'{data_path}/imu.csv')

In [71]:
imu_df.head()

Unnamed: 0,participant,body_part,condition,task,file_path,PacketCounter,SampleTimeFine,Quat_W,Quat_X,Quat_Y,...,Acc_X,Acc_Y,Acc_Z,Gyr_X,Gyr_Y,Gyr_Z,Mag_X,Mag_Y,Mag_Z,Unnamed: 15
0,P_02,arm_l,natural,Cupplacing,/Users/daikexin/pythonProject/CompensatoryMove...,0,1970-01-01 21:07:25,-0.73621,0.183717,0.638716,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.262207,0.231201,-0.031738,
1,P_02,arm_l,natural,Cupplacing,/Users/daikexin/pythonProject/CompensatoryMove...,1,1970-01-02 01:45:12,-0.736222,0.18372,0.638712,...,9.708877,-1.067807,1.129502,-1.429176,0.690589,0.529434,-0.262695,0.232666,-0.03125,
2,P_02,arm_l,natural,Cupplacing,/Users/daikexin/pythonProject/CompensatoryMove...,2,1970-01-02 06:22:59,-0.73617,0.183847,0.638716,...,9.700046,-1.093796,1.111337,-1.696598,0.339913,0.39384,-0.263672,0.232178,-0.03125,
3,P_02,arm_l,natural,Cupplacing,/Users/daikexin/pythonProject/CompensatoryMove...,3,1970-01-02 11:00:46,-0.736107,0.18397,0.638727,...,9.719351,-1.097965,1.145534,-1.721801,0.29121,0.197749,-0.262207,0.233643,-0.032959,
4,P_02,arm_l,natural,Cupplacing,/Users/daikexin/pythonProject/CompensatoryMove...,4,1970-01-02 15:38:33,-0.736076,0.183974,0.638764,...,9.745931,-1.099311,1.12124,-1.2776,0.308383,0.286817,-0.260254,0.232666,-0.030273,


In [72]:
data=imu_df.drop(columns=["file_path", "SampleTimeFine", "Unnamed: 15"])

In [73]:
data.columns

Index(['participant', 'body_part', 'condition', 'task', 'PacketCounter',
       'Quat_W', 'Quat_X', 'Quat_Y', 'Quat_Z', 'Acc_X', 'Acc_Y', 'Acc_Z',
       'Gyr_X', 'Gyr_Y', 'Gyr_Z', 'Mag_X', 'Mag_Y', 'Mag_Z'],
      dtype='object')

Feature extraction

In [74]:
# Group by relevant columns and find the max PacketCounter for each group
max_packet_counters = data.groupby(['participant', 'body_part', 'task', 'condition'])['PacketCounter'].max().reset_index()
max_packet_counters

Unnamed: 0,participant,body_part,task,condition,PacketCounter
0,P_02,arm_l,Cupplacing,comp,3180
1,P_02,arm_l,Cupplacing,comp_WE,3184
2,P_02,arm_l,Cupplacing,natural,3412
3,P_02,arm_l,Pin,comp,3068
4,P_02,arm_l,Pin,comp_WE,2723
...,...,...,...,...,...
295,P_06,wrist_r,Pouring,comp_WE,2047
296,P_06,wrist_r,Pouring,natural,2695
297,P_06,wrist_r,Wiping,comp,1714
298,P_06,wrist_r,Wiping,comp_WE,1244


In [69]:
# Define a function to compute features for a group
def compute_features(group):
    features = {}
    
    # Time-domain features
    for col in ['Quat_W', 'Quat_X', 'Quat_Y', 'Quat_Z', 
                'Acc_X', 'Acc_Y', 'Acc_Z', 
                'Gyr_X', 'Gyr_Y', 'Gyr_Z', 
                'Mag_X', 'Mag_Y', 'Mag_Z']:
        signal = group[col]
        features[f'{col}_mean'] = np.mean(signal)
        features[f'{col}_var'] = np.var(signal)
        features[f'{col}_rms'] = np.sqrt(np.mean(signal**2))
        features[f'{col}_range'] = np.max(signal) - np.min(signal)
        features[f'{col}_skew'] = skew(signal)
        features[f'{col}_kurt'] = kurtosis(signal)
        features[f'{col}_max'] = np.max(signal)

    # Frequency-domain features (Welch's method for power spectral density)
    for col in ['Acc_X', 'Acc_Y', 'Acc_Z', 'Gyr_X', 'Gyr_Y', 'Gyr_Z']:
        freqs, psd = welch(group[col], fs=50)  # Adjust fs to your sampling rate
        features[f'{col}_spectral_energy'] = np.sum(psd)
        features[f'{col}_dominant_freq'] = freqs[np.argmax(psd)]

    # Signal Vector Magnitude (SVM) for acceleration
    svm = np.sqrt(group['Acc_X']**2 + group['Acc_Y']**2 + group['Acc_Z']**2)
    features['Acc_SVM_mean'] = np.mean(svm)
    features['Acc_SVM_var'] = np.var(svm)

    return pd.Series(features)

In [78]:
# Group the data for feature extraction
grouped = data.groupby(['participant', 'body_part', 'task', 'condition'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x3023ba690>

In [79]:
# Compute features
features_df = grouped.apply(compute_features).reset_index()

  features_df = grouped.apply(compute_features).reset_index()


In [81]:
# Save the computed features to a new CSV
features_df.to_csv(f'{data_path}/computed_features.csv', index=False)

Use One-hot to encode the category columns (participant, body_part, task,condition)

In [None]:
features_df=pd.read_csv(f'{data_path}/computed_features.csv')

In [82]:
features_df.head()

Unnamed: 0,participant,body_part,task,condition,Quat_W_mean,Quat_W_var,Quat_W_rms,Quat_W_range,Quat_W_skew,Quat_W_kurt,...,Acc_Z_spectral_energy,Acc_Z_dominant_freq,Gyr_X_spectral_energy,Gyr_X_dominant_freq,Gyr_Y_spectral_energy,Gyr_Y_dominant_freq,Gyr_Z_spectral_energy,Gyr_Z_dominant_freq,Acc_SVM_mean,Acc_SVM_var
0,P_02,arm_l,Cupplacing,comp,-0.767555,0.000292,0.767746,0.085431,-1.085372,1.017358,...,0.785015,0.195312,1280.490882,4.101562,217.777279,0.390625,396.71187,0.390625,9.912421,0.096901
1,P_02,arm_l,Cupplacing,comp_WE,-0.77457,0.0002,0.774699,0.08187,-0.455353,0.589303,...,0.922279,0.195312,1100.379045,4.101562,189.954955,0.390625,537.814808,0.390625,9.912909,0.065498
2,P_02,arm_l,Cupplacing,natural,-0.742027,0.000155,0.742132,0.093685,-2.249484,5.730606,...,0.487908,0.195312,662.211986,1.953125,202.915105,1.953125,85.52484,3.90625,9.895929,0.080696
3,P_02,arm_l,Pin,comp,-0.759761,0.000324,0.759974,0.088499,-1.873579,2.1585,...,0.516621,2.148438,1140.060029,4.296875,133.199605,2.148438,81.489395,0.195312,9.906819,0.091274
4,P_02,arm_l,Pin,comp_WE,-0.734621,0.000446,0.734924,0.089529,-1.572033,1.22208,...,0.798055,2.148438,1889.241905,2.148438,213.897176,2.148438,100.79774,0.390625,9.894033,0.086365


In [83]:
# Initialize LabelEncoder for the target column
target_encoder = LabelEncoder()

# Encode the target column
features_df['condition'] = target_encoder.fit_transform(features_df['condition'])

for the condition column: nature is 2, comp is 0, comp_WE is 1

In [85]:
# Initialize OneHotEncoder for categorical columns
categorical_columns = ['participant', 'body_part', 'task']
encoder = OneHotEncoder(sparse_output=False)  

In [86]:
# Apply one-hot encoding to categorical columns
encoded_data = encoder.fit_transform(features_df[categorical_columns])

In [87]:
# Create a DataFrame for the encoded categorical data
encoded_columns = encoder.get_feature_names_out(categorical_columns)
encoded_df = pd.DataFrame(encoded_data, columns=encoded_columns)

In [90]:
# Drop the original categorical columns and concatenate the encoded data
data_encoded = pd.concat([encoded_df, features_df.drop(columns=categorical_columns)], axis=1)

In [91]:
data_encoded

Unnamed: 0,participant_P_02,participant_P_03,participant_P_04,participant_P_05,participant_P_06,body_part_arm_l,body_part_arm_r,body_part_trunk,body_part_wrist_l,body_part_wrist_r,...,Acc_Z_spectral_energy,Acc_Z_dominant_freq,Gyr_X_spectral_energy,Gyr_X_dominant_freq,Gyr_Y_spectral_energy,Gyr_Y_dominant_freq,Gyr_Z_spectral_energy,Gyr_Z_dominant_freq,Acc_SVM_mean,Acc_SVM_var
0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.785015,0.195312,1280.490882,4.101562,217.777279,0.390625,396.711870,0.390625,9.912421,0.096901
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.922279,0.195312,1100.379045,4.101562,189.954955,0.390625,537.814808,0.390625,9.912909,0.065498
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.487908,0.195312,662.211986,1.953125,202.915105,1.953125,85.524840,3.906250,9.895929,0.080696
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.516621,2.148438,1140.060029,4.296875,133.199605,2.148438,81.489395,0.195312,9.906819,0.091274
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.798055,2.148438,1889.241905,2.148438,213.897176,2.148438,100.797740,0.390625,9.894033,0.086365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,12.222144,0.195312,14474.448752,0.195312,957.582700,0.781250,1070.050430,0.195312,9.773867,0.305658
296,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,3.660535,0.195312,8002.691085,0.195312,1161.843794,0.781250,1335.891493,0.195312,9.774712,0.373336
297,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.513398,0.195312,258.480403,0.195312,208.133562,0.390625,170.040254,0.390625,9.746223,0.185695
298,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.895019,0.390625,625.158303,0.976562,178.686917,0.781250,520.712408,0.195312,9.765918,0.228332


In [92]:
data_encoded['condition'].value_counts()

condition
0    100
1    100
2    100
Name: count, dtype: int64

In [93]:
# export the imu dataframe
data_encoded.to_csv(f'{data_path}/data_encoded.csv', index=False)

Use machine learning model for prediction

In [189]:
data=pd.read_csv(f'{data_path}/data_encoded.csv')

In [190]:
# Drop columns containing "participant", "body_part", or "task"
columns_to_drop = data.filter(regex='participant|body_part|task').columns
data = data.drop(columns=columns_to_drop)

In [191]:
data

Unnamed: 0,condition,Quat_W_mean,Quat_W_var,Quat_W_rms,Quat_W_range,Quat_W_skew,Quat_W_kurt,Quat_W_max,Quat_X_mean,Quat_X_var,...,Acc_Z_spectral_energy,Acc_Z_dominant_freq,Gyr_X_spectral_energy,Gyr_X_dominant_freq,Gyr_Y_spectral_energy,Gyr_Y_dominant_freq,Gyr_Z_spectral_energy,Gyr_Z_dominant_freq,Acc_SVM_mean,Acc_SVM_var
0,0,-0.767555,0.000292,0.767746,0.085431,-1.085372,1.017358,-0.739256,0.137589,0.001380,...,0.785015,0.195312,1280.490882,4.101562,217.777279,0.390625,396.711870,0.390625,9.912421,0.096901
1,1,-0.774570,0.000200,0.774699,0.081870,-0.455353,0.589303,-0.742787,0.101660,0.001259,...,0.922279,0.195312,1100.379045,4.101562,189.954955,0.390625,537.814808,0.390625,9.912909,0.065498
2,2,-0.742027,0.000155,0.742132,0.093685,-2.249484,5.730606,-0.719047,0.171880,0.000701,...,0.487908,0.195312,662.211986,1.953125,202.915105,1.953125,85.524840,3.906250,9.895929,0.080696
3,0,-0.759761,0.000324,0.759974,0.088499,-1.873579,2.158500,-0.738504,0.108002,0.000697,...,0.516621,2.148438,1140.060029,4.296875,133.199605,2.148438,81.489395,0.195312,9.906819,0.091274
4,1,-0.734621,0.000446,0.734924,0.089529,-1.572033,1.222080,-0.706721,0.094506,0.000972,...,0.798055,2.148438,1889.241905,2.148438,213.897176,2.148438,100.797740,0.390625,9.894033,0.086365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,1,0.924497,0.002197,0.925685,0.153218,-0.146561,-1.480618,0.982134,-0.211007,0.023073,...,12.222144,0.195312,14474.448752,0.195312,957.582700,0.781250,1070.050430,0.195312,9.773867,0.305658
296,2,-0.968711,0.000539,0.968989,0.191876,1.650751,6.201473,-0.800820,0.144593,0.016010,...,3.660535,0.195312,8002.691085,0.195312,1161.843794,0.781250,1335.891493,0.195312,9.774712,0.373336
297,0,-0.948044,0.000127,0.948111,0.041406,0.321092,-1.098520,-0.926872,0.265217,0.003794,...,0.513398,0.195312,258.480403,0.195312,208.133562,0.390625,170.040254,0.390625,9.746223,0.185695
298,1,0.936373,0.000705,0.936749,0.071118,0.631287,-1.501612,0.978660,-0.105988,0.003051,...,0.895019,0.390625,625.158303,0.976562,178.686917,0.781250,520.712408,0.195312,9.765918,0.228332


there are too many features (105)

In [192]:
# drop high correlated features
correlation_matrix = data.corr()

In [193]:
# Create a mask to exclude the diagonal (self-correlations)
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool), k=1)

# Find pairs of highly correlated features (correlation > 0.9)
high_corr_features = []
for column in correlation_matrix.columns:
    # Get the index of the features that are highly correlated with the current column
    high_corr = correlation_matrix[column][mask[:, correlation_matrix.columns.get_loc(column)]]
    for idx, corr_value in high_corr.items():
        if abs(corr_value) > 0.9:
            high_corr_features.append(column)

# Remove duplicates since a feature can be in the list multiple times
high_corr_features = list(set(high_corr_features))

# Drop the highly correlated features from the data
data_reduced = data.drop(columns=high_corr_features)

# Verify the result
print(f"Dropped columns with high correlation: {high_corr_features}")

Dropped columns with high correlation: ['Gyr_X_spectral_energy', 'Acc_X_rms', 'Quat_Z_max', 'Acc_Y_max', 'Acc_Z_rms', 'Quat_Y_max', 'Acc_Y_spectral_energy', 'Gyr_Y_spectral_energy', 'Gyr_X_range', 'Acc_SVM_var', 'Gyr_X_rms', 'Gyr_Z_spectral_energy', 'Mag_Z_range', 'Mag_Z_max', 'Gyr_X_var', 'Gyr_X_max', 'Gyr_Y_rms', 'Mag_X_max', 'Acc_Z_range', 'Gyr_Y_max', 'Acc_Z_mean', 'Gyr_Z_rms', 'Mag_Y_max', 'Quat_W_max', 'Acc_X_kurt', 'Acc_Z_spectral_energy', 'Gyr_Z_max']


In [194]:
data_reduced.head()

Unnamed: 0,condition,Quat_W_mean,Quat_W_var,Quat_W_rms,Quat_W_range,Quat_W_skew,Quat_W_kurt,Quat_X_mean,Quat_X_var,Quat_X_rms,...,Mag_Z_skew,Mag_Z_kurt,Acc_X_spectral_energy,Acc_X_dominant_freq,Acc_Y_dominant_freq,Acc_Z_dominant_freq,Gyr_X_dominant_freq,Gyr_Y_dominant_freq,Gyr_Z_dominant_freq,Acc_SVM_mean
0,0,-0.767555,0.000292,0.767746,0.085431,-1.085372,1.017358,0.137589,0.00138,0.142515,...,-1.508371,1.949684,0.325965,0.195312,0.195312,0.195312,4.101562,0.390625,0.390625,9.912421
1,1,-0.77457,0.0002,0.774699,0.08187,-0.455353,0.589303,0.10166,0.001259,0.107673,...,-0.98291,1.448894,0.173581,0.195312,0.195312,0.195312,4.101562,0.390625,0.390625,9.912909
2,2,-0.742027,0.000155,0.742132,0.093685,-2.249484,5.730606,0.17188,0.000701,0.173907,...,-2.479454,5.674852,0.212769,4.101562,4.101562,0.195312,1.953125,1.953125,3.90625,9.895929
3,0,-0.759761,0.000324,0.759974,0.088499,-1.873579,2.1585,0.108002,0.000697,0.111183,...,-2.071275,2.644999,0.187372,4.296875,4.296875,2.148438,4.296875,2.148438,0.195312,9.906819
4,1,-0.734621,0.000446,0.734924,0.089529,-1.572033,1.22208,0.094506,0.000972,0.099517,...,-1.836457,1.86022,0.193347,4.296875,0.195312,2.148438,2.148438,2.148438,0.390625,9.894033


In [195]:
data=data_reduced

In [188]:
#data.loc[data['condition'] == 0, 'condition'] = 1

In [196]:
# Split features and target
X = data.drop(columns=['condition'])  # Features (all columns except the target)
y = data['condition']  # Target (condition column)

In [197]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [198]:
# Standardize 
# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the training data
X_train_scaled = scaler.fit_transform(X_train)

# Transform the test data using the parameters learned from the training data
X_test_scaled = scaler.transform(X_test)

In [199]:
# Define the models
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(kernel='rbf', probability=True, random_state=42),
    "MLP": MLPClassifier(hidden_layer_sizes=(128, 64), activation='relu', max_iter=300, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=500, random_state=42)
}

In [200]:
# Train and evaluate each model using cross-validation
results = {}

for model_name, model in models.items():
    print(f"Evaluating {model_name}...")
    
    # Perform cross-validation (5-fold) and compute accuracy scores
    cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')  # Use your full dataset (X, y)
    
    # Get mean and standard deviation of cross-validation accuracy scores
    mean_cv_score = cv_scores.mean()
    std_cv_score = cv_scores.std()
    
    print(f"Cross-Validation Accuracy for {model_name}: {mean_cv_score:.2f} ± {std_cv_score:.2f}")
    
    # Train the model on the full training data and evaluate on the test set
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Print classification report
    print(f"Classification Report for {model_name}:\n{classification_report(y_test, y_pred)}")
    
    # Store results
    results[model_name] = mean_cv_score

Evaluating Random Forest...
Cross-Validation Accuracy for Random Forest: 0.43 ± 0.01
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.17      0.15      0.16        20
           1       0.35      0.45      0.39        20
           2       0.50      0.40      0.44        20

    accuracy                           0.33        60
   macro avg       0.34      0.33      0.33        60
weighted avg       0.34      0.33      0.33        60

Evaluating Gradient Boosting...
Cross-Validation Accuracy for Gradient Boosting: 0.43 ± 0.05
Classification Report for Gradient Boosting:
              precision    recall  f1-score   support

           0       0.28      0.35      0.31        20
           1       0.33      0.40      0.36        20
           2       0.64      0.35      0.45        20

    accuracy                           0.37        60
   macro avg       0.42      0.37      0.38        60
weighted avg       0.42      

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross-Validation Accuracy for Logistic Regression: 0.44 ± 0.05
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.47      0.45      0.46        20
           1       0.45      0.50      0.48        20
           2       0.47      0.45      0.46        20

    accuracy                           0.47        60
   macro avg       0.47      0.47      0.47        60
weighted avg       0.47      0.47      0.47        60


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

All these models have low accuracy,
Checking

Random Forrest

In [100]:
# Define the model
rf = RandomForestClassifier(random_state=42)

In [101]:
# Define hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ['balanced', None]
}

In [162]:
# Perform grid search
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


In [168]:
cv_scores = cross_val_score(rf, X, y, cv=5, scoring='accuracy')  # Use your full dataset (X, y)
    
# Get mean and standard deviation of cross-validation accuracy scores
mean_cv_score = cv_scores.mean()
std_cv_score = cv_scores.std()
print(f"Cross-Validation Accuracy for {model_name}: {mean_cv_score:.2f} ± {std_cv_score:.2f}")

Cross-Validation Accuracy for Logistic Regression: 0.43 ± 0.01


XGBoost

In [106]:
# Define the model
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')

NameError: name 'XGBClassifier' is not defined