In [19]:
from pathlib import Path
import os
import pandas as pd

STUDENT = 'yoric'

OUTLIERS_PATH = Path(r'C:\Users\yoric\ML4QS\ML4QS-GROUP99-Michael-FeatureEng\Python3Code\outliers2')
INTERMEDIATE_PATH = Path(r'C:\Users\yoric\ML4QS\ML4QS-GROUP99-Michael-FeatureEng\Python3Code\intermediate_datafiles')

os.chdir(r'C:\Users\yoric\ML4QS\ML4QS-GROUP99-Michael-FeatureEng\Python3Code')

EXPERIMENT_DIR = 'ML4QS-Vehicle-2'

from util.VisualizeDataset import VisualizeDataset
from Visualiser import Visualiser as Viz
from outlier_detector import OutlierDetector
from custom_imputer import CustomImputer
from util.util import ignore_actual_time, read_parquet, write_parquet
from DataLoader import PhyboxDatasetLoader

In [20]:
# Load intermediate df
intermediate_df = pd.read_parquet(INTERMEDIATE_PATH / 'ML4QS_imputed_results.parquet')

# Define leaking features to remove
leaking_features = [
    'location_phone_Latitude',
    'location_phone_Longitude', 
    'proximity_phone_Distance',
    'location_phone_Horizontal Accuracy',
    'location_phone_Height'
]

# Remove leaking features
intermediate_df = intermediate_df.drop(columns=leaking_features, errors='ignore')

In [22]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

def calculate_magnitude(df, prefix):
    x_col = f"{prefix}_X"
    y_col = f"{prefix}_Y" 
    z_col = f"{prefix}_Z"
    return np.sqrt(df[x_col]**2 + df[y_col]**2 + df[z_col]**2)

def temporal_aggregation_no_leak(series, window_size=120, agg_functions=['mean', 'median', 'min', 'max', 'std']):
    result = pd.DataFrame()
    for func in agg_functions:
        if func == 'std':
            result[f"{series.name}_{func}"] = series.rolling(
                window=window_size, min_periods=1
            ).std().shift(1).fillna(0)
        else:
            result[f"{series.name}_{func}"] = series.rolling(
                window=window_size, min_periods=1
            ).agg(func).shift(1).fillna(method='bfill')
    return result

def calculate_direction_changes(direction_series, threshold=10):
    direction_diff = direction_series.diff().fillna(0)
    direction_diff = np.where(direction_diff > 180, direction_diff - 360, direction_diff)
    direction_diff = np.where(direction_diff < -180, direction_diff + 360, direction_diff)
    significant_changes = np.abs(direction_diff) > threshold
    return pd.Series(significant_changes.astype(int), index=direction_series.index)

def calculate_acceleration_switches(acceleration_magnitude, threshold=0.1):
    acc_diff = acceleration_magnitude.diff().fillna(0)
    acc_state = np.where(acc_diff > threshold, 1,  
                       np.where(acc_diff < -threshold, -1, 0))  
    switches = np.abs(np.diff(acc_state, prepend=acc_state[0])) > 0
    return pd.Series(switches.astype(int), index=acceleration_magnitude.index)

def process_transportation_data(df, window_size=120):
    processed_sessions = []

    for session_id, session_data in df.groupby('id'):
        session_df = session_data.copy()
        
        session_df['acc_phone_magnitude'] = calculate_magnitude(session_df, 'acc_phone')
        session_df['lin_acc_phone_magnitude'] = calculate_magnitude(session_df, 'lin_acc_phone') 
        session_df['gyr_phone_magnitude'] = calculate_magnitude(session_df, 'gyr_phone')
        
        session_df['direction_changes'] = calculate_direction_changes(session_df['location_phone_Direction'])
        session_df['acc_switches'] = calculate_acceleration_switches(session_df['acc_phone_magnitude'])
        session_df['lin_acc_switches'] = calculate_acceleration_switches(session_df['lin_acc_phone_magnitude'])
        session_df['rotation_switches'] = calculate_acceleration_switches(session_df['gyr_phone_magnitude'])
        
        features_to_aggregate = [
            'acc_phone_X', 'acc_phone_Y', 'acc_phone_Z',
            'lin_acc_phone_X', 'lin_acc_phone_Y', 'lin_acc_phone_Z',
            'gyr_phone_X', 'gyr_phone_Y', 'gyr_phone_Z',
            'location_phone_Velocity', 'location_phone_Direction',
            'acc_phone_magnitude', 'lin_acc_phone_magnitude', 'gyr_phone_magnitude',
            'direction_changes', 'acc_switches', 'lin_acc_switches', 'rotation_switches'
        ]
        
        for feature in features_to_aggregate:
            if feature in session_df.columns:
                aggregated = temporal_aggregation_no_leak(session_df[feature], window_size)
                session_df = pd.concat([session_df, aggregated], axis=1)
        
        session_df['session_direction_change_rate'] = session_df['direction_changes'].mean()
        session_df['session_acc_switch_rate'] = session_df['acc_switches'].mean()
        session_df['session_lin_acc_switch_rate'] = session_df['lin_acc_switches'].mean()
        session_df['session_rotation_switch_rate'] = session_df['rotation_switches'].mean()
        session_df['session_avg_velocity'] = session_df['location_phone_Velocity'].mean()
        session_df['session_velocity_std'] = session_df['location_phone_Velocity'].std()
        
        processed_sessions.append(session_df)

    result_df = pd.concat(processed_sessions, ignore_index=True)
    return result_df

# === Example usage ===
# Load your intermediate dataset
# intermediate_df = pd.read_parquet('path/to/your/intermediate_file.parquet')

# Process the data without Fourier features
# session_features = process_transportation_data(intermediate_df, window_size=120)

# Save the processed dataset
#session_features.to_csv('C:/Users/yoric/ML4QS/session_features_imputed.csv', index=False)

# Load your intermediate dataset
#intermediate_df = pd.read_parquet('C:/Users/yoric/ML4QS/intermediate_df.parquet')

# Process the data
session_features = process_transportation_data(intermediate_df, window_size=120)

# Save the result
session_features.to_csv('C:/Users/yoric/ML4QS/session_features_imputed.csv', index=False)


In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# -------------------------------
# Load data and remove leaking features
# -------------------------------
df = pd.read_csv("/Users/yoric/ML4QS/session_features_imputed.csv")

print("Original DataFrame shape:", df.shape)
print("Available columns before feature removal:")
print(df.columns.tolist())

# Define leaking features to remove
leaking_features = [
    'location_phone_Latitude',
    'location_phone_Longitude', 
    'location_phone_Direction',
    'location_phone_Velocity',
    'proximity_phone_Distance',
    'location_phone_Horizontal Accuracy',
    'location_phone_Height',
    'location_phone_Vertical Accuracy',
    'lin_acc_phone_Z',
    'lin_acc_phone_Y'
]

# Remove leaking features
print(f"\nRemoving {len(leaking_features)} leaking features...")
features_found = [f for f in leaking_features if f in df.columns]
features_not_found = [f for f in leaking_features if f not in df.columns]

if features_found:
    print(f"Removing features: {features_found}")
    df = df.drop(columns=features_found)
else:
    print("No leaking features found in the dataset")

if features_not_found:
    print(f"Features not found (already removed?): {features_not_found}")

print(f"DataFrame shape after removing leaking features: {df.shape}")

# -------------------------------
# Inspect remaining columns
# -------------------------------
print("\nRemaining columns:")
print(df.columns.tolist())
print(f"\nDataFrame shape: {df.shape}")

# -------------------------------
# FIND ALL LABEL COLUMNS AUTOMATICALLY
# -------------------------------
# Find all label columns (assuming they start with 'label')
label_columns = [col for col in df.columns if col.startswith('label')]
print(f"\nFound label columns: {label_columns}")

# Extract transportation modes from label column names
transportation_modes = [col.replace('label', '') for col in label_columns]
print(f"Transportation modes: {transportation_modes}")

def create_target_label(row):
    """Create target label from all available label columns"""
    for i, col in enumerate(label_columns):
        if row.get(col, 0) == 1:
            return transportation_modes[i]
    return 'unknown'

df['transport_mode'] = df.apply(create_target_label, axis=1)

# Show all available transport modes and their counts
print(f"\nAll transport modes found in data:")
mode_counts = df['transport_mode'].value_counts()
print(mode_counts)

# Filter out 'unknown' samples (if any)
df = df[df['transport_mode'] != 'unknown']

print(f"\nAfter filtering out unknown samples: {df.shape[0]} samples")
print(f"Final class distribution:\n{df['transport_mode'].value_counts()}")

# Check session distribution by transport mode
print(f"\nSession distribution analysis:")
session_counts = df.groupby(['id', 'transport_mode']).size().unstack(fill_value=0)

sessions_per_mode = {}
for mode in df['transport_mode'].unique():
    sessions_with_mode = (session_counts[mode] > 0).sum() if mode in session_counts.columns else 0
    sessions_per_mode[mode] = sessions_with_mode
    print(f"{mode} sessions: {sessions_with_mode}")

print(f"Total unique sessions: {df['id'].nunique()}")

# Warn about modes with very few sessions
print(f"\n⚠️  Session distribution warnings:")
for mode, count in sessions_per_mode.items():
    if count < 3:
        print(f"WARNING: {mode} has only {count} session(s) - may cause issues in train/test split")

# -------------------------------
# Feature selection
# -------------------------------
# Find timestamp column (if exists)
time_like_columns = [col for col in df.columns if any(keyword in col.lower() for keyword in ['time', 'date', 'stamp'])]
timestamp_col = None
possible_timestamp_cols = ['timestamp', 'time', 'datetime', 'date_time', 'session_time']

for col in possible_timestamp_cols:
    if col in df.columns:
        timestamp_col = col
        break

if timestamp_col is None and time_like_columns:
    timestamp_col = time_like_columns[0]

# Ensure timestamp column is excluded from features to prevent data leakage
exclude_cols = ['id', 'transport_mode']
if timestamp_col:
    exclude_cols.append(timestamp_col)

feature_cols = [
    col for col in df.columns
    if not col.startswith('label')
    and col not in exclude_cols
]

print(f"\nUsing {len(feature_cols)} features (after removing leaking features)")
if timestamp_col:
    print(f"Timestamp column excluded: '{timestamp_col}'")

# Verify no leaking features remain
remaining_leaking = [f for f in leaking_features if f in feature_cols]
if remaining_leaking:
    print(f"⚠️  WARNING: Some leaking features still present: {remaining_leaking}")
else:
    print("✅ Confirmed: No leaking features in final feature set")

# -------------------------------
# Handle non-numeric columns (especially timedelta)
# -------------------------------
print(f"\nChecking data types in features...")
print(f"Feature dtypes (first 10):")
for col in feature_cols[:10]:
    if col in df.columns:
        dtype = df[col].dtype
        print(f"  {col}: {dtype}")
        
        if dtype == 'object':
            sample_values = df[col].dropna().head(3).tolist()
            print(f"    Sample values: {sample_values}")

# Convert timedelta columns to numeric (seconds)
timedelta_cols = []
problematic_cols = []

for col in feature_cols:
    if col in df.columns:
        if df[col].dtype == 'object':
            sample_val = str(df[col].dropna().iloc[0]) if len(df[col].dropna()) > 0 else ""
            if 'days' in sample_val and ':' in sample_val:
                print(f"Converting timedelta column '{col}' to seconds...")
                try:
                    df[col] = pd.to_timedelta(df[col]).dt.total_seconds()
                    timedelta_cols.append(col)
                except Exception as e:
                    print(f"Failed to convert {col}: {e}")
                    problematic_cols.append(col)

if timedelta_cols:
    print(f"Converted {len(timedelta_cols)} timedelta columns to seconds: {timedelta_cols}")

# Remove non-numeric columns
non_numeric_cols = []
for col in feature_cols:
    if col in df.columns:
        if df[col].dtype == 'object' or not pd.api.types.is_numeric_dtype(df[col]):
            non_numeric_cols.append(col)

non_numeric_cols.extend(problematic_cols)
non_numeric_cols = list(set(non_numeric_cols))

if non_numeric_cols:
    print(f"⚠️  WARNING: Found non-numeric columns that will be excluded: {non_numeric_cols}")
    feature_cols = [col for col in feature_cols if col not in non_numeric_cols]
    print(f"Updated feature count: {len(feature_cols)}")

# -------------------------------
# IMPROVED SESSION-LEVEL SPLIT FOR ALL CLASSES
# -------------------------------
print("\n" + "="*50)
print("IMPLEMENTING SESSION-LEVEL SPLIT FOR ALL TRANSPORT MODES")
print("="*50)

# Get unique session IDs for each transport mode
session_split_info = {}
all_train_sessions = []
all_test_sessions = []

np.random.seed(42)

for mode in df['transport_mode'].unique():
    mode_sessions = df[df['transport_mode'] == mode]['id'].unique()
    mode_sessions_shuffled = np.random.permutation(mode_sessions)
    
    n_sessions = len(mode_sessions_shuffled)
    
    # Handle cases with very few sessions
    if n_sessions == 1:
        # If only 1 session, put it in training
        train_sessions = mode_sessions_shuffled
        test_sessions = []
        print(f"⚠️  {mode}: Only 1 session - putting in training set")
    elif n_sessions == 2:
        # If only 2 sessions, put 1 in each
        train_sessions = mode_sessions_shuffled[:1]
        test_sessions = mode_sessions_shuffled[1:]
        print(f"⚠️  {mode}: Only 2 sessions - 1 train, 1 test")
    else:
        # Normal 80/20 split
        split_idx = max(1, int(0.8 * n_sessions))  # Ensure at least 1 in training
        train_sessions = mode_sessions_shuffled[:split_idx]
        test_sessions = mode_sessions_shuffled[split_idx:]
    
    session_split_info[mode] = {
        'total': n_sessions,
        'train': len(train_sessions),
        'test': len(test_sessions)
    }
    
    all_train_sessions.extend(train_sessions)
    all_test_sessions.extend(test_sessions)

print(f"\nSession split breakdown:")
for mode, info in session_split_info.items():
    print(f"  {mode}: {info['total']} total -> {info['train']} train, {info['test']} test")

print(f"\nTotal sessions:")
print(f"  Train: {len(all_train_sessions)}")
print(f"  Test: {len(all_test_sessions)}")

# Create train and test datasets
df_train = df[df['id'].isin(all_train_sessions)].copy()
df_test = df[df['id'].isin(all_test_sessions)].copy()

# Verify no session overlap
train_session_set = set(df_train['id'].unique())
test_session_set = set(df_test['id'].unique())
session_overlap = train_session_set.intersection(test_session_set)

print(f"\n🔍 VERIFICATION:")
print(f"Train sessions: {len(train_session_set)}")
print(f"Test sessions: {len(test_session_set)}")
print(f"Session overlap: {len(session_overlap)}")

if len(session_overlap) == 0:
    print("✅ SUCCESS: No session overlap between train and test!")
else:
    print("🚨 ERROR: Session overlap detected!")

# Prepare features and targets
X_train = df_train[feature_cols]
X_test = df_test[feature_cols]
y_train = df_train['transport_mode']
y_test = df_test['transport_mode']

print(f"\nFinal dataset sizes:")
print(f"Train samples: {X_train.shape[0]}")
print(f"Test samples: {X_test.shape[0]}")
print(f"Train class distribution:\n{y_train.value_counts()}")
print(f"Test class distribution:\n{y_test.value_counts()}")

# Check for missing values
print(f"\nData quality check:")
print(f"Missing values in train features: {X_train.isnull().sum().sum()}")
print(f"Infinite values in train features: {np.isinf(X_train.select_dtypes(include=[np.number])).sum().sum()}")

# Final cleanup of any remaining object columns
object_cols = X_train.select_dtypes(include=['object']).columns
if len(object_cols) > 0:
    print(f"⚠️  Removing remaining object columns: {list(object_cols)}")
    X_train = X_train.select_dtypes(exclude=['object'])
    X_test = X_test.select_dtypes(exclude=['object'])

print(f"Final feature matrix shape: Train {X_train.shape}, Test {X_test.shape}")

# -------------------------------
# Preprocessing
# -------------------------------
print(f"\nStarting preprocessing...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Label encoding for multi-class classification
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

print(f"\nLabel encoding for {len(label_encoder.classes_)}-class classification:")
for i, class_name in enumerate(label_encoder.classes_):
    print(f"  {class_name} = {i}")

print(f"Train labels distribution: {np.bincount(y_train_encoded)}")
print(f"Test labels distribution: {np.bincount(y_test_encoded)}")

# -------------------------------
# Train XGBoost for MULTI-CLASS Classification
# -------------------------------
dtrain = xgb.DMatrix(X_train_scaled, label=y_train_encoded)
dtest = xgb.DMatrix(X_test_scaled, label=y_test_encoded)

# Calculate scale_pos_weight for class imbalance (for multi-class, we'll use different approach)
class_counts = np.bincount(y_train_encoded)
n_classes = len(label_encoder.classes_)

params = {
    'objective': 'multi:softprob',
    'num_class': n_classes,
    'max_depth': 6,  # Reduced from 8 to prevent overfitting with more classes
    'learning_rate': 0.05,  # Slightly higher learning rate
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'eval_metric': 'mlogloss',
    'reg_alpha': 0.1,  # L1 regularization
    'reg_lambda': 1.0   # L2 regularization
}

print(f"\nTraining XGBoost model for {n_classes}-class classification...")
print(f"Classes: {list(label_encoder.classes_)}")

model = xgb.train(
    params,
    dtrain,
    num_boost_round=20,  # Increased rounds for more complex problem
    evals=[(dtrain, 'train'), (dtest, 'eval')],
    early_stopping_rounds=100,
    verbose_eval=50
)

# -------------------------------
# Evaluation
# -------------------------------
y_pred_proba = model.predict(dtest)
y_pred_encoded = np.argmax(y_pred_proba, axis=1)

# Convert back to original labels
y_pred_labels = label_encoder.inverse_transform(y_pred_encoded)
y_test_labels = label_encoder.inverse_transform(y_test_encoded)

accuracy = accuracy_score(y_test_labels, y_pred_labels)
print(f"\nTest Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test_labels, y_pred_labels, zero_division=0))

# Confusion Matrix
plt.figure(figsize=(12, 10))
unique_labels = sorted(label_encoder.classes_)
cm = confusion_matrix(y_test_labels, y_pred_labels, labels=unique_labels)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=unique_labels, 
            yticklabels=unique_labels)
plt.title(f'Confusion Matrix: {n_classes}-Class Transportation Mode Classification')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# -------------------------------
# Feature Importance
# -------------------------------
feature_importance = model.get_score(importance_type='weight')
final_feature_names = X_train.columns.tolist()
importance_df = pd.DataFrame({
    'feature': final_feature_names,
    'importance': [feature_importance.get(f'f{i}', 0) for i in range(len(final_feature_names))]
}).sort_values('importance', ascending=False)

plt.figure(figsize=(12, 8))
top_features = importance_df.head(20)
sns.barplot(data=top_features, x='importance', y='feature')
plt.title(f'Top 20 Most Important Features: {n_classes}-Class Classification')
plt.xlabel('Feature Importance (Weight)')
plt.tight_layout()
plt.show()

print("\nMost Important Features:")
print(importance_df.head(16))

# -------------------------------
# Additional Analysis
# -------------------------------
print(f"\n" + "="*50)
print("ADDITIONAL ANALYSIS")
print("="*50)

# Per-class accuracy analysis
print(f"\nPer-class Performance Analysis:")
for class_name in label_encoder.classes_:
    class_mask = y_test_labels == class_name
    n_samples = np.sum(class_mask)
    if n_samples > 0:
        class_accuracy = (y_pred_labels[class_mask] == class_name).mean()
        print(f"  {class_name:>12}: {class_accuracy:.4f} accuracy ({n_samples:>4} samples)")
    else:
        print(f"  {class_name:>12}: No test samples")

# Show prediction confidence distribution
max_probabilities = np.max(y_pred_proba, axis=1)
print(f"\nPrediction Confidence Distribution:")
print(f"Mean confidence: {max_probabilities.mean():.4f}")
print(f"Std confidence: {max_probabilities.std():.4f}")
print(f"Min confidence: {max_probabilities.min():.4f}")
print(f"Max confidence: {max_probabilities.max():.4f}")

# Sample predictions with probabilities
print(f"\nSample Predictions (showing top 3 probabilities for each):")
sample_indices = np.random.choice(len(y_test_labels), min(10, len(y_test_labels)), replace=False)
for i in sample_indices:
    true_label = y_test_labels[i]
    pred_label = y_pred_labels[i]
    
    # Get top 3 predictions for this sample
    top_3_indices = np.argsort(y_pred_proba[i])[-3:][::-1]
    top_3_probs = [(label_encoder.classes_[idx], y_pred_proba[i][idx]) for idx in top_3_indices]
    
    print(f"True: {true_label:>12}, Pred: {pred_label:>12}")
    print(f"  Top 3: {top_3_probs[0][0]}({top_3_probs[0][1]:.3f}), {top_3_probs[1][0]}({top_3_probs[1][1]:.3f}), {top_3_probs[2][0]}({top_3_probs[2][1]:.3f})")

print(f"\n🎯 FINAL SUMMARY:")
print(f"Multi-class classification accuracy: {accuracy:.4f}")
print(f"Number of classes: {n_classes}")
print(f"Classes: {list(label_encoder.classes_)}")
print(f"Features used: {X_train.shape[1]}")
print(f"Total samples: {len(df)} (Train: {len(df_train)}, Test: {len(df_test)})")

# Identify problematic classes (those with very low performance)
print(f"\n⚠️  Classes that may need attention:")
for class_name in label_encoder.classes_:
    class_mask = y_test_labels == class_name
    n_samples = np.sum(class_mask)
    if n_samples > 0:
        class_accuracy = (y_pred_labels[class_mask] == class_name).mean()
        if class_accuracy < 0.5:
            print(f"  {class_name}: {class_accuracy:.4f} accuracy (low performance)")
        elif n_samples < 10:
            print(f"  {class_name}: Only {n_samples} test samples (may need more data)")

Original DataFrame shape: (53043, 130)
Available columns before feature removal:
['timestamp', 'id', 'acc_phone_X', 'acc_phone_Y', 'acc_phone_Z', 'lin_acc_phone_X', 'lin_acc_phone_Y', 'lin_acc_phone_Z', 'gyr_phone_X', 'gyr_phone_Y', 'gyr_phone_Z', 'location_phone_Velocity', 'location_phone_Direction', 'location_phone_Vertical Accuracy', 'mag_phone_X', 'mag_phone_Y', 'mag_phone_Z', 'labeltram', 'labeltrain', 'labelwalking', 'labelmetro', 'labelbus', 'labelcar', 'labelbike', 'original_time', 'time_diff', 'shifted_time', 'acc_phone_magnitude', 'lin_acc_phone_magnitude', 'gyr_phone_magnitude', 'direction_changes', 'acc_switches', 'lin_acc_switches', 'rotation_switches', 'acc_phone_X_mean', 'acc_phone_X_median', 'acc_phone_X_min', 'acc_phone_X_max', 'acc_phone_X_std', 'acc_phone_Y_mean', 'acc_phone_Y_median', 'acc_phone_Y_min', 'acc_phone_Y_max', 'acc_phone_Y_std', 'acc_phone_Z_mean', 'acc_phone_Z_median', 'acc_phone_Z_min', 'acc_phone_Z_max', 'acc_phone_Z_std', 'lin_acc_phone_X_mean', 'lin

Missing values in train features: 104202
Infinite values in train features: 0
Final feature matrix shape: Train (32564, 115), Test (20479, 115)

Starting preprocessing...

Label encoding for 7-class classification:
  bike = 0
  bus = 1
  car = 2
  metro = 3
  train = 4
  tram = 5
  walking = 6
Train labels distribution: [ 1204  3031  3197  3257 11187  1890  8798]
Test labels distribution: [2423 1915 4136 3061 4522  808 3614]

Training XGBoost model for 7-class classification...
Classes: ['bike', 'bus', 'car', 'metro', 'train', 'tram', 'walking']
[0]	train-mlogloss:1.77407	eval-mlogloss:1.89353
[19]	train-mlogloss:0.53408	eval-mlogloss:1.50738

Test Accuracy: 0.5839

Classification Report:
              precision    recall  f1-score   support

        bike       1.00      0.74      0.85      2423
         bus       0.00      0.00      0.00      1915
         car       0.00      0.00      0.00      4136
       metro       0.52      0.98      0.68      3061
       train       1.00      1.

In [None]:
#here we can see what the predictions on the test set actually were
predicted_counts = pd.Series(y_pred_labels).value_counts()
print("Frequency of predicted classes:")
print(predicted_counts)

In [None]:
#Labels were wrong first should be ok now
print(y_test.head())
print(y_pred_labels[:5])



In [None]:
import numpy as np
# For a few samples
for i in range(5):
    print(f"Sample {i} predicted probabilities: {y_pred_proba[i]}")
    print(f"Predicted class: {y_pred_labels[i]}, True class: {y_test.iloc[i]}")