<a href="https://colab.research.google.com/github/MilesCrossen/STEP-Bayesian-Model/blob/main/STEPBayesianProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Upload the input files (the .plt trajectory and labels.txt) into the Colab
# runtime so the cells below can open them by bare filename.
from google.colab import files
# `uploaded` holds whatever files.upload() returns; no later cell in this
# notebook reads it.
# NOTE(review): the recorded output of this cell shows labels.txt being saved
# as "labels (1).txt", while the later cells open 'labels.txt' - confirm the
# file actually being read is the intended one.
uploaded = files.upload()

Saving labels.txt to labels (1).txt
Saving 20080402060926.plt to 20080402060926.plt


In [None]:
# GPS file parsing (run this cell first)
import csv
import os
from math import radians, cos, sin, asin, sqrt
from datetime import datetime


def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance between two points, in kilometres.

    Args:
        lat1: Latitude of the first point, in degrees.
        lon1: Longitude of the first point, in degrees.
        lat2: Latitude of the second point, in degrees.
        lon2: Longitude of the second point, in degrees.

    Returns:
        The haversine distance in kilometres, computed on a sphere of
        radius 6371 km.
    """
    R = 6371  # sphere radius in km used for the distance
    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)
    a = sin(dlat / 2) ** 2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2) ** 2
    # Floating-point rounding can leave `a` a hair outside [0, 1] for
    # (near-)antipodal points; without the clamp sqrt/asin would raise a
    # "math domain error" instead of returning half the circumference.
    a = min(1.0, max(0.0, a))
    c = 2 * asin(sqrt(a))
    return R * c


# Convert one .plt trajectory log into a CSV with per-point distance, speed
# and acceleration derived from consecutive fixes.
plt_filename = '20080402060926.plt'
basename = os.path.splitext(plt_filename)[0]
output_csv = f'output_{basename}.csv'

with open(plt_filename, 'r') as f:
    lines = f.readlines()

# The first six lines are skipped (presumably the .plt header - TODO confirm).
lines = lines[6:]

with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(
        ['Latitude', 'Longitude', 'Elevation', 'Timestamp', 'Distance (km)', 'Speed (km/h)', 'Acceleration (m/s²)'])

    # State carried from the previous valid point; None until one exists.
    prev_lat = prev_lon = prev_time = prev_speed = None

    for line in lines:
        parts = line.strip().split(',')
        if len(parts) < 7:
            continue

        ele = parts[3]
        dt_str = f"{parts[5]} {parts[6]}"

        # A malformed coordinate or timestamp skips just this row instead of
        # aborting the whole conversion.
        try:
            lat = float(parts[0])
            lon = float(parts[1])
            point_time = datetime.strptime(dt_str, "%Y-%m-%d %H:%M:%S")
        except ValueError:
            continue

        # None means "not computable for this row"; a genuine 0.0 (e.g. a
        # stationary point) is a real value and must not be blanked out.
        distance = speed = acceleration = None
        if prev_lat is not None:
            distance = haversine(prev_lat, prev_lon, lat, lon)
            time_diff_sec = (point_time - prev_time).total_seconds()
            if time_diff_sec > 0:
                time_diff = time_diff_sec / 3600  # hours, so speed is in km/h
                speed = distance / time_diff

                if prev_speed is not None:
                    # Convert both speeds from km/h to m/s so the result is m/s².
                    speed_ms = speed * 1000 / 3600
                    prev_speed_ms = prev_speed * 1000 / 3600
                    acceleration = abs((speed_ms - prev_speed_ms) / time_diff_sec)

        writer.writerow([lat, lon, ele, dt_str,
                         '' if distance is None else round(distance, 5),
                         '' if speed is None else round(speed, 2),
                         '' if acceleration is None else round(acceleration, 3)])

        prev_lat, prev_lon, prev_time = lat, lon, point_time
        # Keep the last known speed when this row had none (first point or a
        # non-increasing timestamp), so the next acceleration still has a base.
        if speed is not None:
            prev_speed = speed

print(f"Done. Data written to {output_csv}")

Done. Data written to output_20080402060926.csv


In [None]:
# Bayesian update classification (run this cell second)
import pandas as pd
from scipy.stats import norm

def bayesian_classify_file(input_csv, output_csv, weight=0.5):
    """Label every row of a processed GPS CSV with a transport mode.

    Each row's speed and acceleration are scored against a Gaussian profile
    per mode, the scores are combined with the running belief via Bayes'
    rule, and the resulting posterior becomes the prior for the next row.

    Args:
        input_csv: CSV to read; expected to contain 'Speed (km/h)' and
            'Acceleration (m/s²)' columns.
        output_csv: Path the frame is written to, with an added 'Mode' column.
        weight: Exponent applied to the speed likelihood; the acceleration
            likelihood gets exponent ``1 - weight``.
    """
    EPSILON = 1e-8  # floor for likelihoods and priors so no mode is ever ruled out
    speed_column = 'Speed (km/h)'
    accel_column = 'Acceleration (m/s²)'

    # (mean, standard deviation) of the Gaussian profile per mode and feature.
    modes = {
        "walking": {"speed": (5, 1), "accel": (0.5, 0.4)},
        "cycling": {"speed": (15, 4), "accel": (1.0, 0.5)},
        "bus": {"speed": (20, 6), "accel": (1.5, 0.7)},
        "driving": {"speed": (50, 15), "accel": (2.5, 1.0)},
        "train": {"speed": (100, 20), "accel": (1.0, 0.6)}
    }

    def _value_or_zero(record, column):
        # Missing or blank cells are treated as 0.0.
        cell = record[column]
        if pd.notna(cell) and cell != '':
            return float(cell)
        return 0.0

    df = pd.read_csv(input_csv)
    belief = {mode: 1.0 / len(modes) for mode in modes}
    predictions = []

    for _, record in df.iterrows():
        try:
            speed = _value_or_zero(record, speed_column)
            accel = _value_or_zero(record, accel_column)
        except (ValueError, KeyError):
            # Unusable row: keep the output aligned with the frame.
            predictions.append('')
            continue

        # Unnormalised posterior: floored likelihood times floored prior.
        unnormalised = {}
        for mode, profile in modes.items():
            speed_mean, speed_std = profile["speed"]
            accel_mean, accel_std = profile["accel"]
            speed_pdf = norm.pdf(speed, loc=speed_mean, scale=speed_std)
            accel_pdf = norm.pdf(accel, loc=accel_mean, scale=accel_std)
            likelihood = max((speed_pdf ** weight) * (accel_pdf ** (1 - weight)), EPSILON)
            unnormalised[mode] = likelihood * max(belief[mode], EPSILON)

        evidence = sum(unnormalised.values())
        belief = {mode: score / evidence for mode, score in unnormalised.items()}
        predictions.append(max(belief, key=belief.get))

    df['Mode'] = predictions
    df.to_csv(output_csv, index=False)
    print(f"Bayesian classification complete. Results written to {output_csv}")

if __name__ == "__main__":
    # weight = 1 gives the acceleration likelihood exponent 0, so the
    # classification is driven by speed alone. The CSV is rewritten in place.
    weight = 1
    bayesian_classify_file(
        input_csv='output_20080402060926.csv',
        output_csv='output_20080402060926.csv',
        weight=weight,
    )

Bayesian classification complete. Results written to output_20080402060926.csv


In [None]:
# Broad optimisation: run this cell third to find the optimal weightings
import pandas as pd
import numpy as np
import glob
from datetime import datetime
from scipy.stats import norm

# Mode normalization (same as TestingResults.py)
# Canonical mode name -> label spellings that should collapse onto it.
# NOTE(review): 'taxi' maps to itself and is not one of the classifier's
# modes, so taxi-labelled rows can never be counted correct - confirm intent.
mode_normalization = {
    alias: canonical
    for canonical, aliases in (
        ('walking', ('walk', 'walking')),
        ('taxi', ('taxi',)),
        ('bus', ('bus',)),
        ('train', ('train',)),
        ('driving', ('car', 'driving')),
        ('cycling', ('bicycle', 'bike', 'cycling')),
    )
    for alias in aliases
}


def normalize_mode(mode):
    """Return the canonical name for a transport-mode label.

    The value is stringified, stripped and lower-cased; known aliases are
    mapped through ``mode_normalization`` and anything else is returned in
    its cleaned form.
    """
    cleaned = str(mode).strip().lower()
    return mode_normalization.get(cleaned, cleaned)


def parse_labels_file(filename):
    """Read ground-truth intervals from a labels file.

    Only lines that split into exactly five whitespace-separated fields
    (start date, start time, end date, end time, mode) are used; lines whose
    timestamps do not match ``%Y/%m/%d %H:%M:%S`` are skipped.

    Returns:
        A list of ``(start, end, mode)`` tuples with normalized mode names,
        or an empty list (after printing a warning) if the file is missing.
    """
    stamp_format = "%Y/%m/%d %H:%M:%S"

    try:
        with open(filename, 'r') as handle:
            raw_lines = handle.readlines()
    except FileNotFoundError:
        print(f"Warning: {filename} not found. Cannot evaluate accuracy.")
        return []

    intervals = []
    for raw in raw_lines:
        fields = raw.strip().split()
        if len(fields) != 5:
            continue
        start_date, start_clock, end_date, end_clock, label = fields
        mode = normalize_mode(label)
        try:
            start = datetime.strptime(f"{start_date} {start_clock}", stamp_format)
            end = datetime.strptime(f"{end_date} {end_clock}", stamp_format)
        except ValueError:
            continue
        intervals.append((start, end, mode))
    return intervals


def get_actual_mode(timestamp, labels):
    """Return the labelled mode covering a timestamp, or None.

    Args:
        timestamp: Timestamp string in ``%Y-%m-%d %H:%M:%S`` format.
        labels: Iterable of ``(start, end, mode)`` tuples with datetime bounds.

    Returns:
        The mode of the first interval with ``start <= timestamp <= end``
        (bounds inclusive), or None when no interval matches or the
        timestamp cannot be parsed.
    """
    try:
        ts = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
    except (ValueError, TypeError):
        # TypeError covers non-string input, e.g. the NaN pandas yields for a
        # blank Timestamp cell; treat it like any other unparseable value.
        return None

    for start, end, mode in labels:
        if start <= ts <= end:
            return mode
    return None


def classify_with_weight(df, weight, labels):
    """Score one speed/acceleration weighting against the ground truth.

    Rows are classified sequentially with a weighted Bayesian update (each
    posterior becomes the next prior) and the predictions are compared with
    the labelled mode for the row's timestamp.

    Args:
        df: Frame with 'Speed (km/h)' and 'Timestamp' columns.
        weight: Exponent applied to the speed likelihood; the acceleration
            likelihood gets exponent ``1 - weight``.
        labels: ``(start, end, mode)`` tuples as returned by
            ``parse_labels_file``.

    Returns:
        Accuracy as a percentage over the rows that have a ground-truth
        label, or 0 when no row could be evaluated.
    """
    # (mean, standard deviation) of the Gaussian profile per mode and feature.
    modes = {
        "walking": {"speed": (5, 1), "accel": (0.5, 0.4)},
        "cycling": {"speed": (15, 4), "accel": (1.0, 0.5)},
        "bus": {"speed": (20, 6), "accel": (1.5, 0.7)},
        "driving": {"speed": (50, 15), "accel": (2.5, 1.0)},
        "train": {"speed": (100, 20), "accel": (1.0, 0.6)}
    }

    EPSILON = 1e-8  # floor for likelihoods and priors
    KMH_TO_MS = 1000 / 3600
    correct = 0
    total_evaluated = 0

    # Initialize uniform priors
    prior_probs = {mode: 1.0 / len(modes) for mode in modes}

    prev_speed = None
    prev_time = None

    for i, row in df.iterrows():
        try:
            speed = float(row['Speed (km/h)']) if pd.notna(row['Speed (km/h)']) and row['Speed (km/h)'] != '' else 0.0
            timestamp = row['Timestamp']
            time = pd.to_datetime(timestamp)
        except (ValueError, KeyError):
            continue

        # Acceleration magnitude in m/s². Speeds are in km/h, so the delta is
        # converted to m/s first; these are the same profiles that
        # bayesian_classify_file compares against the CSV's
        # 'Acceleration (m/s²)' column, so the units must match.
        if prev_speed is not None and prev_time is not None:
            time_diff = (time - prev_time).total_seconds()
            if time_diff > 0:
                accel = abs((speed - prev_speed) * KMH_TO_MS / time_diff)
            else:
                accel = 0.0
        else:
            accel = 0.0

        # Calculate likelihoods for each mode
        likelihoods = {}
        for mode, stats in modes.items():
            mu_s, sigma_s = stats["speed"]
            mu_a, sigma_a = stats["accel"]

            l_speed = norm.pdf(speed, loc=mu_s, scale=sigma_s)
            l_accel = norm.pdf(accel, loc=mu_a, scale=sigma_a)

            # Apply weighting: weight for speed, (1-weight) for acceleration
            weighted_likelihood = (l_speed ** weight) * (l_accel ** (1 - weight))
            likelihoods[mode] = max(weighted_likelihood, EPSILON)

        # Calculate evidence (normalization factor)
        evidence = sum(max(prior_probs[mode], EPSILON) * likelihoods[mode] for mode in modes)

        # Calculate posteriors using Bayes' theorem
        posteriors = {}
        for mode in modes:
            prior = max(prior_probs[mode], EPSILON)
            posteriors[mode] = (likelihoods[mode] * prior) / evidence

        # Update priors for next iteration
        prior_probs = posteriors.copy()

        # Get prediction (mode with highest posterior)
        predicted_mode = max(posteriors, key=posteriors.get)

        # Only rows with a ground-truth label count towards the accuracy.
        actual_mode = get_actual_mode(timestamp, labels)
        if actual_mode:
            total_evaluated += 1
            if normalize_mode(predicted_mode) == normalize_mode(actual_mode):
                correct += 1

        prev_speed = speed
        prev_time = time

    accuracy = (correct / total_evaluated * 100) if total_evaluated > 0 else 0
    return accuracy


def grid_search_on_files():
    """Grid-search the speed weight over every ``output_*.csv`` file.

    For each file found in the working directory, every weight from 0.0 to
    1.0 in steps of 0.05 is scored with ``classify_with_weight`` against
    'labels.txt'. The per-weight accuracies are written to
    ``Optimal_Weightings_<dataset>.csv`` and a summary is printed.
    Returns None.
    """
    csv_paths = glob.glob("output_*.csv")
    if not csv_paths:
        print("No output_*.csv files found. Make sure you've run GPSFileParsing.py first.")
        return

    candidate_weights = np.arange(0.0, 1.01, 0.05)
    combined_rows = []

    print("Starting grid search optimization...")
    print(f"Testing {len(candidate_weights)} different weight values on {len(csv_paths)} files")
    print("-" * 60)

    for csv_path in csv_paths:
        print(f"\nProcessing {csv_path}...")

        try:
            df = pd.read_csv(csv_path)
        except Exception as e:
            print(f"Error reading {csv_path}: {e}")
            continue

        # Dataset identifier used to name the per-file results CSV.
        dataset_id = csv_path.replace('output_', '').replace('.csv', '')

        # The same labels file is used for every dataset.
        labels = parse_labels_file('labels.txt')
        if not labels:
            print(f"No labels found for {csv_path}. Skipping accuracy evaluation.")
            continue

        top_weight, top_accuracy = 0.0, 0.0
        rows_for_file = []

        for weight in candidate_weights:
            try:
                accuracy = classify_with_weight(df, weight, labels)
                rows_for_file.append({
                    'File': csv_path,
                    'Weight': round(weight, 2),
                    'Accuracy': round(accuracy, 2)
                })
                # Strict comparison: on ties the earliest weight is kept.
                if accuracy > top_accuracy:
                    top_accuracy, top_weight = accuracy, weight
            except Exception as e:
                print(f"Error with weight {weight}: {e}")
                continue

        combined_rows.extend(rows_for_file)
        print(f"Best result for {csv_path}: Weight={top_weight:.2f}, Accuracy={top_accuracy:.2f}%")

        output_filename = f"Optimal_Weightings_{dataset_id}.csv"
        pd.DataFrame(rows_for_file).to_csv(output_filename, index=False)
        print(f"Results saved to {output_filename}")

    if not combined_rows:
        print("No results generated. Check data files+labels.")
        return

    print(f"\nGrid search complete!")

    if len(csv_paths) == 1:
        # Single dataset: report its best row directly.
        winner = max(combined_rows, key=lambda entry: entry['Accuracy'])
        print(f"Optimal speed weighting:")
        print(f"Weight: {winner['Weight']}")
        print(f"Accuracy: {winner['Accuracy']}%")
        return

    # Several datasets: pick the weight with the best mean accuracy.
    per_weight = (
        pd.DataFrame(combined_rows)
        .groupby('Weight')['Accuracy']
        .agg(['mean', 'max'])
        .round(2)
    )
    best_overall_weight = per_weight['mean'].idxmax()
    best_overall_accuracy = per_weight.loc[best_overall_weight, 'mean']

    print(f"best weight: {best_overall_weight}")
    print(f"Average accuracy: {best_overall_accuracy:.2f}%")


# Entry point for this cell: run the weight grid search over every
# output_*.csv file in the working directory.
if __name__ == "__main__":
    grid_search_on_files()

Starting grid search optimization...
Testing 21 different weight values on 1 files
------------------------------------------------------------

Processing output_20080402060926.csv...
Best result for output_20080402060926.csv: Weight=1.00, Accuracy=70.92%
Results saved to Optimal_Weightings_20080402060926.csv

Grid search complete!
Optimal speed weighting:
Weight: 1.0
Accuracy: 70.92%


In [None]:
# Testing results: run this cell fourth, using the optimal weights
import pandas as pd
from datetime import datetime
from scipy.stats import norm

# Canonical mode name -> label spellings that should collapse onto it.
# NOTE(review): 'taxi' maps to itself and is not one of the classifier's
# modes, so taxi-labelled rows can never be counted correct - confirm intent.
mode_normalization = {
    alias: canonical
    for canonical, aliases in (
        ('walking', ('walk', 'walking')),
        ('taxi', ('taxi',)),
        ('bus', ('bus',)),
        ('train', ('train',)),
        ('driving', ('car', 'driving')),
        ('cycling', ('bicycle', 'bike', 'cycling')),
    )
    for alias in aliases
}


def normalize_mode(mode):
    """Return the canonical name for a transport-mode label.

    The value is stringified, stripped and lower-cased; known aliases are
    mapped through ``mode_normalization`` and anything else is returned in
    its cleaned form.
    """
    cleaned = str(mode).strip().lower()
    return mode_normalization.get(cleaned, cleaned)


def parse_labels_file(filename):
    """Read ground-truth intervals from a labels file.

    Only lines that split into exactly five whitespace-separated fields
    (start date, start time, end date, end time, mode) are used; lines whose
    timestamps do not match ``%Y/%m/%d %H:%M:%S`` are skipped.

    Returns:
        A list of ``(start, end, mode)`` tuples with normalized mode names.

    Raises:
        FileNotFoundError: If ``filename`` does not exist (raised by ``open``).
    """
    stamp_format = "%Y/%m/%d %H:%M:%S"
    intervals = []

    with open(filename, 'r') as handle:
        for raw in handle:
            fields = raw.strip().split()
            if len(fields) != 5:
                continue
            start_date, start_clock, end_date, end_clock, label = fields
            mode = normalize_mode(label)
            try:
                start = datetime.strptime(f"{start_date} {start_clock}", stamp_format)
                end = datetime.strptime(f"{end_date} {end_clock}", stamp_format)
            except ValueError:
                continue
            intervals.append((start, end, mode))

    return intervals


def bayesian_classify(df, speed_weight=0.7, accel_weight=0.3):
    """Predict a transport mode for every row of a processed GPS frame.

    Rows are classified sequentially: the speed and acceleration likelihoods
    under each mode's Gaussian profile are combined with the running belief
    via Bayes' rule, and each posterior becomes the prior for the next row.

    Args:
        df: Frame with 'Speed (km/h)' and 'Timestamp' columns.
        speed_weight: Exponent applied to the speed likelihood.
        accel_weight: Exponent applied to the acceleration likelihood.

    Returns:
        A list with one entry per row: the predicted mode name, or '' for a
        row whose speed or timestamp could not be read.
    """
    # (mean, standard deviation) of the Gaussian profile per mode and feature.
    modes = {
        "walking": {"speed": (4, 1.5), "accel": (0.5, 0.4)},
        "cycling": {"speed": (14, 4), "accel": (1.0, 0.5)},
        "bus": {"speed": (25, 10), "accel": (1.5, 0.7)},
        "driving": {"speed": (40, 18), "accel": (2.5, 1.0)},
        "train": {"speed": (55, 20), "accel": (1.0, 0.6)}
    }

    EPSILON = 1e-8  # floor for likelihoods and priors
    KMH_TO_MS = 1000 / 3600
    prior_probs = {mode: 1.0 / len(modes) for mode in modes}
    predictions = []

    prev_speed = None
    prev_time = None

    for i, row in df.iterrows():
        try:
            speed = float(row['Speed (km/h)']) if pd.notna(row['Speed (km/h)']) and row['Speed (km/h)'] != '' else 0.0
            time = pd.to_datetime(row['Timestamp'])
        except (ValueError, KeyError):
            predictions.append('')
            continue

        # Acceleration magnitude in m/s², computed on the fly. Speeds are in
        # km/h, so the delta is converted to m/s first; this matches the unit
        # of the 'Acceleration (m/s²)' column the GPS parsing step writes.
        if prev_speed is not None and prev_time is not None:
            time_diff = (time - prev_time).total_seconds()
            if time_diff > 0:
                accel = abs((speed - prev_speed) * KMH_TO_MS / time_diff)
            else:
                accel = 0.0
        else:
            accel = 0.0

        likelihoods = {}
        for mode, stats in modes.items():
            mu_s, sigma_s = stats["speed"]
            mu_a, sigma_a = stats["accel"]

            l_speed = norm.pdf(speed, loc=mu_s, scale=sigma_s)
            l_accel = norm.pdf(accel, loc=mu_a, scale=sigma_a)

            weighted_likelihood = (l_speed ** speed_weight) * (l_accel ** accel_weight)
            likelihoods[mode] = max(weighted_likelihood, EPSILON)

        # Normalization factor for Bayes' rule.
        evidence = sum(max(prior_probs[mode], EPSILON) * likelihoods[mode] for mode in modes)

        posteriors = {}
        for mode in modes:
            prior = max(prior_probs[mode], EPSILON)
            posteriors[mode] = (likelihoods[mode] * prior) / evidence

        # The posterior is carried forward as the next row's prior.
        prior_probs = posteriors.copy()
        predicted_mode = max(posteriors, key=posteriors.get)
        predictions.append(predicted_mode)

        prev_speed = speed
        prev_time = time

    return predictions




def evaluate_predictions(predicted_csv, labels_txt, speed_weight=0.7, accel_weight=0.3):
    """Classify a processed GPS CSV and score it against the ground truth.

    The CSV is classified with ``bayesian_classify``, each row is matched to
    the labelled interval containing its timestamp, and the file is rewritten
    in place with 'Mode', 'Actual Mode' and 'Correct' columns. The overall
    accuracy and the verdict for the final row are printed.

    Args:
        predicted_csv: CSV to read and overwrite with the evaluation columns.
        labels_txt: Labels file parsed with ``parse_labels_file``.
        speed_weight: Exponent for the speed likelihood.
        accel_weight: Exponent for the acceleration likelihood.
    """
    df = pd.read_csv(predicted_csv)
    labels = parse_labels_file(labels_txt)
    print(f"Using weights: speed={speed_weight:.2f}, acceleration={accel_weight:.2f}")
    predictions = bayesian_classify(df, speed_weight, accel_weight)
    df['Mode'] = predictions
    df['Actual Mode'] = ''
    df['Correct'] = ''

    correct = 0
    total = 0

    for i, row in df.iterrows():
        try:
            ts = datetime.strptime(row['Timestamp'], "%Y-%m-%d %H:%M:%S")
        except (ValueError, KeyError, TypeError):
            # TypeError: the cell is not a string (e.g. NaN for a blank
            # timestamp); such rows are left unevaluated.
            continue

        # First labelled interval containing the timestamp (bounds inclusive).
        actual_mode = ''
        for start, end, mode in labels:
            if start <= ts <= end:
                actual_mode = mode
                break

        df.at[i, 'Actual Mode'] = actual_mode
        pred_mode = normalize_mode(row.get('Mode', ''))

        if actual_mode:
            if pred_mode == actual_mode:
                df.at[i, 'Correct'] = 'Yes'
                correct += 1
            else:
                df.at[i, 'Correct'] = 'No'
            total += 1
        else:
            df.at[i, 'Correct'] = 'Unknown'

    # An empty frame has no final row; report it as not evaluable instead of
    # failing on df.iloc[-1].
    final_verdict = df.iloc[-1]['Correct'] if len(df) > 0 else ''
    if final_verdict == 'Yes':
        last_correct = 'CORRECT.'
    elif final_verdict == 'No':
        last_correct = 'WRONG'
    else:
        last_correct = 'Final prediction could not be evaluated (Unknown).'

    df.to_csv(predicted_csv, index=False)
    accuracy = (correct / total * 100) if total > 0 else 0
    print(f"\nEvaluation complete. Results updated in: {predicted_csv}")
    print(f"Accuracy:{accuracy:.2f}% ({correct} correct of {total} labeled rows)")
    print(last_correct)


if __name__ == "__main__":
    # Speed-only weighting: the acceleration likelihood gets exponent 0.
    speed_weight = 1
    accel_weight = 0
    evaluate_predictions(
        predicted_csv='output_20080402060926.csv',
        labels_txt='labels.txt',
        speed_weight=speed_weight,
        accel_weight=accel_weight,
    )

Using weights: speed=1.00, acceleration=0.00

Evaluation complete. Results updated in: output_20080402060926.csv
Accuracy:76.89% (772 correct of 1004 labeled rows)
WRONG
