In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import input_file_name, col, regexp_extract

spark = SparkSession.builder.getOrCreate()

# Location of the single merged CSV file.
file_path = "dbfs:/FileStore/fit/output_test2.csv"

# Read the CSV using its header row for column names. No schema is supplied,
# so every column arrives as a string and is cast explicitly further down.
merged_raw = spark.read.option("header", "true").csv(file_path)

# Keep only the columns of interest and expose the CSV's `ID` column as
# `workout`. Backticks are required because the names contain `[`, `]` and `/`.
# (An earlier input_file_name()-derived `workout` column was removed: it was
# never selected here, so it had no effect on the result.)
selected = merged_raw.selectExpr(
    "`timestamp[s]`",
    "`distance[m]`",
    "`heart_rate[bpm]`",
    "`power[watts]`",
    "`speed[m/s]`",
    "`ID` as workout",
)

# Cast the numeric columns from string to proper data types; `workout` stays
# a string.
df = (
    selected
    .withColumn("timestamp[s]", col("timestamp[s]").cast("long"))
    .withColumn("distance[m]", col("distance[m]").cast("double"))
    .withColumn("heart_rate[bpm]", col("heart_rate[bpm]").cast("int"))
    .withColumn("power[watts]", col("power[watts]").cast("int"))
    .withColumn("speed[m/s]", col("speed[m/s]").cast("double"))
)

# Preview the first 400 rows only, rather than rendering the whole frame.
display(df.limit(400))


timestamp[s],distance[m],heart_rate[bpm],power[watts],speed[m/s],workout
1065987978,0.26,0,165,0.977,10738014183_data.csv
1065987979,2.1,0,201,2.329,10738014183_data.csv
1065987980,4.87,0,117,2.969,10738014183_data.csv
1065987981,7.92,0,122,3.335,10738014183_data.csv
1065987982,11.52,0,141,3.717,10738014183_data.csv
1065987983,15.44,0,142,4.071,10738014183_data.csv
1065987984,19.63,0,172,4.428,10738014183_data.csv
1065987985,24.24,0,151,4.752,10738014183_data.csv
1065987986,29.13,0,121,4.982,10738014183_data.csv
1065987987,34.14,0,117,5.156,10738014183_data.csv


In [0]:
# Inspect one per-workout CSV exactly as stored: header row used for column
# names, no casting applied (all values remain strings).
df_raw = (
    spark.read
    .option("header", "true")
    .csv("dbfs:/FileStore/fit/14755435914_data.csv")
)

# Preview of the first 100 rows.
display(df_raw.limit(100))


record.timestamp[s],record.position_lat[semicircles],record.position_long[semicircles],record.distance[m],record.altitude[m],record.speed[m/s],record.power[watts],record.heart_rate[bpm],record.cadence[rpm],record.enhanced_altitude[m],record.enhanced_speed[m/s],record.developer.1.target_power[watts],_c12
1110382454,644129344,-18397810,0.28,124.60000000000002,10.837,252,147,91,124.60000000000002,10.837,0,
1110382460,644134720,-18392284,4.44,123.79999999999995,12.104,276,151,91,123.79999999999995,12.104,0,
1110382461,644134720,-18392310,5.01,123.79999999999995,12.12,343,151,92,123.79999999999995,12.12,0,
1110382462,644136000,-18392858,17.32,123.39999999999998,12.452,343,151,92,123.39999999999998,12.452,0,
1110382463,644137408,-18393430,30.07,123.20000000000005,12.788,369,152,91,123.20000000000005,12.788,0,
1110382464,644138688,-18394002,42.82,123.0,13.069,326,153,85,123.0,13.069,0,
1110382465,644140096,-18394588,56.01,122.60000000000002,13.298,293,153,85,122.60000000000002,13.298,0,
1110382466,644141504,-18395156,69.44,122.39999999999998,13.553,351,154,86,122.39999999999998,13.553,0,
1110382467,644142912,-18395726,82.98,122.0,13.567,277,154,86,122.0,13.567,0,
1110382468,644144320,-18396298,96.61,121.79999999999995,13.698,299,154,86,121.79999999999995,13.698,0,


In [0]:
# Spot check: pull the record(s) whose timestamp equals one specific value.
filtered = df.filter(df["timestamp[s]"] == 1084307851)
filtered.show()


+------------+-----------+---------------+------------+----------+--------------------+
|timestamp[s]|distance[m]|heart_rate[bpm]|power[watts]|speed[m/s]|             workout|
+------------+-----------+---------------+------------+----------+--------------------+
|  1084307851|     735.28|            103|         174|      5.57|12153451545_data.csv|
+------------+-----------+---------------+------------+----------+--------------------+



In [0]:
# Delete the data directory and everything under it.
# WARNING: destructive. This is the same directory the CSV reads in this
# notebook load from, so those reads will fail after this runs until the
# files are uploaded again.
# The fully qualified dbfs:/ path is used so the target matches the read
# paths exactly instead of relying on how a scheme-less path is resolved.
dbutils.fs.rm("dbfs:/FileStore/fit", recurse=True)