LLM analysis #6

Draft pull request, merging 26 commits into base: main
26 commits
ba996a6  feat: Add LLM analysis option to validate_tcx_file (Lucs1590, May 18, 2024)
c4f1ee0  chore: Add dotenv package and load environment variables (Lucs1590, May 18, 2024)
37cb873  chore: Update dependencies in requirements.txt (Lucs1590, May 18, 2024)
c38b18b  feat: Add AI Assistant prompt for LLM analysis (Lucs1590, May 18, 2024)
49194ac  chore: Update langchain_core dependency to version 0.2.0 (Lucs1590, May 18, 2024)
1e86f19  feat: Improve LLM analysis prompt in perform_llm_analysis (Lucs1590, May 18, 2024)
eb0da97  feat: Add OpenAI language model for LLM analysis (Lucs1590, May 18, 2024)
8abe09b  chore: Update langchain_openai dependency to version 0.1.7 (Lucs1590, May 18, 2024)
bcb406f  refactor: Preprocess trackpoints data in perform_llm_analysis (Lucs1590, May 18, 2024)
03698c4  feat: running euclidean distance to remove rows (Lucs1590, May 23, 2024)
b9c6aa4  refactor: improving euclidean calc performance (Lucs1590, May 23, 2024)
b846dc3  fix: model type (Lucs1590, May 23, 2024)
4c6bfaa  refactor: output type (Lucs1590, May 23, 2024)
1a55cc6  feat: change percentage range (Lucs1590, May 23, 2024)
a3c83d6  feat: allow null path (Lucs1590, May 26, 2024)
9c29a62  refactor: improve ask_file_path function (Lucs1590, May 26, 2024)
17bb14d  refactor: Convert Time column to Hour:Minute:Second format (Lucs1590, May 26, 2024)
00fb4fd  refactor: renaming df to dataframe (Lucs1590, May 26, 2024)
0e3fd93  style: removing unused lib (Lucs1590, May 26, 2024)
f53fed7  style: split statements lines (Lucs1590, May 26, 2024)
25a8d63  build: add missing libs (Lucs1590, Jun 1, 2024)
361c111  feat: update params (Lucs1590, Jul 5, 2024)
6edf615  Merge branch 'main' of github.com:Lucs1590/strava-to-trainingpeaks in… (Lucs1590, Jul 6, 2024)
f9394de  feat: add tqdm to euclidean distance calculation (Lucs1590, Jul 6, 2024)
b5f1478  build: updating libraries (Lucs1590, Jul 6, 2024)
8a172e4  style: sorting response (Lucs1590, Jul 6, 2024)
13 changes: 10 additions & 3 deletions requirements.txt
@@ -1,3 +1,10 @@
-defusedxml>=0.7.1
-questionary>=2.0.1
-tcxreader>=0.4.10
+defusedxml==0.7.1
+langchain_core==0.2.11
+langchain_openai==0.1.14
+numpy==1.24.2
+pandas==2.2.2
+python-dotenv==1.0.1
+questionary==2.0.1
+scipy==1.13.1
+tcxreader==0.4.10
+tqdm==4.65.0
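
The diff below reads the OpenAI key from the environment via python-dotenv, so it is worth confirming the key is actually available. A minimal sketch, assuming a local .env file and the standard OPENAI_API_KEY variable; this snippet is illustrative and not part of the PR:

# Illustrative check that the key the new code expects is actually available.
# Assumes a local .env file containing a line like: OPENAI_API_KEY=sk-...
import os
from dotenv import load_dotenv

load_dotenv()  # looks for a .env file in the current working directory by default
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError("OPENAI_API_KEY is not set; perform_llm_analysis would fail without it")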
158 changes: 143 additions & 15 deletions src/main.py
@@ -3,12 +3,22 @@
import logging
import webbrowser

+from typing import Tuple
from defusedxml.minidom import parseString

import questionary

+import numpy as np
+import pandas as pd
+
+from tqdm import tqdm
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+from langchain_core.prompts.prompt import PromptTemplate
+from scipy.spatial.distance import squareform, pdist
from tcxreader.tcxreader import TCXReader


+load_dotenv()
logger = logging.getLogger()

if not logger.handlers:
@@ -33,15 +43,22 @@ def main():

    file_path = ask_file_path(file_location)

-    if sport in ["Swim", "Other"]:
-        logger.info("Formatting the TCX file to be imported to TrainingPeaks")
-        format_to_swim(file_path)
-    elif sport in ["Bike", "Run"]:
-        logger.info("Validating the TCX file")
-        validate_tcx_file(file_path)
-    else:
-        logger.error("Invalid sport selected")
-        raise ValueError("Invalid sport selected")
+    if file_path:
+        if sport in ["Swim", "Other"]:
+            logger.info(
+                "Formatting the TCX file to be imported to TrainingPeaks"
+            )
+            format_to_swim(file_path)
+        elif sport in ["Bike", "Run"]:
+            logger.info("Validating the TCX file")
+            _, tcx_data = validate_tcx_file(file_path)
+            if ask_llm_analysis():
+                plan = ask_training_plan()
+                logger.info("Performing LLM analysis")
+                perform_llm_analysis(tcx_data, sport, plan)
+        else:
+            logger.error("Invalid sport selected")
+            raise ValueError("Invalid sport selected")

    indent_xml_file(file_path)
    logger.info("Process completed successfully!")
@@ -82,11 +99,22 @@ def download_tcx_file(activity_id: str, sport: str) -> None:
        raise ValueError("Error opening the browser") from err


-def ask_file_path(file_location) -> str:
-    question = "Enter the path to the TCX file:" if file_location == "Provide path" else "Check if the TCX file was downloaded and then enter the path to the file:"
+def ask_file_path(file_location: str) -> str:
+    if file_location == "Provide path":
+        question = "Enter the path to the TCX file:"
+
+        def validation(path):
+            return os.path.isfile(path)
+    else:
+        question = "Check if the TCX was downloaded and validate the file:"
+
+        def validation(path):
+            return os.path.isfile(path) or path == ''
+
    return questionary.path(
        question,
-        validate=os.path.isfile
+        validate=validation,
+        only_directories=False
    ).ask()
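
The second validator accepts an empty answer, so ask_file_path can now return an empty string and main() only continues when a real path came back. A standalone sketch of that rule; the helper name here is illustrative and not part of the PR:

import os

def accepts_path(path: str, allow_blank: bool) -> bool:
    # Mirrors the two nested validators above: an existing file always passes,
    # and a blank answer passes only in the "check if it was downloaded" flow.
    return os.path.isfile(path) or (allow_blank and path == '')

print(accepts_path('', allow_blank=True))              # True: user skipped the prompt
print(accepts_path('missing.tcx', allow_blank=False))  # False: a typed path must exist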


@@ -116,7 +144,7 @@ def write_xml_file(file_path: str, xml_str: str) -> None:
        xml_file.write(xml_str)


-def validate_tcx_file(file_path: str) -> bool:
+def validate_tcx_file(file_path: str) -> Tuple[bool, TCXReader]:
    xml_str = read_xml_file(file_path)
    if not xml_str:
        logger.error("The TCX file is empty.")
@@ -129,12 +157,112 @@ def validate_tcx_file(file_path: str) -> Tuple[bool, TCXReader]:
            "The TCX file is valid. You covered a significant distance in this activity, with %d meters.",
            data.distance
        )
-        return True
+        return True, data
    except Exception as err:
        logger.error("Invalid TCX file.")
        raise ValueError(f"Error reading the TCX file: {err}") from err


+def ask_llm_analysis() -> bool:
+    return questionary.confirm(
+        "Do you want to perform AI analysis?",
+        default=False
+    ).ask()


+def ask_training_plan() -> str:
+    return questionary.text(
+        "Was there anything planned for this training?"
+    ).ask()


+def perform_llm_analysis(data: TCXReader, sport: str, plan: str) -> str:
+    dataframe = preprocess_trackpoints_data(data)
+
+    prompt = """SYSTEM: You are an AI Assistant that helps athletes to improve their performance.
+Based on the following csv data that is related to a {sport} training session, carry out an analysis highlighting positive points, where the athlete did well and where he did poorly and what he can do to improve in the next {sport}.
+<csv_data>
+{data}
+</csv_data>
+"""
+    prompt += "plan: {plan}" if plan else ""
+    prompt = PromptTemplate.from_template(prompt)
+    prompt = prompt.format(
+        sport=sport,
+        data=dataframe.to_csv(index=False),
+        plan=plan
+    )
+
+    openai_llm = ChatOpenAI(
+        openai_api_key=os.getenv("OPENAI_API_KEY"),
+        model_name="gpt-4o",
+        max_tokens=1500,
+        temperature=0.6,
+        max_retries=5
+    )
+    response = openai_llm.invoke(prompt)
+    logger.info("AI analysis completed successfully.")
+    logger.info("\nAI response:\n %s \n", response.content)
+    return response.content
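
For reference, a small standalone sketch of the same prompt composition, appending the plan line and passing the plan variable only when one was given so the template placeholders and the format() arguments stay in sync; the dummy CSV string is illustrative:

from langchain_core.prompts.prompt import PromptTemplate

def build_prompt(sport: str, csv_data: str, plan: str) -> str:
    # Illustrative helper, not part of the PR: same pattern as perform_llm_analysis.
    template = (
        "SYSTEM: You are an AI Assistant that helps athletes to improve their performance.\n"
        "Analyse the following {sport} session.\n"
        "<csv_data>\n{data}\n</csv_data>\n"
    )
    variables = {"sport": sport, "data": csv_data}
    if plan:
        template += "plan: {plan}"
        variables["plan"] = plan
    return PromptTemplate.from_template(template).format(**variables)

print(build_prompt("Run", "Time,Distance_Km,Speed_Kmh\n00:00:01,0.01,10.8\n", plan=""))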


+def preprocess_trackpoints_data(data):
+    dataframe = pd.DataFrame(data.trackpoints_to_dict())
+    dataframe.rename(
+        columns={
+            "distance": "Distance_Km",
+            "time": "Time",
+            "Speed": "Speed_Kmh"
+        }, inplace=True
+    )
+    dataframe["Time"] = dataframe["Time"].apply(lambda x: x.value / 10**9)
+    dataframe["Distance_Km"] = round(dataframe["Distance_Km"] / 1000, 2)
+    dataframe["Speed_Kmh"] = dataframe["Speed_Kmh"] * 3.6
+    dataframe["Pace"] = round(
+        dataframe["Speed_Kmh"].apply(lambda x: 60 / x if x > 0 else 0),
+        2
+    )
+    if dataframe["cadence"].isnull().sum() >= len(dataframe) / 2:
+        dataframe.drop(columns=["cadence"], inplace=True)
+
+    dataframe = dataframe.drop_duplicates()
+    dataframe = dataframe.reset_index(drop=True)
+    dataframe = dataframe.dropna()
+
+    if dataframe.shape[0] > 4000:
+        dataframe = run_euclidean_dist_deletion(dataframe, 0.55)
+    elif dataframe.shape[0] > 1000:
+        dataframe = run_euclidean_dist_deletion(dataframe, 0.35)
+    else:
+        dataframe = run_euclidean_dist_deletion(dataframe, 0.10)
+
+    dataframe["Time"] = pd.to_datetime(
+        dataframe["Time"],
+        unit='s'
+    ).dt.strftime('%H:%M:%S')
+
+    return dataframe
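
For a single trackpoint, the unit handling above works out as follows, assuming tcxreader reports distance in metres and speed in metres per second (which is what the /1000 and *3.6 conversions imply):

# Worked example of the conversions in preprocess_trackpoints_data (illustrative values)
distance_m = 2500                            # raw cumulative distance in metres
speed_ms = 3.2                               # raw speed in metres per second

distance_km = round(distance_m / 1000, 2)    # 2.5 km
speed_kmh = speed_ms * 3.6                   # 11.52 km/h
pace_min_per_km = round(60 / speed_kmh, 2)   # 5.21 min/km
print(distance_km, speed_kmh, pace_min_per_km)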


+def run_euclidean_dist_deletion(dataframe: pd.DataFrame, percentage: float) -> pd.DataFrame:
+    dists = pdist(dataframe, metric='euclidean')
+    dists = squareform(dists)
+    np.fill_diagonal(dists, np.inf)
+
+    total_rows = int(percentage * len(dataframe))
+    with tqdm(total=total_rows, desc="Removing similar points") as pbar:
+        for _ in range(total_rows):
+            min_idx = np.argmin(dists)
+            row, col = np.unravel_index(min_idx, dists.shape)
+            dists[row, :] = np.inf
+            dists[:, col] = np.inf
+            dataframe = dataframe.drop(row)
+            pbar.update(1)
+
+    dataframe = dataframe.reset_index(drop=True)
+    return dataframe
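
A quick way to see the thinning behaviour on a toy DataFrame: with percentage=0.35 the loop drops roughly a third of the rows, removing one row of the currently closest pair each iteration. The import path assumes the module is importable as src.main, and note that squareform materialises a full n-by-n distance matrix, so memory grows quadratically with the number of trackpoints:

import numpy as np
import pandas as pd
from src.main import run_euclidean_dist_deletion  # assumes src/ is importable as a package

rng = np.random.default_rng(0)
toy = pd.DataFrame({
    "Time": np.arange(20, dtype=float),
    "Distance_Km": np.linspace(0.0, 2.0, 20),
    "Speed_Kmh": rng.normal(11.0, 0.5, 20),
})
thinned = run_euclidean_dist_deletion(toy, 0.35)
print(len(toy), "->", len(thinned))  # 20 -> 13: the 7 most redundant rows are removed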


def indent_xml_file(file_path: str) -> None:
    try:
        with open(file_path, "r", encoding='utf-8') as xml_file: