In [1]:
import os
import glob
import pandas as pd
import numpy as np
from gtda.time_series import SingleTakensEmbedding

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Path to your denoised CSVs (one file per team).
input_base_dir = r'E:\forecasting1\teams_perspective'

# Where to save the results (one CSV per input file, same base name).
output_base_dir = r'E:\forecasting1\forecastingfolder1\teamlpp'

# Values handed to SingleTakensEmbedding(parameters_type='search').
# NOTE(review): per the giotto-tda documentation these act as upper bounds of
# the search; a reported optimum equal to the bound may simply mean the search
# hit its ceiling -- confirm before interpreting such values.
MAX_EMBEDDING_DIMENSION = 10
MAX_TIME_DELAY = 10


def make_result_row(column_name, time_delay, dimension, status):
    """Build one row of the per-file results table.

    Parameters
    ----------
    column_name : str
        Name of the CSV column the row describes ('All' for file-level errors).
    time_delay, dimension : int or str
        Parameters chosen by the search, or the placeholders 'N/A' / 'Error'.
    status : str
        'Success' or a human-readable explanation of why no value is reported.

    Returns
    -------
    dict
        Mapping with the four keys used as columns of the output CSV.
    """
    return {
        'Column_Name': column_name,
        'Optimal_Time_Delay': time_delay,
        'Optimal_Embedding_Dimension': dimension,
        'Status': status,
    }


def estimate_column_parameters(column, column_name,
                               dimension=MAX_EMBEDDING_DIMENSION,
                               time_delay=MAX_TIME_DELAY):
    """Run the embedding-parameter search on a single column.

    Parameters
    ----------
    column : pandas.Series
        Raw column as read from the CSV; non-numeric entries are discarded.
    column_name : str
        Label stored in the result row.
    dimension, time_delay : int
        Values passed to ``SingleTakensEmbedding`` in 'search' mode.

    Returns
    -------
    dict
        A row produced by ``make_result_row``. Failures never raise; they are
        reported through the row's ``Status`` field.
    """
    # Coerce to numbers; anything unparsable becomes NaN and is dropped.
    # Dropping rows closes gaps in the series, so samples that were separated
    # by a missing value end up adjacent.
    time_series_data = pd.to_numeric(column, errors='coerce').dropna()

    if time_series_data.empty:
        msg = "No numeric data in column."
        print(f"     {msg}")
        return make_result_row(column_name, 'N/A', 'N/A', msg)

    time_series_array = time_series_data.to_numpy()

    # Guard against series too short for the requested search range.
    embedding_size = dimension * time_delay
    if embedding_size >= len(time_series_array):
        error_message = (
            f"Embedding size ({embedding_size}) >= length of series ({len(time_series_array)})"
        )
        print(f"     {error_message}")
        return make_result_row(column_name, 'Error', 'Error', error_message)

    embedding = SingleTakensEmbedding(
        parameters_type='search',
        dimension=dimension,
        time_delay=time_delay,
        n_jobs=1  # single-threaded
    )

    try:
        # The embedded output itself is not needed; only the fitted
        # attributes time_delay_ / dimension_ are read back.
        embedding.fit_transform(time_series_array.reshape(-1, 1))
        optimal_time_delay = embedding.time_delay_
        optimal_dimension = embedding.dimension_
    except Exception as e:
        # Best-effort batch job: record the failure and move on to the next column.
        error_message = f"An error occurred during embedding: {e}"
        print(f"     {error_message}")
        return make_result_row(column_name, 'Error', 'Error', error_message)

    print(f"     Optimal time delay: {optimal_time_delay}, dimension: {optimal_dimension}")
    return make_result_row(column_name, optimal_time_delay, optimal_dimension, 'Success')


def save_results(results, output_csv_path, team_name):
    """Write the result rows to ``output_csv_path``.

    A failed write is reported on stdout instead of being raised, so one
    unwritable file does not abort the remaining teams.

    Returns
    -------
    bool
        True when the file was written, False otherwise.
    """
    try:
        pd.DataFrame(results).to_csv(output_csv_path, index=False)
    except Exception as e:
        print(f"An error occurred while saving results for {team_name}: {e}")
        return False
    return True


def process_team_file(csv_path, output_dir=None):
    """Search embedding parameters for every column of one CSV file.

    Parameters
    ----------
    csv_path : str
        Input CSV. Its base name (without extension) names the output file.
    output_dir : str, optional
        Directory for the result CSV; defaults to ``output_base_dir``.

    Returns
    -------
    list of dict
        The rows that were written (or attempted to be written) for this file.
    """
    if output_dir is None:
        output_dir = output_base_dir

    # Extract the team (file) name from the CSV file name
    team_name = os.path.splitext(os.path.basename(csv_path))[0]
    print(f"\nProcessing file: {team_name}")

    # Results CSV: <output_dir>/<team_name>.csv
    output_csv_path = os.path.join(output_dir, f'{team_name}.csv')

    try:
        data = pd.read_csv(csv_path)
    except Exception as e:
        # A file that cannot be read still gets a result CSV explaining why.
        if isinstance(e, FileNotFoundError):
            err_msg = f"File not found: {csv_path}"
        else:
            err_msg = f"Error reading file: {e}"
        print(err_msg)
        results = [make_result_row('All', 'N/A', 'N/A', err_msg)]
        save_results(results, output_csv_path, team_name)
        return results
    print(f"Successfully loaded data from {csv_path}")

    # Iterate over *all* columns in the CSV
    results = []
    for column_name in data.columns:
        print(f"  -> Processing column: {column_name}")
        results.append(estimate_column_parameters(data[column_name], column_name))

    if save_results(results, output_csv_path, team_name):
        print(f"Saved embedding parameters to {output_csv_path}")
    return results


# __name__ is "__main__" inside a notebook kernel and when run as a script, so
# the cell still executes end to end; the guard only keeps the helpers
# importable without touching the filesystem.
if __name__ == "__main__":
    os.makedirs(output_base_dir, exist_ok=True)

    # Pattern to match all CSV files
    csv_pattern = os.path.join(input_base_dir, '*.csv')

    # Sorted so the processing order (and the log) is the same on every run.
    csv_files = sorted(glob.glob(csv_pattern))

    if not csv_files:
        print(f"No CSV files found in the directory: {input_base_dir}")
        # SystemExit instead of exit(): exit is a site/IPython helper and is
        # not meant for program control flow.
        raise SystemExit(1)

    for csv_path in csv_files:
        process_team_file(csv_path, output_base_dir)



Processing file: Arsenal_perspective
Successfully loaded data from E:\forecasting1\teams_perspective\Arsenal_perspective.csv
  -> Processing column: FTGoalsFor
     Optimal time delay: 1, dimension: 2
  -> Processing column: FTGoalsAgainst
     Optimal time delay: 6, dimension: 2
  -> Processing column: TeamGS
     Optimal time delay: 9, dimension: 6
  -> Processing column: TeamGC
     Optimal time delay: 10, dimension: 10
  -> Processing column: TeamPoints
     Optimal time delay: 10, dimension: 10
  -> Processing column: MatchWeek
     Optimal time delay: 10, dimension: 10
  -> Processing column: TeamFormPts
     Optimal time delay: 5, dimension: 7
  -> Processing column: WinStreak3
     Optimal time delay: 3, dimension: 2
  -> Processing column: WinStreak5
     Optimal time delay: 6, dimension: 2
  -> Processing column: LossStreak3
     Optimal time delay: 8, dimension: 2
  -> Processing column: LossStreak5
     Optimal time delay: 1, dimension: 2
  -> Processing column: TeamGD
   