## Import des données et pré-traitement

In [None]:
import pandas as pd
from src.utility.descriptive_statistics import descriptive_statistics

# Load the raw panel; the 'index' column is parsed as dates.
df = pd.read_pickle('src/data/panel_data.pkl')
df["index"] = pd.to_datetime(df["index"])

# Dates excluded from the sample as outliers.
outlier_dates = [pd.Timestamp('2001-09-11')]
is_outlier = df['index'].isin(outlier_dates)

# Keep the 1988-01-01 .. 2017-01-01 window (both bounds inclusive).
in_window = (df['index'] >= '1988-01-01') & (df['index'] <= '2017-01-01')

# Take an explicit copy so the column assignments below write to an
# independent frame instead of a filtered slice of the loaded data
# (this is what triggers pandas' SettingWithCopyWarning otherwise).
df = df[in_window & ~is_outlier].copy()

# Coerce every column except the first to numeric; unparseable entries
# become NaN. This assumes 'index' is the first column of the panel.
for col in df.columns[1:]:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df

## Statistiques descriptives

In [None]:
import numpy as np

# The project helper returns two dict-like results for the 'Maturity 1'
# series; they are shown side by side in the table built below.
original_stats, log_stats = descriptive_statistics(df["Maturity 1"])

stats_keys = [
    'Mean', 'Median', 'Minimum', 'Maximum', 'Std deviation',
    'Skewness', 'Kurtosis', 'Autocorrelation',
    'ADF test p-value (10 lags)', 'Nb obs',
]
df_combined_stats = pd.DataFrame(index=stats_keys, columns=['Prices (c/bu)', 'Log returns'])

# Each table column reads from one result: the first under the plain
# statistic name, the second under the name prefixed with 'Log '.
# A statistic missing from a result is stored as NaN.
column_sources = (
    ('Prices (c/bu)', original_stats, ''),
    ('Log returns', log_stats, 'Log '),
)
for key in stats_keys:
    for column, stats_map, prefix in column_sources:
        df_combined_stats.loc[key, column] = stats_map.get(prefix + key, np.nan)
df_combined_stats

## Graphiques


In [None]:
import matplotlib.pyplot as plt
import os 

# Index the frame by date. The guard keeps the cell re-runnable: once 'index'
# has been moved into the index, a second set_index call would raise KeyError.
if 'index' in df.columns:
    df.set_index('index', inplace=True)

target_date = '2004-06-17'
maturity_cols = ['Maturity 1', 'Maturity 2', 'Maturity 3', 'Maturity 4', 'Maturity 5']

# Values of the five maturity columns on the target date.
prix_17_06_2004 = df.loc[target_date, maturity_cols]

plt.plot(maturity_cols, prix_17_06_2004, marker='o', linestyle='-')
plt.title(f'Term Structure of Corn Futures Prices, {target_date}')
plt.grid()

# exist_ok=True creates the folder when needed without a separate
# existence check.
output_dir = 'static/graph'
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, f'term_structure_{target_date}.png')
plt.savefig(output_path)
plt.show()


In [None]:
from src.graph.graph import plot_and_save_graph

# Draw the 'Maturity 1' series with the project plotting helper. The output
# file name and directory are passed as keywords; presumably the helper
# writes the figure there -- confirm against src.graph.graph.
plot_and_save_graph(
    df,
    ["Maturity 1"],
    "Front-month settlement price (cents/bu)",
    "Dates",
    "cents/bu",
    output_filename='price_history.png',
    output_dir='static/graph',
)

# Estimation du modèle espace d'état


## Preparation des données

In [None]:
import pandas as pd
import numpy as np
from src.utility.date import get_T, get_t
# Make sure the index is a DatetimeIndex before deriving date-based columns.
df.index = pd.to_datetime(df.index)

# get_T is evaluated for every date and its result is spread over T1..T5
# (presumably one value per contract -- confirm against src.utility.date).
df[['T1', 'T2', 'T3', 'T4', 'T5']] = pd.DataFrame(df.index.map(lambda x: pd.Series(get_T(x))).tolist(), index=df.index)

# Replace the five maturity columns by their natural logarithm.
# NOTE: the columns are overwritten in place, so running this cell twice
# applies the log twice.
for i in range(1, 6):
    df[f'Maturity {i}'] = np.log(df[f'Maturity {i}'])

# Estimation window: 1988-01-01 .. 2016-01-01, both bounds inclusive.
# The explicit copy makes the 't' assignment below write to an independent
# frame rather than to a filtered slice (avoids SettingWithCopyWarning).
df = df[(df.index >= '1988-01-01') & (df.index <= '2016-01-01')].copy()

# 't' is the value of get_t for each date of the index.
df['t'] = df.index.to_series().apply(get_t)
df

In [None]:
# Reload the raw panel to build the out-of-sample frame.
df_oos = pd.read_pickle('src/data/panel_data.pkl')

# Use the dates as a DatetimeIndex; unparseable entries become NaT.
if 'index' in df_oos.columns:
    df_oos.set_index('index', inplace=True)
df_oos.index = pd.to_datetime(df_oos.index, errors='coerce')

# Hold-out window: dates strictly after 2016-01-01 and up to 2017-01-01
# inclusive, with the outlier dates removed. The explicit copy makes the
# column assignments below write to an independent frame rather than to a
# filtered slice (avoids SettingWithCopyWarning).
keep = (
    ~df_oos.index.isin(outlier_dates)
    & (df_oos.index > '2016-01-01')
    & (df_oos.index <= '2017-01-01')
)
df_oos = df_oos.loc[keep].copy()

# Coerce every column to numeric; unparseable entries become NaN.
for col in df_oos.columns:
    df_oos[col] = pd.to_numeric(df_oos[col], errors='coerce')

# Derived columns: T1..T5 from get_T, log of the five maturity columns,
# and 't' from get_t.
df_oos[['T1', 'T2', 'T3', 'T4', 'T5']] = pd.DataFrame(df_oos.index.map(lambda x: pd.Series(get_T(x))).tolist(), index=df_oos.index)
for i in range(1, 6):
    df_oos[f'Maturity {i}'] = np.log(df_oos[f'Maturity {i}'])
df_oos['t'] = df_oos.index.to_series().apply(get_t)
df_oos

## Estimation des coefficients de la composante saisonnière

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import warnings

# Silence pandas chained-assignment warnings and every FutureWarning /
# UserWarning from this point on (the filters are process-wide).
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# Make sure the index is a DatetimeIndex (dayofyear is used below).
df.index = pd.to_datetime(df.index)

# Dependent variable of the seasonal regression.
y = df['Maturity 1']

# Annual (2*pi) and semi-annual (4*pi) harmonics of the day of the year.
df['Cos1'] = np.cos(2 * np.pi * df.index.dayofyear / 365.25)
df['Sin1'] = np.sin(2 * np.pi * df.index.dayofyear / 365.25)
df['Cos2'] = np.cos(4 * np.pi * df.index.dayofyear / 365.25)
df['Sin2'] = np.sin(4 * np.pi * df.index.dayofyear / 365.25)

# Regress on the four harmonics plus an intercept.
X = df[['Cos1', 'Sin1', 'Cos2', 'Sin2']]
X = sm.add_constant(X)

model = sm.OLS(y, X).fit()

# Extract the estimated harmonic coefficients by regressor name.
# model.params is a label-indexed Series; integer keys on such a Series only
# work through a deprecated positional fallback, so labels are used instead.
coefficients = model.params
seasonal_coeffs = {
    f'coeff_{name}': coefficients[name]
    for name in ('Cos1', 'Sin1', 'Cos2', 'Sin2')
}

## Estimation du modèle espace d'état

In [None]:
import numpy as np
import scipy.stats as stats
from tqdm import tqdm
from src.model.performance import calculate_performance
from src.utility.parameter import calculate_num_parameters
from src.model.optimisation import optimize_model
from src.utility.parameter import initial_guesses

# Inputs of the in-sample estimation. Columns are selected by name rather
# than by position so the selection does not depend on the column order.
price_cols = [f'Maturity {i}' for i in range(1, 6)]
ttm_cols = [f'T{i}' for i in range(1, 6)]
observations = df[price_cols].values
maturities = df[ttm_cols].values
times = df['t'].values

# Ordered parameter names; an n-factor model uses the first
# calculate_num_parameters(n) entries.
all_param_keys = ['mu', 'sigma1', 'lambda1', 'kappa2', 'sigma2', 'lambda2', 'rho12',
                  'kappa3', 'sigma3', 'lambda3', 'rho13', 'rho23',
                  'kappa4', 'sigma4', 'lambda4', 'rho14', 'rho24', 'rho34']

# Optimisation and performance, for models with 1 to 4 factors.
results = {}
rmse_results = {}
for n_factors in tqdm(range(1, 5)):
    num_params = calculate_num_parameters(n_factors)
    param_keys = all_param_keys[:num_params]

    guess = initial_guesses[n_factors]

    if len(guess) != len(param_keys):
        raise ValueError(f"Length of initial_guesses ({len(guess)}) does not match length of param_keys ({len(param_keys)})")

    final_result = optimize_model(observations, times, maturities, n_factors, guess, seasonal_coeffs)

    results[n_factors] = final_result
    print(f"Optimized parameters for {n_factors} factors:", final_result.x)

    # The inverse Hessian carried by the optimisation result is used as the
    # covariance estimate; when it is not a plain array it is densified first.
    hessian_inv = final_result.hess_inv
    if isinstance(hessian_inv, np.ndarray):
        covariance_matrix = hessian_inv
    else:
        covariance_matrix = hessian_inv.todense()

    # Cholesky succeeds only for a positive-definite matrix; otherwise the
    # standard errors are reported as infinite (z = 0, p-value = 1).
    try:
        np.linalg.cholesky(covariance_matrix)
    except np.linalg.LinAlgError:
        std_errors = np.full(covariance_matrix.shape[0], np.inf)
    else:
        std_errors = np.sqrt(np.diag(covariance_matrix))

    # Two-sided normal p-values. The survival function keeps precision for
    # large |z|, where 1 - cdf rounds to exactly 0.
    z_values = final_result.x / std_errors
    p_values = 2 * stats.norm.sf(np.abs(z_values))

    for name, param, std_err, p_value in zip(param_keys, final_result.x, std_errors, p_values):
        print(f"Parameter {name}: estimate={param}, std_error={std_err}, p_value={p_value}")

    rmse_results[n_factors] = calculate_performance(n_factors, final_result.x, param_keys, observations, times, maturities, seasonal_coeffs)

print("RMSE for in-sample data:")
for n_factors, rmses in rmse_results.items():
    print(f"RMSE for {n_factors} factors: {rmses}")

# Persist the optimisation results so they can be reloaded without
# re-running the estimation.
import pickle
with open('src/data/optimization_results.pkl', 'wb') as f:
    pickle.dump(results, f)

## Estimation du modèle débruité


In [None]:
import numpy as np
import scipy.stats as stats
from tqdm import tqdm
from src.model.performance import calculate_performance
from src.utility.parameter import calculate_num_parameters
from src.model.wavelet import denoise_all_signal 
from src.model.optimisation import optimize_model
from src.utility.parameter import initial_guesses

# Denoise the frame with the project wavelet helper (wavelet='db1', level=1).
df_denoised = denoise_all_signal(df, wavelet='db1', level=1)

# Inputs are taken by column position from the denoised frame:
# columns 0-4 as observations, columns 6-10 as maturities, 't' as time.
# NOTE(review): this relies on the column layout returned by
# denoise_all_signal -- confirm.
observations = df_denoised.iloc[:, :5].values
maturities = df_denoised.iloc[:, 6:11].values
times = df_denoised['t'].values

# Ordered parameter names; an n-factor model uses the first
# calculate_num_parameters(n) entries.
all_param_keys = ['mu', 'sigma1', 'lambda1', 'kappa2', 'sigma2', 'lambda2', 'rho12',
                  'kappa3', 'sigma3', 'lambda3', 'rho13', 'rho23',
                  'kappa4', 'sigma4', 'lambda4', 'rho14', 'rho24', 'rho34']

# Optimisation and performance, for models with 1 to 4 factors.
results, rmse_results = {}, {}

for n_factors in tqdm(range(1, 5)):
    num_params = calculate_num_parameters(n_factors)
    param_keys = all_param_keys[:num_params]

    guess = initial_guesses[n_factors]
    if len(guess) != len(param_keys):
        raise ValueError(f"Length of initial_guesses ({len(guess)}) does not match length of param_keys ({len(param_keys)})")

    final_result = optimize_model(observations, times, maturities, n_factors, guess, seasonal_coeffs)
    results[n_factors] = final_result
    print(f"Optimized parameters for {n_factors} factors:", final_result.x)

    # The inverse Hessian carried by the result serves as covariance estimate;
    # a non-array object is converted with todense().
    hessian_inv = final_result.hess_inv
    covariance_matrix = hessian_inv if isinstance(hessian_inv, np.ndarray) else hessian_inv.todense()

    # A failed Cholesky factorisation means the matrix is not positive
    # definite; standard errors are then reported as infinite.
    try:
        np.linalg.cholesky(covariance_matrix)
        std_errors = np.sqrt(np.diag(covariance_matrix))
    except np.linalg.LinAlgError:
        std_errors = np.full(covariance_matrix.shape[0], np.inf)

    # Two-sided normal p-values of the z statistics.
    z_values = final_result.x / std_errors
    p_values = [2 * (1 - stats.norm.cdf(abs_z)) for abs_z in np.abs(z_values)]

    for idx, (estimate, std_err, p_value) in enumerate(zip(final_result.x, std_errors, p_values)):
        print(f"Parameter {param_keys[idx]}: estimate={estimate}, std_error={std_err}, p_value={p_value}")

    rmse_results[n_factors] = calculate_performance(n_factors, final_result.x, param_keys, observations, times, maturities, seasonal_coeffs)

# Report the in-sample RMSE of each model.
print("RMSE for in-sample data:")
for n_factors, rmses in rmse_results.items():
    print(f"RMSE for {n_factors} factors: {rmses}")

# Save the results for later reuse.
import pickle
with open('src/data/optimization_results_denoised.pkl', 'wb') as f:
    pickle.dump(results, f)


## Out of sample tracking result

In [None]:
import numpy as np
from tqdm import tqdm
from src.model.performance import calculate_performance
from src.utility.parameter import calculate_num_parameters

# Reload the saved optimisation results.
# (pickle.load can execute code embedded in the file: only load files that
# this notebook produced.)
import pickle
with open('src/data/optimization_results.pkl', 'rb') as f:
    results = pickle.load(f)

# Out-of-sample inputs are read from df_oos, the hold-out frame, and not from
# the frame used for estimation. Columns are selected by name so the
# selection does not depend on the column order.
price_cols = [f'Maturity {i}' for i in range(1, 6)]
ttm_cols = [f'T{i}' for i in range(1, 6)]
observations = df_oos[price_cols].values
maturities = df_oos[ttm_cols].values
times = df_oos['t'].values

# Ordered parameter names; an n-factor model uses the first
# calculate_num_parameters(n) entries.
all_param_keys = ['mu', 'sigma1', 'lambda1', 'kappa2', 'sigma2', 'lambda2', 'rho12',
                  'kappa3', 'sigma3', 'lambda3', 'rho13', 'rho23',
                  'kappa4', 'sigma4', 'lambda4', 'rho14', 'rho24', 'rho34']

# Number of leading rows in the first evaluation window.
forecast_horizon = 5

# Out-of-sample performance, for models with 1 to 4 factors.
rmse_oos_results = {}

for n_factors in tqdm(range(1, 5)):
    num_params = calculate_num_parameters(n_factors)
    param_keys = all_param_keys[:num_params]

    # Parameters estimated in-sample for this number of factors.
    final_result = results[n_factors]
    optimized_params = final_result.x

    # One list of errors per maturity column.
    rmse_forecast = [[] for _ in range(observations.shape[1])]

    for t in range(len(times) - forecast_horizon):
        window_end = t + forecast_horizon

        # The model is evaluated on rows [0, window_end) and compared with
        # the row at position window_end, i.e. the first row after the window.
        current_observations = observations[:window_end]
        current_times = times[:window_end]
        current_maturities = maturities[:window_end]
        future_observations = observations[window_end]

        # NOTE(review): the estimation cells store the return value of
        # calculate_performance as the in-sample RMSE, whereas it is used
        # here as predicted prices -- confirm what the function returns.
        predicted_prices = np.array(calculate_performance(n_factors, optimized_params, param_keys, current_observations, current_times, current_maturities, seasonal_coeffs))
        actual_prices = np.array(future_observations)

        # Promote 1-D results to a single-row 2-D array.
        if predicted_prices.ndim == 1:
            predicted_prices = predicted_prices.reshape(1, -1)
        if actual_prices.ndim == 1:
            actual_prices = actual_prices.reshape(1, -1)

        # Per-maturity error. For a single value sqrt(mean(d ** 2)) equals
        # |d|, so each entry is an absolute error.
        for maturity_index, errors in enumerate(rmse_forecast):
            gap = predicted_prices[0, maturity_index] - actual_prices[0, maturity_index]
            errors.append(np.sqrt(np.mean(gap ** 2)))

    # Average of the per-step errors for each maturity.
    mean_rmse_forecast = [np.mean(rmse_list) for rmse_list in rmse_forecast]
    rmse_oos_results[n_factors] = mean_rmse_forecast

# Report the out-of-sample results.
# NOTE(review): values are printed with a '%' suffix but are not multiplied
# by 100 anywhere in this cell -- confirm the intended unit.
print("RMSE for out-of-sample data (5-day ahead forecasts):")
for n_factors, mean_rmse_list in rmse_oos_results.items():
    print(f"Out-of-sample RMSE for {n_factors} factors:")
    for maturity_index, mean_rmse in enumerate(mean_rmse_list):
        print(f"  Maturity {maturity_index + 1}: {mean_rmse:.2f}%")


## Wavelet Out of sample tracking result

In [None]:
import numpy as np
from tqdm import tqdm
from src.model.performance import calculate_performance
from src.utility.parameter import calculate_num_parameters
from src.model.wavelet import denoise_all_signal

# Load the optimisation results saved for the denoised data.
import pickle
with open('src/data/optimization_results_denoised.pkl', 'rb') as f:
    results = pickle.load(f)

# Denoise the out-of-sample frame (wavelet='db1', level=1).
df_oos_denoised = denoise_all_signal(df_oos, wavelet='db1', level=1)

# Inputs are taken by column position from the denoised frame:
# columns 0-4 as observations, columns 6-10 as maturities, 't' as time.
observations_oos = df_oos_denoised.iloc[:, :5].values
maturities_oos = df_oos_denoised.iloc[:, 6:11].values
times_oos = df_oos_denoised['t'].values

# Ordered parameter names; an n-factor model uses the first
# calculate_num_parameters(n) entries.
all_param_keys = ['mu', 'sigma1', 'lambda1', 'kappa2', 'sigma2', 'lambda2', 'rho12',
                  'kappa3', 'sigma3', 'lambda3', 'rho13', 'rho23',
                  'kappa4', 'sigma4', 'lambda4', 'rho14', 'rho24', 'rho34']

# Number of leading rows in the first evaluation window.
forecast_horizon = 5

# Out-of-sample performance, for models with 1 to 4 factors.
rmse_oos_results = {}

for n_factors in tqdm(range(1, 5)):
    num_params = calculate_num_parameters(n_factors)
    param_keys = all_param_keys[:num_params]

    # Parameters stored for this number of factors.
    final_result = results[n_factors]
    optimized_params = final_result.x

    # One list of errors per maturity column.
    rmse_forecast = [[] for _ in range(observations_oos.shape[1])]

    for t in range(len(times_oos) - forecast_horizon):
        window_end = t + forecast_horizon

        # Rows [0, window_end) feed the model; the row at window_end is the
        # value it is compared with.
        current_observations = observations_oos[:window_end]
        current_times = times_oos[:window_end]
        current_maturities = maturities_oos[:window_end]
        future_observations = observations_oos[window_end]

        predicted_prices = np.array(calculate_performance(n_factors, optimized_params, param_keys, current_observations, current_times, current_maturities, seasonal_coeffs))
        actual_prices = np.array(future_observations)

        # Promote 1-D results to a single-row 2-D array.
        if predicted_prices.ndim == 1:
            predicted_prices = predicted_prices.reshape(1, -1)
        if actual_prices.ndim == 1:
            actual_prices = actual_prices.reshape(1, -1)

        # Per-maturity error between the model output and the compared row.
        for maturity_index, errors in enumerate(rmse_forecast):
            gap = predicted_prices[0, maturity_index] - actual_prices[0, maturity_index]
            errors.append(np.sqrt(np.mean(gap ** 2)))

    # Average of the per-step errors for each maturity.
    mean_rmse_forecast = [np.mean(errors) for errors in rmse_forecast]
    rmse_oos_results[n_factors] = mean_rmse_forecast

# Report the out-of-sample results.
print("RMSE for out-of-sample data (5-day ahead forecasts):")
for n_factors, mean_rmse_list in rmse_oos_results.items():
    print(f"Out-of-sample RMSE for {n_factors} factors:")
    for maturity_index, mean_rmse in enumerate(mean_rmse_list):
        print(f"  Maturity {maturity_index + 1}: {mean_rmse:.2f}%")
