## VAE Hawkes Process Estimation - Tutorial

In [1]:
import os
import sys

import numpy as np
import pandas as pd
import Hawkes as hk
from matplotlib import pyplot as plt

from HAWKES.hawkes import hawkes_simulations, hawkes_simulation
from HAWKES.hyperparameters import hyper_params_simulation
from HAWKES.discretisation import discretise

%load_ext autoreload
%autoreload 2

### Preprocessing

In [2]:
# Training/Validation/Testing dataset generation

# Intensity Decay Parameter (β) = U(p = 1, q = 3)
# Branching Ratio (η) = U(a = 0.05, b = 0.8)
# Expected Activity (E) = 500
# Time Horizon (T) = 100
# Interval Length (∆) = 1
# Number of processes = 160_000

# Hawkes process hyper-parameters generation
train_params, train_alpha, train_beta, train_mu = hyper_params_simulation(filename="train_hawkes_hyperparams.csv")

# Hawkes processes simulations
train_simulated_events_seqs = hawkes_simulations(train_mu, train_alpha, train_beta, filename='train_hawkes_simulations.csv')

# Discrétiser les processus de Hawkes
train_discret_simulated_events_seqs = discretise(train_simulated_events_seqs, filename='train_binned_hawkes_simulations.csv')

In [20]:
import os
import random
import string

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import fastparquet as fp
import VARIABLES.preprocessing_var as prep

# Création de données pour le test
data = {
    'col1': [random.randint(0, 100) for _ in range(1000000)],
    'col2': [random.uniform(0, 1) for _ in range(1000000)],
    'col3': [''.join(random.choices(string.ascii_letters, k=10)) for _ in range(1000000)],
    'col4': [random.choice([True, False]) for _ in range(1000000)]
}

def write_parquet(data, filename):
    
    try:
        # Create a Pandas dataframe from the data dictionary
        df = pd.DataFrame(data)

        # Convert the Pandas dataframe to a PyArrow table
        table = pa.Table.from_pandas(df)

        # Save the PyArrow table to a Parquet file
        with pq.ParquetWriter(os.path.join(prep.FILEPATH, filename), table.schema) as writer:
            writer.write_table(table)
                    
    except IOError as e:
        print(f"Cannot read the file: {e}.")

def write_parquet2(data, filename,  write_index=False, compression=None):
    """
    Save data to a Parquet file using Fastparquet

    Args:
        data (dict): Dictionary to be saved
        filename (str): Name of the file to save data to

    Returns:
        None
    """

    # Save the Pandas dataframe to a Parquet file using Fastparquet
    try:
        # Create a Pandas dataframe from the data dictionary
        fp.write(os.path.join(prep.FILEPATH, filename), pd.DataFrame(data), 
                 write_index=write_index, compression=compression)

    except IOError as e:
        print(f"Cannot write to file: {e}")

# CSV file writing function

def write_csv(data, filename: str = '', mode: str = 'w', encoding: str = 'utf-8') -> None:

    try:
        if not isinstance(data, list):
            data = [data]

        # Written and field names initialisation
        with open(f"{os.path.join(prep.FILEPATH, filename)}", mode=mode, encoding=encoding) as file:
            file.write(','.join(data[0].keys()))
            file.write('\n')
        
            # Lines iteration
            for row in data:
                file.write(','.join(str(x) for x in row.values()))
                file.write('\n')
        
        # Closed file    
        file.close()
                    
    except IOError as e:
        print(f"Cannot read the file: {e}.")


%timeit -r 10 -n 1 write_parquet(data, 'test_pyarrow.pt')
%timeit -r 10 -n 1 write_parquet2(data, 'test_fastparquet.pt')
%timeit -r 10 -n 1 write_csv(data, 'test.csv')

382 ms ± 13.7 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)
341 ms ± 7.22 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)
460 ms ± 5.05 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


In [11]:
pd.read_csv(os.path.join(prep.FILEPATH, "train_hawkes_simulations.csv"))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.075232,0.214258,0.433004,0.466261,1.051495,1.222863,1.264147,1.376551,1.687387,1.817105,...,17.843018,17.980316,18.123817,18.377844,18.634233,18.738886,19.327684,19.376330,19.460674,19.830723
1,0.212321,0.280106,0.298136,0.377446,0.678517,1.259310,1.323890,1.655072,1.665220,1.789182,...,14.721143,14.845669,14.980983,15.337008,15.713575,15.838463,15.910459,16.110489,16.143108,16.288967
2,0.521453,0.717799,1.449469,2.327657,2.464980,2.514784,2.565648,2.727893,3.095016,4.391651,...,27.681791,27.800257,27.978750,28.039724,28.317173,28.338950,28.447407,28.554770,28.740635,29.435820
3,0.416529,0.473798,0.676673,1.725232,1.841108,1.928380,2.068723,2.086218,2.101057,2.194464,...,24.670874,24.685550,24.689577,24.760384,24.786581,24.934720,25.115370,25.147995,25.396864,25.706055
4,0.541677,0.589448,1.304485,1.306719,1.716082,1.927788,1.953077,2.184153,2.537551,2.603254,...,22.477884,22.915432,23.401283,23.682564,23.712296,23.740320,23.765224,24.493631,24.537977,24.645735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159995,0.056233,0.669699,0.688259,0.807575,1.277384,1.670343,1.928391,2.722466,2.820992,2.981689,...,18.535767,18.616642,18.724335,18.727667,18.761744,18.901098,18.925518,19.011805,19.319277,19.565487
159996,0.049304,1.834079,2.015447,2.141831,2.238605,2.778455,4.122280,4.330038,4.407625,4.653410,...,29.215939,29.239660,29.755886,29.781507,29.837940,29.844807,29.945670,30.108618,30.342749,30.611872
159997,0.278197,0.491103,0.624112,0.647751,0.648986,0.943302,0.981370,1.067892,1.164237,1.220741,...,15.171423,16.548428,17.066635,19.515068,19.723171,20.067421,20.095463,20.130419,20.496716,20.946672
159998,0.500228,0.545297,1.934724,2.282275,2.552806,2.671431,2.821720,2.831318,2.947640,2.969297,...,20.136528,21.045647,21.289520,21.311716,21.623220,21.659018,21.814835,21.931420,22.314926,22.678570
