# ENSO Analysis

## Dependencias

In [5]:
from numba.core.errors import NumbaDeprecationWarning, NumbaPendingDeprecationWarning
import warnings

warnings.simplefilter('ignore', category=NumbaDeprecationWarning)
warnings.simplefilter('ignore', category=NumbaPendingDeprecationWarning)


from umap import UMAP
# General Software Dependencies
import pickle
import pathlib
import datetime as dt
from datetime import datetime

# Mathematical Dependencies
import numpy as np
import pandas as pd
import gudhi as gd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from ipywidgets import widgets
from IPython.display import display


# Dependencias específicas
from gtda.time_series import SingleTakensEmbedding, takens_embedding_optimal_parameters
from gtda.homology import VietorisRipsPersistence, WeakAlphaPersistence
from gtda.diagrams import BettiCurve
from sklearn.decomposition import PCA
from gtda.plotting import plot_point_cloud
from sklearn.ensemble import IsolationForest
from sklearn.manifold import Isomap, LocallyLinearEmbedding, SpectralEmbedding, TSNE, MDS
from sklearn.decomposition import PCA


# Shared Plotly sizing, reused as the layout width/height of the figures below.
# The width is derived from the height through a width:height aspect ratio.
width_ratio, height_ratio = 10, 10
desired_height = 450  # base figure height; the width follows from the ratio
desired_width = (desired_height * width_ratio) / height_ratio
# Default figure margins (left, right, top, bottom).
margin = {"l": 30, "r": 30, "t": 30, "b": 30}

## Funciones auxiliares

In [2]:
def parse_space_separated_file_to_dataframe(filename):
    """Read a whitespace-delimited year-by-month table into a DataFrame.

    Every non-blank line is split on whitespace; the first field is treated
    as the year and the remaining fields as monthly values. All values are
    kept as the raw strings read from the file.

    Parameters
    ----------
    filename : str or path-like
        Path of the text file to read.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line, with columns ``"Year"`` followed by the
        integers 1 through 12.
    """
    column_labels = ["Year", *range(1, 13)]

    with open(filename, 'r') as handle:
        # str.split() without arguments ignores leading/trailing whitespace;
        # lines that contain only whitespace are dropped.
        records = [text.split() for text in handle if text.strip()]

    return pd.DataFrame(records, columns=column_labels)


def parse_general_file(file_name):
    """Parse a whitespace-delimited monthly index table into a numeric DataFrame.

    The first non-blank line is used as the header and must contain the
    fields ``YR`` and ``MON``; every following non-blank line is one record.
    The table must have exactly ten columns, which are renamed positionally.

    Parameters
    ----------
    file_name : str or path-like
        Path of the text file to read.

    Returns
    -------
    pandas.DataFrame
        Numeric frame sorted by a ``DatetimeIndex`` named ``date`` (built from
        ``YR`` and ``MON``), with columns ``year``, ``month``, ``nino1.2``,
        ``anom_nino1.2``, ``nino3``, ``anom_nino3``, ``nino4``, ``anom_nino4``,
        ``nino3.4`` and ``anom_nino3.4``.

    Raises
    ------
    ValueError
        If the file has no non-blank lines, or the table does not have
        exactly ten columns.
    KeyError
        If the header lacks ``YR`` or ``MON``.
    """
    column_names = ['year', 'month', 'nino1.2', 'anom_nino1.2',
                    'nino3', 'anom_nino3', 'nino4', 'anom_nino4',
                    'nino3.4', 'anom_nino3.4']

    with open(file_name, 'r') as file:
        # One list of string fields per non-blank line.
        rows = [line.split() for line in file if line.strip()]

    if not rows:
        raise ValueError(f"{file_name!r} contains no header or data lines")

    table = pd.DataFrame(rows)
    # Explicit copy: adding or renaming columns on a bare slice of `table`
    # raises SettingWithCopyWarning on pandas versions without Copy-on-Write.
    gen_df = table.iloc[1:].copy()
    gen_df.columns = table.iloc[0].tolist()

    # Build the dates while the original header names are still in place.
    dates = pd.to_datetime(
        gen_df["YR"].astype(str) + "-" + gen_df["MON"].astype(str),
        format="%Y-%m",
    )

    gen_df.columns = column_names  # positional rename; length mismatch raises
    gen_df = gen_df.apply(pd.to_numeric)
    gen_df.index = pd.DatetimeIndex(dates, name='date')

    return gen_df.sort_index()


def fit_embedder(embedder: SingleTakensEmbedding, y: np.ndarray, verbose: bool=True):
    """Fit a Takens embedder on ``y`` and return the embedding and its delay.

    Parameters
    ----------
    embedder : SingleTakensEmbedding
        Embedder to fit; it is fitted as a side effect of this call.
    y : array-like
        Time series handed to ``embedder.fit_transform``.
    verbose : bool, default True
        When true, print the embedded shape together with the fitted
        embedding dimension and time delay.

    Returns
    -------
    tuple
        ``(y_embedded, delay)``: the output of ``fit_transform`` and the
        embedder's fitted ``time_delay_``.
    """
    y_embedded = embedder.fit_transform(y)
    delay = embedder.time_delay_

    # Quiet mode: hand the results back without reporting anything.
    if not verbose:
        return y_embedded, delay

    print(f"Shape of embedded time series: {y_embedded.shape}")
    print(
        f"Optimal embedding dimension is {embedder.dimension_} and time delay is {delay:.4f}"
    )
    return y_embedded, delay


def periodicity_analysis(max_embedding_dimension: int, 
                         max_time_delay: int, 
                         stride: int, 
                         y: pd.Series, 
                         var_name: str,
                         cluster: callable, 
                         persistence: callable):
    """Embed a time series and display its topological summary figures.

    The function shows four figures in order: the raw series, a point cloud
    of the reduced Takens embedding, its persistence diagram and its Betti
    curves. Figure sizes come from the module-level ``desired_width`` and
    ``desired_height``.

    Parameters
    ----------
    max_embedding_dimension : int
        Passed as ``dimension`` to a ``SingleTakensEmbedding`` created with
        ``parameters_type="search"``.
    max_time_delay : int
        Passed as ``time_delay`` to the same embedder.
    stride : int
        Passed as ``stride`` to the same embedder.
    y : pd.Series
        Series to analyse; its index is used for the x axis of the first plot.
    var_name : str
        Label used in figure titles and axis labels.
    cluster : object
        Despite the annotation, this must expose ``fit_transform``; it is
        applied to the embedded series before persistence is computed.
    persistence : object
        Must expose ``fit_transform`` and ``plot``.

    Returns
    -------
    None
        Everything is reported through printed text and displayed figures.
    """
    takens = SingleTakensEmbedding(
        parameters_type="search",
        n_jobs=-1,
        time_delay=max_time_delay,
        dimension=max_embedding_dimension,
        stride=stride,
    )
    embedded, delay = fit_embedder(takens, y)
    print(f"Time delay: {delay}")

    # 1) Raw time series.
    series_trace = go.Scatter(
        x=y.index.astype(str),
        y=y,
        marker_color='indianred',
        text=f"{var_name} idx",
    )
    series_fig = go.Figure(data=series_trace)
    series_fig.update_layout(
        title=f'Time series for: {var_name}',
        xaxis={"title": "Date"},
        yaxis={"title": f"{var_name} idx"},
        showlegend=False,
        width=desired_width*2.11,
        height=desired_height,
    )
    series_fig.show()

    # 2) Reduced embedding; only the first three coordinates are plotted.
    reduced = cluster.fit_transform(embedded)
    cloud_fig = plot_point_cloud(reduced[:, :3])
    cloud_fig.update_layout(
        title=f'Encaje de {var_name}',
        width=desired_width,
        height=desired_height,
        margin=dict(l=20, r=20, t=30, b=30),
    )
    cloud_fig.show()

    # 3) Persistence diagram; a leading axis is added so the point cloud is
    #    passed as a collection containing a single cloud.
    diagrams = persistence.fit_transform(reduced[None, :, :])
    diagram_fig = persistence.plot(diagrams)
    diagram_fig.update_layout(
        title=f'Diagrama de Persistencia de {var_name}',
        height=desired_height,
    )
    diagram_fig.show()

    # 4) Betti curves derived from the persistence diagram.
    betti_curve = BettiCurve()
    curves = betti_curve.fit_transform(diagrams)
    curves_fig = betti_curve.plot(curves)
    curves_fig.update_layout(
        title=f'Curvas de Betti de {var_name}',
        width=desired_width,
        height=desired_height,
    )
    curves_fig.show()

## Lectura de datos

In [3]:
# Long Niño 3.4 anomaly record: reshape the year-by-month table into a single
# float series indexed and sorted by month-start date.
df = (
    parse_space_separated_file_to_dataframe('db/nino34.long.anom.data.txt')
    .melt(id_vars=["Year"], var_name="Month", value_name='anom_nino3.4')
    .assign(
        date=lambda wide: pd.to_datetime(
            wide["Year"].astype(str) + "-" + wide["Month"].astype(str),
            format="%Y-%m",
        )
    )
    .loc[:, ['date', 'anom_nino3.4']]
    .set_index('date')
    .astype({'anom_nino3.4': float})
    .sort_index()
)

# Monthly index table; this is the frame used by the analysis cells below.
gen_df = parse_general_file('db/sstoi.indices.txt')


## Análisis de periodicidad

### Niño 1.2

In [11]:
# Periodicity analysis of the 'nino1.2' column over the full record
# (no date window), using a spectral embedding as the reduction step.
var = 'nino1.2'
data = gen_df[var]

# Search bounds for the Takens embedding.
max_embedding_dimension = 10
max_time_delay = 10
stride = 1
se = SpectralEmbedding(n_components=4, random_state=0)
persistence = VietorisRipsPersistence(
    homology_dimensions=[0, 1, 2],
    n_jobs=-1
)

periodicity_analysis(max_embedding_dimension=max_embedding_dimension,
                     max_time_delay=max_time_delay,
                     stride=stride,
                     y=data,
                     var_name=var,
                     cluster=se,
                     persistence=persistence
                     )

Shape of embedded time series: (475, 8)
Optimal embedding dimension is 8 and time delay is 3.0000
Time delay: 3


### Anomalías Niño 1.2

In [26]:
# Periodicity analysis of the 'anom_nino1.2' column, restricted to the
# window January 2000 through January 2020 (label slicing includes both ends).
var = 'anom_nino1.2'
dt_start = datetime(2000, 1, 1)
dt_end = datetime(2020, 1, 1)
data = gen_df.loc[dt_start:dt_end, var]

# Search bounds for the Takens embedding; UMAP is the reduction step.
max_embedding_dimension = 10
max_time_delay = 10
stride = 1
um = UMAP(random_state=0, n_components=4)
persistence = VietorisRipsPersistence(
    homology_dimensions=[0, 1, 2],
    n_jobs=-1
)

periodicity_analysis(max_embedding_dimension=max_embedding_dimension,
                     max_time_delay=max_time_delay,
                     stride=stride,
                     y=data,
                     var_name=var,
                     cluster=um,
                     persistence=persistence)

Shape of embedded time series: (231, 6)
Optimal embedding dimension is 6 and time delay is 2.0000
Time delay: 2


### Niño 3

In [6]:
# Periodicity analysis of the 'nino3' column, restricted to the window
# January 1992 through January 2005 (label slicing includes both ends).
var = 'nino3'
dt_start = datetime(1992, 1, 1)
dt_end = datetime(2005, 1, 1)
data = gen_df.loc[dt_start:dt_end, var]

# Search bounds for the Takens embedding; UMAP is the reduction step.
max_embedding_dimension = 10
max_time_delay = 10
stride = 1
um = UMAP(random_state=0, n_components=4)
persistence = VietorisRipsPersistence(
    homology_dimensions=[0, 1, 2],
    n_jobs=-1
)

periodicity_analysis(max_embedding_dimension=max_embedding_dimension,
                     max_time_delay=max_time_delay,
                     stride=stride,
                     y=data,
                     var_name=var,
                     cluster=um,
                     persistence=persistence)

Shape of embedded time series: (152, 6)
Optimal embedding dimension is 6 and time delay is 1.0000
Time delay: 1


### Anomalías Niño 3

In [9]:
# Periodicity analysis of the 'anom_nino3' column over the requested window
# January 1980 through January 1995 (label slicing includes both ends).
var = var_name = 'anom_nino3'
dt_start, dt_end = datetime(1980, 1, 1), datetime(1995, 1, 1)
data = y = gen_df.loc[dt_start:dt_end, var]

# Search bounds for the Takens embedding.
max_embedding_dimension, max_time_delay, stride = 10, 10, 1

# Reduction step (UMAP) and persistence estimator.
um = UMAP(n_components=4, random_state=0)
persistence = VietorisRipsPersistence(n_jobs=-1, homology_dimensions=[0, 1, 2])

periodicity_analysis(
    y=y,
    var_name=var_name,
    cluster=um,
    persistence=persistence,
    max_embedding_dimension=max_embedding_dimension,
    max_time_delay=max_time_delay,
    stride=stride,
)

Shape of embedded time series: (139, 7)
Optimal embedding dimension is 7 and time delay is 3.0000
Time delay: 3


### Niño 4

In [13]:
# Periodicity analysis of the 'nino4' column over the requested window
# January 1980 through January 1997 (label slicing includes both ends).
var = 'nino4'
dt_start = datetime(1980, 1, 1)
dt_end = datetime(1997, 1, 1)
data = gen_df.loc[dt_start:dt_end, var]

# Search bounds for the Takens embedding; UMAP is the reduction step.
max_embedding_dimension = 10
max_time_delay = 10
stride = 1
um = UMAP(random_state=0, n_components=4)
persistence = VietorisRipsPersistence(
    homology_dimensions=[0, 1, 2],
    n_jobs=-1
)

periodicity_analysis(max_embedding_dimension=max_embedding_dimension,
                     max_time_delay=max_time_delay,
                     stride=stride,
                     y=data,
                     var_name=var,
                     cluster=um,
                     persistence=persistence)

Shape of embedded time series: (163, 7)
Optimal embedding dimension is 7 and time delay is 3.0000
Time delay: 3


### Anomalías Niño 4

In [14]:
# Periodicity analysis of the 'anom_nino4' column over the full record
# (no date window is applied in this cell).
var = 'anom_nino4'
data = gen_df[var]

# Search bounds for the Takens embedding; UMAP is the reduction step.
max_embedding_dimension = 10
max_time_delay = 10
stride = 1
um = UMAP(random_state=0, n_components=4)
persistence = VietorisRipsPersistence(
    homology_dimensions=[0, 1, 2],
    n_jobs=-1
)

periodicity_analysis(max_embedding_dimension=max_embedding_dimension,
                     max_time_delay=max_time_delay,
                     stride=stride,
                     y=data,
                     var_name=var,
                     cluster=um,
                     persistence=persistence)

Shape of embedded time series: (426, 8)
Optimal embedding dimension is 8 and time delay is 10.0000
Time delay: 10


### Niño 3.4

In [20]:
# Periodicity analysis of the 'nino3.4' column, restricted to the window
# January 1982 through January 2006 (label slicing includes both ends).
var = 'nino3.4'
dt_start = datetime(1982, 1, 1)
dt_end = datetime(2006, 1, 1)
data = gen_df.loc[dt_start:dt_end, var]

# Search bounds for the Takens embedding; UMAP is the reduction step.
max_embedding_dimension = 10
# NOTE(review): this cell searches delays up to 13 while the other cells
# use 10 — confirm the difference is intentional.
max_time_delay = 13
stride = 1
um = UMAP(random_state=0, n_components=4)
persistence = VietorisRipsPersistence(
    homology_dimensions=[0, 1, 2],
    n_jobs=-1
)

periodicity_analysis(max_embedding_dimension=max_embedding_dimension,
                     max_time_delay=max_time_delay,
                     stride=stride,
                     y=data,
                     var_name=var,
                     cluster=um,
                     persistence=persistence)

Shape of embedded time series: (265, 7)
Optimal embedding dimension is 7 and time delay is 4.0000
Time delay: 4


### Anomalías Niño 3.4

In [16]:
# Periodicity analysis of the 'anom_nino3.4' column over the requested window
# January 1980 through January 1995 (label slicing includes both ends).
var = 'anom_nino3.4'
dt_start = datetime(1980, 1, 1)
dt_end = datetime(1995, 1, 1)
data = gen_df.loc[dt_start:dt_end, var]

# Search bounds for the Takens embedding; UMAP is the reduction step.
max_embedding_dimension = 10
max_time_delay = 10
stride = 1
um = UMAP(random_state=0, n_components=4)
persistence = VietorisRipsPersistence(
    homology_dimensions=[0, 1, 2],
    n_jobs=-1
)

periodicity_analysis(max_embedding_dimension=max_embedding_dimension,
                     max_time_delay=max_time_delay,
                     stride=stride,
                     y=data,
                     var_name=var,
                     cluster=um,
                     persistence=persistence)

Shape of embedded time series: (149, 5)
Optimal embedding dimension is 5 and time delay is 2.0000
Time delay: 2
