In [32]:
import requests
import time
import pandas as pd
from requests.exceptions import HTTPError, Timeout


# I chose the closing price since it provides a better overview of the market by the end of the day. However, 
# I believe that this choice entirely depends on the purpose of the task. For instance, in a market with significant 
# fluctuations throughout the day, a weighted average might offer more insightful information.

# I chose to use BTCIRT and ETHIRT because Nobitex did not have other required data, and accessing additional 
# resources involved too many authentication steps, including selfies. I did not want to provide my data for 
# an entry task, considering that the purpose of this task is different and can be fulfilled with any data.

# API URL
api_url1 = "https://api.nobitex.ir/market/udf/history?symbol=BTCIRT&resolution=1&from=1669843800&to=1701376200"
api_url2 = "https://api.nobitex.ir/market/udf/history?symbol=ETHIRT&resolution=1&from=1669843800&to=1701376200"

def simulate_api_downtime(api):
    print("\nSimulating API Downtime:")
    try:
        # Simulate API Downtime
        response = requests.get(api)
        response.raise_for_status()  # Raise an exception for 4xx and 5xx status codes

        # Process the API response if successful
        data = pd.DataFrame(response.json())
        columns = ['t', 'c', 'v']
        price = data[columns]
        return price

    except requests.exceptions.RequestException as e:
        print(f"Error during API request: {e}")

def simulate_rate_limit_exceeded(api):
    print("\nSimulating Rate Limit Exceeded:")
    try:
        # Simulate Rate Limit Exceeded
        for _ in range(6):  # Exceeding the rate limit (assuming rate limit is 5 requests per second)
            response = requests.get(api)
            response.raise_for_status()
            data = pd.DataFrame(response.json())
            columns = ['t','c', 'v']
            price = data[columns]
            return price
            time.sleep(0.2)  # Introduce a delay between requests

    except requests.exceptions.RequestException as e:
        print(f"Error during API request: {e}")

def simulate_data_inconsistency(api):
    print("\nSimulating Data Inconsistency:")
    try:
        # Simulate Data Inconsistency 
        response = requests.get(api)
        response.raise_for_status()

        data = pd.DataFrame(response.json())
        columns = ['t', 'c', 'v']
        price = data[columns]

        # Check for missing values
        if price.isnull().any().any():
            raise ValueError("DataFrame contains missing values.")

        # Check if all values in each column have the same type
        for column in price.columns:
            if price[column].apply(type).nunique() != 1:
                raise TypeError(f"Column '{column}' has inconsistent data types.")

        # Add more checks as needed...

        print("No data inconsistencies found.")


        # Modify the API response to introduce inconsistency
        
        
        return price

    except requests.exceptions.RequestException as e:
        print(f"Error during API request: {e}")

# Simulate API downtime
print(simulate_api_downtime(api_url1))

# Simulate rate limit exceeded
print(simulate_rate_limit_exceeded(api_url1))

# Simulate data inconsistency
print(simulate_data_inconsistency(api_url1))

# Simulate API downtime
print(simulate_api_downtime(api_url2))

# Simulate rate limit exceeded
print(simulate_rate_limit_exceeded(api_url2))

# Simulate data inconsistency
print(simulate_data_inconsistency(api_url2))


Simulating API Downtime:


              t             c         v
0    1701346200  1.918500e+09  0.001246
1    1701346260  1.919396e+09  0.030425
2    1701346320  1.917735e+09  0.002350
3    1701346380  1.917735e+09  0.019920
4    1701346440  1.916361e+09  0.012854
..          ...           ...       ...
495  1701375900  1.910616e+09  0.026834
496  1701375960  1.913609e+09  0.007366
497  1701376020  1.914271e+09  0.047383
498  1701376080  1.914000e+09  0.013993
499  1701376140  1.915000e+09  0.039812

[500 rows x 3 columns]

Simulating Rate Limit Exceeded:
              t             c         v
0    1701346200  1.918500e+09  0.001246
1    1701346260  1.919396e+09  0.030425
2    1701346320  1.917735e+09  0.002350
3    1701346380  1.917735e+09  0.019920
4    1701346440  1.916361e+09  0.012854
..          ...           ...       ...
495  1701375900  1.910616e+09  0.026834
496  1701375960  1.913609e+09  0.007366
497  1701376020  1.914271e+09  0.047383
498  1701376080  1.914000e+09  0.013993
499  1701376140  1.9150

In [33]:
# Extracting the BCT-ETH pairs from the primary BTCIRT and ETHIRT pairs using "triangular arbitage-cross rate" process.

# Let's say the data retreval was done with no errors. Let's use one of them to get our needed data.

dataBTC = simulate_api_downtime(api_url1)
dataETH = simulate_api_downtime(api_url2)

def cross_rate(df1, df2):
    finalClosedPrice = pd.DataFrame({'closed_price': df1.c.values / df2.c.values, 'open_time': df1.t.values})

    return finalClosedPrice

dataBTCETH =cross_rate(dataBTC, dataETH)

print(dataBTCETH)



Simulating API Downtime:

Simulating API Downtime:
     closed_price   open_time
0       18.429395  1701346200
1       18.496261  1701346260
2       18.480258  1701346320
3       18.480256  1701346380
4       18.467012  1701346440
..            ...         ...
495     18.429783  1701375900
496     18.458654  1701375960
497     18.465637  1701376020
498     18.463023  1701376080
499     18.472788  1701376140

[500 rows x 2 columns]


In [34]:
# 1.2 Resampling: 
# Resampling data by averaging every 5, 20, and 60 rows (note: we lack data for 1440 rows).
# Since the time intervals are equal, TWAP is like a normal average. Additionally, given that only
# the closing price is available and there is no means of obtaining the traded volume for
# BTC-ETC (none that I can think of), VWAP is deemed useless.

class Resampling:
    def __init__(self, resample_to):
        self.resample_to = resample_to

    def resampling(self, df):
        # Make a copy of the DataFrame to avoid modifying the original
        dummy_dataframe = df.copy()

        dummy_dataframe['open_time'] = pd.to_datetime(df['open_time'], unit='s')
        dummy_dataframe.set_index('open_time', inplace=True)

        new_dataframe = dummy_dataframe.resample(self.resample_to).mean()

        return new_dataframe


class ResamplingTo5(Resampling):
    def __init__(self):
        # Unique number for this subclass
        super().__init__('5T')

class ResamplingTo20(Resampling):
    def __init__(self):
        # Unique number for this subclass
        super().__init__('20T')

class ResamplingTo60(Resampling):
    def __init__(self):
        # Unique number for this subclass
        super().__init__('6T')

dataBTCETH_5t_self = ResamplingTo5()
dataBTCETH_5t = dataBTCETH_5t_self.resampling(dataBTCETH)
print(dataBTCETH_5t)


dataBTCETH_20t_self = ResamplingTo20()
dataBTCETH_20t = dataBTCETH_20t_self.resampling(dataBTCETH)
print(dataBTCETH_20t)

dataBTCETH_60t_self = ResamplingTo60()
dataBTCETH_60t = dataBTCETH_60t_self.resampling(dataBTCETH)
print(dataBTCETH_60t)

                     closed_price
open_time                        
2023-11-30 12:10:00     18.470636
2023-11-30 12:15:00     18.448979
2023-11-30 12:20:00     18.435386
2023-11-30 12:25:00     18.442809
2023-11-30 12:30:00     18.439533
...                           ...
2023-11-30 20:05:00     18.436954
2023-11-30 20:10:00     18.430534
2023-11-30 20:15:00     18.483962
2023-11-30 20:20:00     18.474615
2023-11-30 20:25:00     18.457977

[100 rows x 1 columns]
                     closed_price
open_time                        
2023-11-30 12:00:00     18.459808
2023-11-30 12:20:00     18.438541
2023-11-30 12:40:00     18.449367
2023-11-30 13:00:00     18.501283
2023-11-30 13:20:00     18.458225
2023-11-30 13:40:00     18.478223
2023-11-30 14:00:00     18.505371
2023-11-30 14:20:00     18.560873
2023-11-30 14:40:00     18.529450
2023-11-30 15:00:00     18.522984
2023-11-30 15:20:00     18.486893
2023-11-30 15:40:00     18.508039
2023-11-30 16:00:00     18.508069
2023-11-30 16:20:00     

In [35]:
# 1.3 Handling Market Anomalies:
# The good old-school anomaly handling. This step should have been done first thing before other parts.
# But I am just going to do it for the final result for the sake of fulfilling the task needs.

# I cannot just fill it with zeros nor can I drop them since it would cause 
# problems in calculating other time intervals.

# Since the data is minutely and probably not much changes in a 5 min interval
# I am just going to fill missing data using forward-fill.


# Depending on the purpose of our analysis and our goals, various scenarios and anomalies can be addressed, 
# such as data spikes and seasonal changes. However, for the sake of this task, I have opted for 
# the simplest approach.

from scipy.stats import zscore
import numpy as np

def anomaly_handling(df):
    df = df.ffill()

    z_scores = np.abs(zscore(df['closed_price']))
    df_no_outliers = df[(z_scores < 3)]

    return df_no_outliers


no_anomalies = anomaly_handling(dataBTCETH)
print(no_anomalies)

     closed_price   open_time
0       18.429395  1701346200
1       18.496261  1701346260
2       18.480258  1701346320
3       18.480256  1701346380
4       18.467012  1701346440
..            ...         ...
495     18.429783  1701375900
496     18.458654  1701375960
497     18.465637  1701376020
498     18.463023  1701376080
499     18.472788  1701376140

[500 rows x 2 columns]


In [36]:
# 2 EDA
# 2.1 log returns

import numpy as np

def log_reurns(df):
    price_data = df.copy()
    price_data['Log_Return'] = np.log(price_data['closed_price'] / price_data['closed_price'].shift(1))

    return price_data

logReturns = log_reurns(dataBTCETH) 
print(logReturns)

logReturns_5 = log_reurns(dataBTCETH_5t)
print(logReturns_5)

logReturns_20 = log_reurns(dataBTCETH_20t)
print(logReturns_20)

logReturns_60 = log_reurns(dataBTCETH_60t)
print(logReturns_60)

# I did not know prices change more minutely than they do hourly.

     closed_price   open_time    Log_Return
0       18.429395  1701346200           NaN
1       18.496261  1701346260  3.621644e-03
2       18.480258  1701346320 -8.655284e-04
3       18.480256  1701346380 -1.156380e-07
4       18.467012  1701346440 -7.169497e-04
..            ...         ...           ...
495     18.429783  1701375900 -1.565267e-03
496     18.458654  1701375960  1.565266e-03
497     18.465637  1701376020  3.782524e-04
498     18.463023  1701376080 -1.415783e-04
499     18.472788  1701376140  5.287444e-04

[500 rows x 3 columns]
                     closed_price  Log_Return
open_time                                    
2023-11-30 12:10:00     18.470636         NaN
2023-11-30 12:15:00     18.448979   -0.001173
2023-11-30 12:20:00     18.435386   -0.000737
2023-11-30 12:25:00     18.442809    0.000403
2023-11-30 12:30:00     18.439533   -0.000178
...                           ...         ...
2023-11-30 20:05:00     18.436954    0.000358
2023-11-30 20:10:00     18.430534 

In [37]:
# 2.1 Volatility

def EWMA(df, lambda_param = 0.94):

    price_data = df.copy()

    # Calculate EWMA volatility
    price_data['Volatility'] = price_data['Log_Return'].ewm(min_periods=1, alpha=lambda_param, adjust=False).std()

    # Exactly one of com, span, halflife, or alpha must be provided if times is not provided. If times is provided,
    # halflife and one of com, span or alpha may be provided. The choice of which parameter to use 
    # depends on our specific needs and understanding of the time series data. Adjusting these parameters 
    # can impact the sensitivity of the EWMA to recent observations.

    return price_data

calculateEWMA = EWMA(logReturns)
print(calculateEWMA)

calculateEWMA_5 = EWMA(logReturns_5)
print(calculateEWMA_5)


     closed_price   open_time    Log_Return  Volatility
0       18.429395  1701346200           NaN         NaN
1       18.496261  1701346260  3.621644e-03         NaN
2       18.480258  1701346320 -8.655284e-04    0.003173
3       18.480256  1701346380 -1.156380e-07    0.000883
4       18.467012  1701346440 -7.169497e-04    0.000527
..            ...         ...           ...         ...
495     18.429783  1701375900 -1.565267e-03    0.002200
496     18.458654  1701375960  1.565266e-03    0.002159
497     18.465637  1701376020  3.782524e-04    0.000887
498     18.463023  1701376080 -1.415783e-04    0.000464
499     18.472788  1701376140  5.287444e-04    0.000463

[500 rows x 4 columns]
                     closed_price  Log_Return  Volatility
open_time                                                
2023-11-30 12:10:00     18.470636         NaN         NaN
2023-11-30 12:15:00     18.448979   -0.001173         NaN
2023-11-30 12:20:00     18.435386   -0.000737    0.000308
2023-11-30 12:

In [38]:
import plotly.express as px

fig = px.line(calculateEWMA, x='open_time', y='Volatility', title='Volatility Estimation with EWMA',
              labels={'Volatility': 'EWMA Volatility'})

fig.show()

In [39]:
# 2.1 Clustering Analysis

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

def Clustering(df, num_clusters = 3):
    price_data = df.copy()
    price_data = price_data.dropna()
    scaler = StandardScaler()
    price_data['ScaledVolatility'] = scaler.fit_transform(price_data[['Volatility']])


    # Create a feature matrix
    X = price_data[['ScaledVolatility']].values

    # Apply K-means clustering
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    price_data['Cluster'] = kmeans.fit_predict(X)

    return price_data

clusterData = Clustering(calculateEWMA)
print(clusterData)

clusterData_5 = Clustering(calculateEWMA_5)
print(clusterData_5)

     closed_price   open_time    Log_Return  Volatility  ScaledVolatility  \
2       18.480258  1701346320 -8.655284e-04    0.003173          0.927759   
3       18.480256  1701346380 -1.156380e-07    0.000883         -0.607753   
4       18.467012  1701346440 -7.169497e-04    0.000527         -0.846087   
5       18.456018  1701346500 -5.954635e-04    0.000141         -1.104871   
6       18.452670  1701346560 -1.814565e-04    0.000298         -0.999913   
..            ...         ...           ...         ...               ...   
495     18.429783  1701375900 -1.565267e-03    0.002200          0.275370   
496     18.458654  1701375960  1.565266e-03    0.002159          0.247923   
497     18.465637  1701376020  3.782524e-04    0.000887         -0.604644   
498     18.463023  1701376080 -1.415783e-04    0.000464         -0.888564   
499     18.472788  1701376140  5.287444e-04    0.000463         -0.889246   

     Cluster  
2          0  
3          2  
4          2  
5          2  








In [40]:
fig = px.scatter(clusterData, x='open_time', y='closed_price', color='Cluster',
                 title='Clustering Analysis with EWMA Volatility',
                 labels={'Close': 'closed_price', 'Cluster': 'Cluster'},
                 range_x=[clusterData['open_time'].min(), clusterData['open_time'].max()])

fig.show()

In [41]:
fig = px.scatter(clusterData_5, x=clusterData_5.index, y='closed_price', color='Cluster',
                 title='Clustering Analysis with EWMA Volatility',
                 labels={'Close': 'closed_price', 'Cluster': 'Cluster'},
                 range_x=[clusterData['open_time'].min(), clusterData['open_time'].max()])

fig.show()

In [42]:
fig = px.box(clusterData, x='Cluster', y='Volatility',
             title='Box Plots of Volatility in Different Clusters')
fig.show()

In [43]:
fig = px.imshow(clusterData[['Volatility']].transpose(),
                labels={'x': 'open_time', 'y': 'Volatility'},
                title='Volatility Heatmap')
fig.show()

In [44]:
# Based on my extensive research, I understand that high volatility means more significant changes in price, 
# indicating greater investment risk. Therefore, the yellow parts are considered unfavorable. However, they 
# can also reveal underlying causes because prices don't change suddenly for no reason. I still believe that 
# understanding the nature of the data (in this case, the BTC-ETH pair price) has a significant impact on 
# how to interpret these figures, and simply noticing them isn't enough.

In [45]:
import scipy.stats as stats

def summary_statistics(df):
    price_data = df.copy()
    
    log_returns_summary = price_data['Log_Return'].describe()

    # Generate summary statistics for volatility
    volatility_summary = price_data['Volatility'].describe()

    # Additional statistics (skewness and kurtosis)
    log_returns_skewness = stats.skew(price_data['Log_Return'].dropna())
    log_returns_kurtosis = stats.kurtosis(price_data['Log_Return'].dropna())

    volatility_skewness = stats.skew(price_data['Volatility'].dropna())
    volatility_kurtosis = stats.kurtosis(price_data['Volatility'].dropna())

    # Print or use the summary statistics as needed
    print("Summary Statistics for Log Returns:")
    print(log_returns_summary)

    print("\nSkewness for Log Returns:", log_returns_skewness)
    print("Kurtosis for Log Returns:", log_returns_kurtosis)

    print("\n\nSummary Statistics for Volatility:")
    print(volatility_summary)

    print("\nSkewness for Volatility:", volatility_skewness)
    print("Kurtosis for Volatility:", volatility_kurtosis)


summary = summary_statistics(calculateEWMA)

Summary Statistics for Log Returns:
count    499.000000
mean       0.000005
std        0.002053
min       -0.007641
25%       -0.000944
50%        0.000000
75%        0.000998
max        0.007537
Name: Log_Return, dtype: float64

Skewness for Log Returns: -0.043747531270133234
Kurtosis for Log Returns: 1.4430093050383643


Summary Statistics for Volatility:
count    498.000000
mean       0.001789
std        0.001493
min        0.000003
25%        0.000661
50%        0.001507
75%        0.002437
max        0.007542
Name: Volatility, dtype: float64

Skewness for Volatility: 1.291648345067256
Kurtosis for Volatility: 1.6301947071842653


In [46]:
import plotly.graph_objects as go
import numpy as np
import scipy.stats as stats

data = calculateEWMA['Log_Return'].dropna()

# Calculate quantiles
quantiles, theoretical_quantiles = stats.probplot(data, fit=False)

# Create a Q-Q plot using Plotly
fig = go.Figure()

# Scatter plot of quantiles
fig.add_trace(go.Scatter(x=theoretical_quantiles, y=quantiles, mode='markers', name='Q-Q Plot'))

# Add a diagonal line for reference
fig.add_shape(go.layout.Shape(
    type='line',
    x0=np.min(theoretical_quantiles),
    y0=np.min(theoretical_quantiles),
    x1=np.max(theoretical_quantiles),
    y1=np.max(theoretical_quantiles),
    line=dict(color='red', width=2)
))

fig.update_layout(title='Q-Q Plot of Log Returns', xaxis_title='Theoretical Quantiles', yaxis_title='Sample Quantiles')
fig.show()

In [47]:
# Generate a plot from a normally ditributed data, our original data is not normal because it does this weird thing in the middle.

# Generate a sample from a normal distribution
np.random.seed(42)
sample = np.random.normal(size=1000)

# Calculate quantiles
quantiles, theoretical_quantiles = stats.probplot(sample, fit=False)

# Create a Q-Q plot using Plotly
fig = go.Figure()

# Scatter plot of quantiles
fig.add_trace(go.Scatter(x=theoretical_quantiles, y=quantiles, mode='markers', name='Q-Q Plot'))

# Add a diagonal line for reference
fig.add_shape(go.layout.Shape(
    type='line',
    x0=np.min(theoretical_quantiles),
    y0=np.min(theoretical_quantiles),
    x1=np.max(theoretical_quantiles),
    y1=np.max(theoretical_quantiles),
    line=dict(color='red', width=2, dash='dash')
))

fig.update_layout(title='Q-Q Plot for a Normal Distribution', xaxis_title='Theoretical Quantiles', yaxis_title='Sample Quantiles')
fig.show()

In [48]:
import scipy.stats as stats


def is_normal(df):
    price_data = df.copy()
    log_returns = price_data['Log_Return'].dropna()

    # Shapiro-Wilk test for normality
    statistic, p_value = stats.shapiro(log_returns)

    # Print the results
    print(f"Shapiro-Wilk Test Statistic: {statistic}")
    print(f"P-value: {p_value}")

    # Interpret the results
    alpha = 0.05  # significance level
    if p_value > alpha:
       print("The sample looks normally distributed (fail to reject the null hypothesis)")
    else:
        print("The sample does not look normally distributed (reject the null hypothesis)")


isNormal = is_normal(calculateEWMA)
isNormal

Shapiro-Wilk Test Statistic: 0.9533731937408447
P-value: 1.894283688241849e-11
The sample does not look normally distributed (reject the null hypothesis)


In [49]:
# Importance of Normality
# Many models including many risk and return models assume our data is normal for simplicity, 
# so probably if our data is not normal this would be a concern.

# Also, non-normal data usually mean that the unpredictable events are more likely to happen
# rather than the predictable and safe ones.

# So, basically I regret not using the UTSD-TMN data bc it was probably normal :)))

In [50]:
# 2.2 Autocorrelation and Stationary Analysis

import plotly.graph_objects as go
from statsmodels.tsa.stattools import acf, pacf
import pandas as pd

def create_acf_pacf_plots(data, column_name):
    # Calculate ACF and PACF
    acf_values = acf(data[column_name], nlags=20)
    pacf_values = pacf(data[column_name], nlags=20)

    # Create ACF Plot
    acf_plot = go.Figure()
    acf_plot.add_trace(go.Bar(x=list(range(len(acf_values))), y=acf_values, name='ACF'))
    acf_plot.update_layout(title=f'ACF Plot for {column_name}', xaxis_title='Lag', yaxis_title='ACF')
    acf_plot.show()

    # Create PACF Plot
    pacf_plot = go.Figure()
    pacf_plot.add_trace(go.Bar(x=list(range(len(pacf_values))), y=pacf_values, name='PACF'))
    pacf_plot.update_layout(title=f'PACF Plot for {column_name}', xaxis_title='Lag', yaxis_title='PACF')
    pacf_plot.show()


# Create ACF and PACF plots for 'Closed'
create_acf_pacf_plots(calculateEWMA, 'closed_price')

# Create ACF and PACF plots for 'LogReturns'
create_acf_pacf_plots(calculateEWMA, 'Log_Return')

# Create ACF and PACF plots for 'Volatility'
create_acf_pacf_plots(calculateEWMA, 'Volatility')


In [51]:
# The peaks of ACF plots are important especially the first one representing the autocorrelation at lag 1. 
# Peaks in the PACF plot represent significant partial autocorrelations, indicating the direct influence 
# of a specific lag on the current observation. 

In [52]:
import statsmodels.api as sm

def adf_test(df, column_name):

    price_data = df.copy()
    price_data = price_data.dropna()
    result = sm.tsa.adfuller(price_data[column_name], autolag='AIC')
    print(f'ADF Statistic: {result[0]}')
    print(f'p-value: {result[1]}')
    print('Critical Values:')
    for key, value in result[4].items():
        print(f'   {key}: {value}')

    # Interpret the results
    print('\nResults:')
    if result[1] <= 0.05:
        print("The null hypothesis (non-stationary) can be rejected. The time series is likely stationary.")
    else:
        print("Fail to reject the null hypothesis. The time series may not be stationary.")

adf_test(calculateEWMA, 'closed_price')
adf_test(calculateEWMA, 'Log_Return')
adf_test(calculateEWMA, 'Volatility')


ADF Statistic: -3.2619905981609327
p-value: 0.016665375159032034
Critical Values:
   1%: -3.4437936797256317
   5%: -2.867468682890213
   10%: -2.5699277594606915

Results:
The null hypothesis (non-stationary) can be rejected. The time series is likely stationary.
ADF Statistic: -10.477716209230639
p-value: 1.236556875713666e-18
Critical Values:
   1%: -3.443849184997939
   5%: -2.8674931065091105
   10%: -2.569940776113236

Results:
The null hypothesis (non-stationary) can be rejected. The time series is likely stationary.
ADF Statistic: -10.088500953976643
p-value: 1.1385127163361022e-17
Critical Values:
   1%: -3.4436298692815304
   5%: -2.867396599893435
   10%: -2.5698893429241916

Results:
The null hypothesis (non-stationary) can be rejected. The time series is likely stationary.


In [53]:
# Since in the stationary data the statistical traits remai the same throughout the data the correlation 
# between observations at different time lags remains the same. Many models think as time series data as 
# stationary so in order to do some observations we need to change the non-stationary data into stationary.
# In summary Stationary data exhibit well-behaved autocorrelation patterns, while non-stationary data may 
# require transformations to achieve stationarity and simplify the modeling process

In [54]:
import plotly.graph_objects as go

def rolling_cross_correlation_with_lag(df, window_size, lag):
    data = df.copy()
    traces = []

    for col in data.columns:
        if col != 'Log_Return':
            data[f'{"Log_Return"}_{col}_correlation'] = data["Log_Return"].rolling(window=window_size).corr(data[col].shift(lag))

            # Create a scatter plot for each pair
            trace = go.Scatter(
                x=data.index,
                y=data[f'{"Log_Return"}_{col}_correlation'],
                mode='lines',
                name=f'{"Log_Return"} vs {col}'
            )
            traces.append(trace)

    return traces

# Set the rolling window size
window_size = 30

# Set the lag for one-time interval. zero for synchronous Correlation and 1 for Lagged correlation.
lag = 0
cross_correlation_traces = rolling_cross_correlation_with_lag(calculateEWMA, window_size, lag)

# Create layout
layout = go.Layout(
    title=f'Rolling Cross-Correlation with Lag of {lag} Day(s) (Window Size: {window_size} days)',
    xaxis=dict(title='Date'),
    yaxis=dict(title='Cross-Correlation'),
)

# Create figure
fig = go.Figure(data=cross_correlation_traces, layout=layout)

# Show the figure
fig.show()


lag = 1
cross_correlation_traces_1 = rolling_cross_correlation_with_lag(calculateEWMA, window_size, lag)

# Create layout
layout = go.Layout(
    title=f'Rolling Cross-Correlation with Lag of {lag} Day(s) (Window Size: {window_size} days)',
    xaxis=dict(title='Date'),
    yaxis=dict(title='Cross-Correlation'),
)

# Create figure
fig = go.Figure(data=cross_correlation_traces_1, layout=layout)

# Show the figure
fig.show()

In [55]:
import numpy as np

new_df = calculateEWMA.copy()
new_df = new_df.dropna()
correlation_log_returns_prices = np.corrcoef(new_df['Log_Return'], new_df['closed_price'])
correlation_log_returns_times = np.corrcoef(new_df['Log_Return'].values, new_df['open_time'].values)
correlation_log_returns_volatilities = np.corrcoef(new_df['Log_Return'].values, new_df['Volatility'].values)

print(f"Correlation with Prices:\n {correlation_log_returns_prices}")
print(f"Correlation with Times:\n {correlation_log_returns_times}")
print(f"Correlation with Volatilities:\n {correlation_log_returns_volatilities}")

Correlation with Prices:
 [[1.         0.40834946]
 [0.40834946 1.        ]]
Correlation with Times:
 [[ 1.00000000e+00 -7.22112916e-04]
 [-7.22112916e-04  1.00000000e+00]]
Correlation with Volatilities:
 [[ 1.        -0.0040097]
 [-0.0040097  1.       ]]


In [56]:
# Since corelation does not mean causation and I don't know what type of goal are we pursuing here I am gonna leave the 
# code as it is for now.

In [57]:
# Seriously? 4 tasks? :(((
# If someone is reading this, go get hydrated and make your spine happy by walking or doing a speedy yoga, LOL :D

In [58]:
# 3 Cointegration Analysis

# Apparantly If my data is already stationary, it implies that there is no need for cointegration testing 
# because the variables are not exhibiting long-term relationships that need to be adjusted for. and the 
# data provided here is stationary so no need of this task LOL :))))

# if we wanted to analysepercentage chage we could use log-transformed price, also it can help achieve stationarity 
# if the original data is non-stationary.

# Also, both here and in Task 2, I did not understand why I had three prices for one pair and assumed I needed to 
# consider different time windows. Apparently, that was not the case, so I am just going to put the code here.


In [59]:
import pandas as pd
from statsmodels.tsa.stattools import adfuller
import plotly.graph_objects as go

# The choice of cointegration analysis method depends on the nature of the data, number of time series involved
# and our goal. For now the introduced ADF sounds like an okay test.

result_pairwise_1_2 = adfuller(calculateEWMA['closed_price'].head(100).values - calculateEWMA_5['closed_price'].values)
print('\nADF Statistic for Residuals (Price1 - Price2):', result_pairwise_1_2[0])
print('p-value for Residuals (Price1 - Price2):', result_pairwise_1_2[1])
print('Critical Values for Residuals (Price1 - Price2):', result_pairwise_1_2[4])




ADF Statistic for Residuals (Price1 - Price2): -2.9560087625902676
p-value for Residuals (Price1 - Price2): 0.03921189214204608
Critical Values for Residuals (Price1 - Price2): {'1%': -3.4989097606014496, '5%': -2.891516256916761, '10%': -2.5827604414827157}


In [60]:
import pandas as pd
from statsmodels.tsa.stattools import adfuller

# Assuming you have a DataFrame df with your time series data
# Load your data or replace this with your actual DataFrame
# df = pd.read_csv('your_data.csv', parse_dates=True, index_col='Date')

# Ensure that the DataFrame has a datetime index
df = calculateEWMA.copy()
df = df.dropna()
df.index = pd.to_datetime(df.index)

# Define the frequency of segmentation (e.g., 'M' for monthly)
segment_frequency = 'M'

# Split the dataset into monthly segments
monthly_segments = [group for _, group in df.groupby(pd.Grouper(freq=segment_frequency))]

# Perform cointegration tests on each segment
for i, segment in enumerate(monthly_segments):
    print(f"Segment {i + 1}:")
    
  
    
    # Perform Johansen cointegration test
    result = adfuller(segment['closed_price'])
    # result2 = adfuller(calculateEWMA['closed_price'])
    print('\nADF Statistic:', result[0])
    print('p-value:', result[1])
    print('Critical Values:', result[4])
    print("\n")


Segment 1:

ADF Statistic: -3.2619905981609327
p-value: 0.016665375159032034
Critical Values: {'1%': -3.4437936797256317, '5%': -2.867468682890213, '10%': -2.5699277594606915}




In [61]:
from statsmodels.tsa.stattools import adfuller
import statsmodels.api as sm


# Ensure that the DataFrame has a datetime index
df = calculateEWMA.copy()
df.index = pd.to_datetime(df.index)

# Define the frequency of segmentation (e.g., 'M' for monthly)
segment_frequency = 'M'

# Split the dataset into monthly segments
monthly_segments = [group for _, group in df.groupby(pd.Grouper(freq=segment_frequency))]

# Function to perform Engle-Granger Two-Step Cointegration Test
def engle_granger_cointegration_test(segment):
    # Take the first difference
    stationary_segment = segment.diff().dropna()

    # Regress one time series on the other and check if the residuals are stationary
    y = stationary_segment[stationary_segment.columns[0]]
    x = stationary_segment[stationary_segment.columns[1]]
    x = sm.add_constant(x)  # Add a constant term for the intercept
    model = sm.OLS(y, x)
    results = model.fit()
    residuals = results.resid

    result_residual = adfuller(residuals)
    print('ADF Test for Residuals:')
    print(f'ADF Statistic: {result_residual[0]}')
    print(f'p-value: {result_residual[1]}')
    print(f'Critical Values: {result_residual[4]}')
    print("\n")

# Perform Engle-Granger Two-Step Cointegration Test for each monthly segment
for i, segment in enumerate(monthly_segments):
    print(f"Segment {i + 1}:")
    engle_granger_cointegration_test(segment)


Segment 1:
ADF Test for Residuals:
ADF Statistic: -10.55430629725934
p-value: 8.025876302095002e-19
Critical Values: {'1%': -3.4438771098680196, '5%': -2.867505393939065, '10%': -2.569947324764179}




In [75]:
import pandas as pd
from statsmodels.tsa.stattools import coint


# Assuming your DataFrame has a 'Date' column, make sure it's in datetime format
df = calculateEWMA.copy()
df['open_time'] = pd.to_datetime(df['open_time'])

# Set the 'Date' column as the index
df.set_index('open_time', inplace=True)

monthly_segments = df.resample('M')


# Now, you can iterate over the monthly groups or access a specific month
if len(monthly_segments) >= 2:
    consecutive_month_pairs = zip(monthly_segments.resample('M'), monthly_segments.resample('M')[1:])
    for (name1, group1), (name2, group2) in consecutive_month_pairs:
        print(f"Pair: {name1} and {name2}")
        # Your further analysis code here
else:
    print("Not enough data for consecutive months.")
    
# Perform cointegration analysis for each pair of consecutive months
for (name1, group1), (name2, group2) in consecutive_month_pairs:
    # Assuming 'Series1' and 'Series2' are the columns you want to test for cointegration
    result = coint(group1['Series1'], group1['Series2'])

    # Extract cointegration coefficients
    alpha, beta = result[:2]

    # Print the results
    print(f"Cointegration analysis between {name1} and {name2}")
    print(f"Alpha (intercept): {alpha}")
    print(f"Beta (coefficient): {beta}")

    # Add a line break for better readability
    print('-' * 40)

Not enough data for consecutive months.


NameError: name 'consecutive_month_pairs' is not defined

In [None]:
# 4 Error Correction Model
# Again, since I don't have enough data I just provide the code and hope it works :)))
# Also my data is stationary so this ECM isn't even neccessary but anyways.

import numpy as np

def estimate_ecm_parameters(Y, X, lag=1):
    """
    Estimate ECM parameters for a cointegrated pair (Y, X).

    Parameters:
    - Y: Dependent variable (numpy array)
    - X: Independent variable (numpy array)
    - lag: Number of lags for the differences (default is 1)

    Returns:
    - alpha: Adjustment coefficient
    - beta: Cointegration coefficient
    - gamma: Coefficients of lagged differences
    """

    # Take differences
    diff_Y = np.diff(Y, lag)
    diff_X = np.diff(X, lag)

    # Lagged differences
    lag_diff_Y = diff_Y[:-lag]
    lag_diff_X = diff_X[:-lag]

    # Stack lagged differences
    lagged_diffs = np.column_stack((lag_diff_Y, lag_diff_X))

    # Add a constant for intercept
    lagged_diffs_with_const = np.column_stack((np.ones_like(lag_diff_Y), lagged_diffs))

    # Estimate ECM parameters using OLS
    params = np.linalg.lstsq(lagged_diffs_with_const, diff_Y[lag:], rcond=None)[0]

    # Extract parameters
    alpha = params[0]
    beta = params[1]
    gamma = params[2:]

    return alpha, beta, gamma

def error_correction_model(Y, X, alpha, beta, gamma):
    """
    Implement the Error Correction Model.

    Parameters:
    - Y: Dependent variable (numpy array)
    - X: Independent variable (numpy array)
    - alpha: Adjustment coefficient
    - beta: Cointegration coefficient
    - gamma: Coefficients of lagged differences

    Returns:
    - Residuals: Model residuals
    """

    # Take differences
    diff_Y = np.diff(Y)
    diff_X = np.diff(X)

    # Construct lagged differences
    lag_diff_Y = np.roll(diff_Y, shift=1)
    lag_diff_X = np.roll(diff_X, shift=1)

    # First differences for the lagged differences
    diff_lag_diff_Y = np.diff(lag_diff_Y)
    diff_lag_diff_X = np.diff(lag_diff_X)

    # Calculate the error correction term
    ecm_term = alpha * (Y[:-1] - beta * X[:-1])

    # Calculate the predicted differences using the estimated coefficients
    predicted_diff_Y = alpha + beta * X[:-1] + gamma[0] * lag_diff_Y + gamma[1] * lag_diff_X

    # Model residuals
    residuals = diff_Y[1:] - predicted_diff_Y - ecm_term[1:] + diff_lag_diff_Y + gamma[2] * diff_lag_diff_X

    return residuals

# Example usage:
# Assuming Y and X are numpy arrays representing time series data
# Y = ...
# X = ...

# Estimate ECM parameters
alpha_hat, beta_hat, gamma_hat = estimate_ecm_parameters(Y, X)

# Implement ECM
residuals = error_correction_model(Y, X, alpha_hat, beta_hat, gamma_hat)


In [None]:
# Thank you for the insightful entry task! It was a great learning experience, and I'm excited about the 
# possibility of contributing to financial data analysis in this company. While I may not know everything, 
# I'm eager to learn and be part of your dynamic team at Hermes Capital. Looking forward to the next steps!