In [15]:
import pandas as pd
import numpy as np

In [16]:
df = pd.read_csv('./extracted/data/q3/q3_data.csv')
df.rename(columns={'Unnamed: 0':'Date'},inplace=True)
df.set_index('Date', inplace=True)
df.index = pd.to_datetime(df.index)


In [17]:
df['A-B'] = df['A']- df['B'] #tracking the A - B because that tracks convergence of A and B in a single time series... just makes it easier to read and do functions 

In [18]:
# Convergent flag
##  zscore: used as a basis to see how far it diverges from the mean -> (x - mu)/std
## decided +- 0.5 z score as it shoudl include most fluctuations fo the time series
mean = df['A-B'].mean()
df['std'] = df['A-B'].std()
df['z_score'] = (df['A-B'] - mean) / df['std'] 
df['convergent'] = df['z_score'].between(-0.5, 0.5).astype(int)

# starts/ends dates using shift logic
sh = df['convergent'].shift(1, fill_value=0)
df['Start'] = ((df['convergent'] == 1) & (sh == 0)).astype(int)
df['End']   = ((df['convergent'] == 0) & (sh == 1)).astype(int)

# Extract the dates where start is 1 and end is 1
starts = df.index[df['Start'].eq(1)]
ends   = df.index[df['End'].eq(1)]

# edge case if it starts and does not end, just give last date in df frame
if len(ends) < len(starts):
    ends = ends.append(pd.Index([df.index[-1]]))

# building res dataframe with answers
convergence_df = pd.DataFrame({'Start': starts[:len(ends)], 'End': ends}).reset_index(drop=True)

# this is for 4.2 for average days
convergence_df['Days'] = (convergence_df['End'] - convergence_df['Start']).dt.days
average_days = convergence_df['Days'].mean()

In [19]:
# iterate through each row and find the maximum abs(z_score)
for index, row in convergence_df.iterrows():
    start = row['Start']
    end = row['End']
    max_abs_z = df.loc[start:end, 'z_score'].abs().max()
    convergence_df.loc[index,'Greatest Distance'] = max_abs_z

In [20]:
# iterate through A and B and find greatest volatility 
# I used this method rather then max - min, because I want total total movement rather then one spike
for index, row in convergence_df.iterrows():
    start = row['Start']
    end = row['End']
    A_seg = df.loc[start:end, 'A']
    B_seg = df.loc[start:end, 'B']
    volA = A_seg.pct_change().std()
    volB = B_seg.pct_change().std()
    convergence_df.loc[index,'Driving'] = 'A' if volA > volB else 'B' if volA < volB else 'Both'


# Question 1,3,4 answer is below
*Variable is named convergence_df, whith all convergent series (zscore in range +-0.5), gretest distance of a and b in each series, and lastly which series is driving convergence*

In [23]:
convergence_period = convergence_df[['Start', 'End']].values
average_convergence_eriod = average_days.round(2)
greates_distance_per_period = convergence_df['Greatest Distance'].values
who_is_driving = convergence_df['Driving'].values

# Question 2 answer below
*Answer is rounded to 2 digits*

In [22]:
average_days.round(2)

np.float64(6.68)

In [24]:
convergence_df

Unnamed: 0,Start,End,Days,Greatest Distance,Driving
0,2001-01-17,2001-01-22,5,0.858032,B
1,2001-01-23,2001-01-31,8,0.790701,B
2,2001-02-05,2001-02-14,9,0.813886,B
3,2001-02-21,2001-03-01,8,0.607836,B
4,2001-04-03,2001-04-05,2,0.581547,A
...,...,...,...,...,...
612,2025-07-28,2025-07-29,1,1.071349,Both
613,2025-07-31,2025-08-01,1,1.289051,Both
614,2025-08-15,2025-08-20,5,0.798927,A
615,2025-10-27,2025-10-29,2,0.603078,A
