In [4]:
import matplotlib.pyplot as plt
import numpy as np


# Three equal-length toy series: two bumpy profiles and one flat line at 1.
s1 = np.array([0, 0, 1, 2, 1, 0, 1, 0, 0, 2, 1, 0, 0], dtype=float)
s2 = np.array([0, 1, 2, 3, 1, 0, 0, 0, 2, 1, 0, 0, 0], dtype=float)
s3 = np.array([1] * 13, dtype=float)

# Single-axes figure; every series is drawn against its positional index.
fig, ax = plt.subplots()
for series_label, series_values in (("s1", s1), ("s2", s2), ("s3", s3)):
    ax.plot(series_values, label=series_label)

# Kept as the cell's last expression so the notebook still echoes the Legend object.
ax.legend()

In [1]:
import numpy as np
import pandas as pd
import sys
sys.path.append('src')
from dtaidistance import dtw
from helper.time_series import convert_to_time_series_df
from cluster.methods import get_dtw_dist, get_dtw_clusters

# Toy inputs for the clustering smoke test: two bumpy series and a constant one.
s1 = np.array([0, 0, 1, 2, 1, 0, 1, 0, 0, 2, 1, 0, 0], dtype=float)
s2 = np.array([0, 1, 2, 3, 1, 0, 0, 0, 2, 1, 0, 0, 0], dtype=float)
s3 = np.array([1] * 13, dtype=float)

# Only s1 and s2 are echoed here; s3 is defined but not printed in this cell.
print("Time Series:")
for series_label, series_values in (("s1", s1), ("s2", s2)):
    print(f"{series_label}: {series_values}")

# Distance between s1 and s2 taken straight from dtaidistance, before any project helper is involved.
basic_distance = dtw.distance(s1, s2)
print("\nBasic DTW distance: {}".format(basic_distance))

# Build a long-format test frame: one row per (product, week), 13 weekly rows per product.
dates = pd.date_range('2024-01-07', periods=13, freq='W-SUN')

# (PRODUCT_ID, REGION, CLUSTER, weekly quantities) for each synthetic product.
product_specs = [
    (1001, 'REGION_1', 1, s1),
    (1002, 'REGION_2', 2, s2),
    (1003, 'REGION_3', 3, s3),
]

# WK_OF_YEAR is simply the 1-based position of the row within its series.
dummy_data = [
    {
        'PRODUCT_ID': product_id,
        'REGION': region,
        'CLUSTER': cluster,
        'WEEK_START': week_start,
        'QTY': quantity,
        'WK_OF_YEAR': week_number,
    }
    for product_id, region, cluster, quantities in product_specs
    for week_number, (week_start, quantity) in enumerate(zip(dates, quantities), start=1)
]

dummy_df = pd.DataFrame(dummy_data)
print(f"\nDummy DataFrame shape: {dummy_df.shape}")

# Reshape the long frame with the project helper; only its first return value is used here.
ml_ts, *_ = convert_to_time_series_df(dummy_df)
print("Time series matrix shape: {}".format(ml_ts.shape))
print("Columns: {}".format(ml_ts.columns.tolist()))

# Settings handed to the project's DTW helpers.
# NOTE(review): the meaning of each key is defined in cluster.methods, not visible from this cell.
dtw_params = dict(window=4, max_it=10, max_dba_it=10, use_parallel=False)

dtw_distance_matrix = get_dtw_dist(ml_ts, dtw_params)
print("\nDTW Distance Matrix:", dtw_distance_matrix, sep="\n")

# Ask for two clusters; the second return value is kept but not used further in this cell.
cluster_result, params = get_dtw_clusters(ml_ts=ml_ts, number_of_components=2, **dtw_params)

print("\nClustering Result:")
for label, assigned in cluster_result.items():
    print(f"Cluster {label}: {assigned}")

In [3]:
# Run the following code in a Jupyter notebook:

import numpy as np
import pandas as pd
import sys
sys.path.append('src')
from dtaidistance import dtw
from cluster.methods import get_dtw_dist, get_dtw_clusters, silhouette_coefficient

# Toy inputs: two bumpy series and one constant series, 13 points each.
s1 = np.array([0, 0, 1, 2, 1, 0, 1, 0, 0, 2, 1, 0, 0], dtype=float)  # Series 1
s2 = np.array([0, 1, 2, 3, 1, 0, 0, 0, 2, 1, 0, 0, 0], dtype=float)  # Series 2
s3 = np.array([1] * 13, dtype=float)                                 # Constant series

print("Time Series:")
for series_label, series_values in (("s1", s1), ("s2", s2), ("s3", s3)):
    print(f"{series_label}: {series_values}")

# 1. Reference DTW distance between s1 and s2, computed directly with dtaidistance.
#    It is printed again in the comparison section next to the matrix entries.
basic_distance = dtw.distance(s1, s2)
print("\nBasic DTW distance: {}".format(basic_distance))

# 2. Build a long-format frame using the ad-data column names:
#    one row per (account, campaign, day), 13 daily rows per campaign.
dates = pd.date_range('2024-01-01', periods=13, freq='D')  # Daily dates

# (account_name, campaign, metric values) for each synthetic campaign.
campaign_specs = [
    ('TestAccount1', 'TestCampaign1', s1),
    ('TestAccount1', 'TestCampaign2', s2),
    ('TestAccount2', 'TestCampaign1', s3),
]

dummy_data = [
    {
        'account_name': account_name,
        'campaign': campaign_name,
        'date': day,
        'conversions_rolling_5d_zscore': metric_value,
    }
    for account_name, campaign_name, metric_values in campaign_specs
    for day, metric_value in zip(dates, metric_values)
]

dummy_df = pd.DataFrame(dummy_data)
print(f"\nDummy DataFrame shape: {dummy_df.shape}")
print(f"Columns: {dummy_df.columns.tolist()}")

# 3. Pivot the long frame into a wide matrix: rows are dates, columns are
#    (account_name, campaign) pairs, cells hold the mean of the target metric.
TARGET_METRIC = "conversions_rolling_5d_zscore"
ID_COLUMNS = ['account_name', 'campaign']
DATE_COLUMN = "date"

# The original author notes this pivot mirrors the main code path.
ml_ts = dummy_df.pivot_table(
    index=DATE_COLUMN,
    columns=ID_COLUMNS,
    values=TARGET_METRIC,
    aggfunc='mean',
    fill_value=0,
)

# Collapse multi-level column labels into single strings joined by "_",
# skipping any level whose string form is empty.
if isinstance(ml_ts.columns, pd.MultiIndex):
    ml_ts.columns = [
        '_'.join(filter(None, map(str, level_values)))
        for level_values in ml_ts.columns
    ]

# Any remaining gaps are replaced by 0 before the matrix goes to the DTW helpers.
ml_ts = ml_ts.fillna(0)

print("Time series matrix shape: {}".format(ml_ts.shape))
print("Time series columns: {}".format(ml_ts.columns.tolist()))

# 4. Exercise the project's DTW helpers with the current parameter set.
# NOTE(review): the meaning of each key is defined in cluster.methods, not visible from this cell.
dtw_params = dict(window=4, max_it=10, max_dba_it=10, use_parallel=False)

# Distance matrix over the columns of ml_ts, printed as returned by the helper.
dtw_distance_matrix = get_dtw_dist(ml_ts, dtw_params)
print("\nDTW Distance Matrix:", dtw_distance_matrix, sep="\n")

# 5. Run the clustering once per candidate k and keep every (partition, params) pair.
print("\n--- Clustering Analysis ---")

# Candidate k values: 2 and 3, capped at the number of series (columns) available.
series_count = ml_ts.shape[1]
k_range = range(2, min(4, series_count + 1))
ts_names = ml_ts.columns.tolist()

cluster_results = []
for k in k_range:
    cluster_result, params = get_dtw_clusters(ml_ts=ml_ts, number_of_components=k, **dtw_params)
    cluster_results.append((cluster_result, params))

    # Cluster members come back as positions, so map them to column names for display.
    print(f"\nClustering with k={k}:")
    for cluster_id, member_indices in cluster_result.items():
        member_names = [ts_names[idx] for idx in member_indices]
        print(f"  Cluster {cluster_id}: {member_names}")

# 6. Silhouette analysis across the candidate partitions (needs at least two of them).
# NOTE(review): with three series the k=3 partition puts every series in its own cluster;
# sklearn-style silhouette scoring may reject that case. Confirm how the helper handles it.
if len(cluster_results) >= 2:
    print("\n--- Silhouette Analysis ---")
    silhouette_outcome = silhouette_coefficient(
        cluster_list=cluster_results,
        ml_ts=ml_ts,
        dtw_clusters_params=dtw_params,
        method="sklearn",
    )
    s_coeff, opt_numb_of_comp, opt_partition, s_scores = silhouette_outcome

    print("Silhouette scores for each k: {}".format(s_scores))
    print("Optimal number of clusters: {}".format(opt_numb_of_comp))
    print(f"Best silhouette coefficient: {s_coeff:.3f}")

    # Report the winning partition by column name instead of by position.
    print("\nOptimal Partition:")
    ts_names = ml_ts.columns.tolist()
    for cluster_id, member_indices in opt_partition.items():
        member_names = [ts_names[idx] for idx in member_indices]
        print(f"  Cluster {cluster_id}: {member_names}")

# 7. Put the direct dtaidistance result next to the upper-triangle entries of the helper's matrix.
print("\n--- DTW Distance Comparison ---")
print(f"Direct DTW distance (s1 vs s2): {basic_distance:.3f}")
for row_idx, col_idx in ((0, 1), (0, 2), (1, 2)):
    print(f"DTW matrix (series {row_idx} vs {col_idx}): {dtw_distance_matrix[row_idx][col_idx]:.3f}")

# 8. Visualize the time series
# 2x2 dashboard: overlay of all series, DTW distance heatmap, detail view of the
# first series, and the silhouette scores per candidate k.
import matplotlib.pyplot as plt

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
ts_names = ml_ts.columns.tolist()

# Plot 1: All time series together.
# Only three line styles are defined; zip() stops at the shorter input, so a matrix
# with fewer than three columns is plotted without raising IndexError.
overview_ax = axes[0, 0]
line_styles = [('b-', 'o'), ('r-', 's'), ('g-', '^')]
for col_idx, (series_name, (fmt, marker)) in enumerate(zip(ts_names, line_styles)):
    overview_ax.plot(ml_ts.index, ml_ts.iloc[:, col_idx], fmt, label=series_name, marker=marker)
overview_ax.set_title("All Test Time Series")
overview_ax.legend()
overview_ax.grid(True)
overview_ax.tick_params(axis='x', rotation=45)

# Plot 2: DTW Distance Matrix Heatmap, ticks labelled TS0..TSn-1 by column position.
heatmap_ax = axes[0, 1]
im = heatmap_ax.imshow(dtw_distance_matrix, cmap='viridis', aspect='auto')
heatmap_ax.set_title("DTW Distance Matrix")
tick_positions = list(range(len(ts_names)))
tick_labels = [f"TS{pos}" for pos in tick_positions]
heatmap_ax.set_xticks(tick_positions)
heatmap_ax.set_yticks(tick_positions)
heatmap_ax.set_xticklabels(tick_labels)
heatmap_ax.set_yticklabels(tick_labels)
plt.colorbar(im, ax=heatmap_ax)

# Write each distance into its cell (x is the column index, y is the row index).
for i in tick_positions:
    for j in tick_positions:
        heatmap_ax.text(j, i, f'{dtw_distance_matrix[i][j]:.1f}',
                        ha='center', va='center', color='white')

# Plot 3: Detail view of the first time series.
# Axes objects have no `subplot` method, so the earlier per-series loop that called
# `axes[1, 0].subplot(...)` raised AttributeError and stopped the cell before this
# panel, the silhouette panel and plt.show() could run. The loop (and its unused
# colour list) is dropped; the panel shows the first series only.
detail_ax = axes[1, 0]
detail_ax.plot(ml_ts.index, ml_ts.iloc[:, 0], 'b-', marker='o', linewidth=2)
detail_ax.set_title(f"Time Series: {ts_names[0]}")
detail_ax.grid(True, alpha=0.3)

# Plot 4: Silhouette scores if available.
# `s_scores` / `opt_numb_of_comp` only exist when the silhouette section ran, hence the
# name lookups at cell (module) scope.
score_ax = axes[1, 1]
if len(cluster_results) > 1 and 's_scores' in locals():
    bar_positions = list(range(len(s_scores)))
    score_ax.bar(bar_positions, s_scores, color='skyblue', edgecolor='navy')
    score_ax.set_xlabel('Number of Clusters')
    score_ax.set_ylabel('Silhouette Score')
    score_ax.set_title('Silhouette Analysis')
    score_ax.set_xticks(bar_positions)
    score_ax.set_xticklabels([str(k) for k in k_range])
    score_ax.grid(True, alpha=0.3)

    # Mark optimal: bar index is the offset of the optimal k from the smallest tested k.
    if 'opt_numb_of_comp' in locals():
        opt_idx = opt_numb_of_comp - min(k_range)
        if 0 <= opt_idx < len(s_scores):
            score_ax.bar(opt_idx, s_scores[opt_idx], color='orange', edgecolor='red',
                         label=f'Optimal (k={opt_numb_of_comp})')
            score_ax.legend()
else:
    score_ax.text(0.5, 0.5, 'Not enough data\nfor silhouette analysis',
                  ha='center', va='center', transform=score_ax.transAxes)
    score_ax.set_title('Silhouette Analysis')

plt.tight_layout()
plt.show()

# 9. Summary statistics: overall shape and date span, then mean/std/min/max per column.
print("\n--- Summary Statistics ---")
print("Time series shape: {}".format(ml_ts.shape))
print("Date range: {} to {}".format(ml_ts.index.min(), ml_ts.index.max()))
print("Time series statistics:")
for position, column_name in enumerate(ml_ts.columns):
    column_values = ml_ts.iloc[:, position]
    print(f"  {column_name}:")
    # Prefixes carry their own padding so the numbers line up in the output.
    labelled_stats = (
        ("Mean: ", column_values.mean()),
        ("Std:  ", column_values.std()),
        ("Min:  ", column_values.min()),
        ("Max:  ", column_values.max()),
    )
    for prefix, stat_value in labelled_stats:
        print(f"    {prefix}{stat_value:.2f}")

