In [238]:
import pandas as pd
from scipy.spatial.distance import cosine
import numpy as np
from similarity_data import countries
from similarity_data import clusters

In [239]:
def compute_similarity(df1, df2, label):
    """Cosine similarity of two frames' traffic rates for one label.

    Both frames are restricted to rows whose ``general_label`` equals
    ``label``. The ``traffic_rate`` values are then aligned on ``date`` and
    compared only over the dates present in both frames.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames with at least the columns ``general_label``, ``date`` and
        ``traffic_rate``.
    label : object
        Value of ``general_label`` to compare.

    Returns
    -------
    dict or None
        ``None`` when fewer than two dates are shared. Otherwise a dict with
        ``'traffic_rate_similarity'`` (``1 - cosine distance`` of the aligned
        vectors) and ``'common_days'`` (number of shared dates).

    Notes
    -----
    If a date occurs more than once for ``label`` in a frame, its rates are
    averaged so that each date contributes exactly one value. Pairing rows by
    position instead would either raise on unequal lengths or silently compare
    values that belong to different dates.
    The cosine is undefined when an aligned vector is all zeros; whatever
    scipy returns in that case is passed through unchanged.
    """

    def _rate_per_date(df):
        # One traffic_rate per date for the requested label (mean of duplicates).
        subset = df[df['general_label'] == label]
        return subset.groupby('date')['traffic_rate'].mean()

    rates1 = _rate_per_date(df1)
    rates2 = _rate_per_date(df2)

    # Dates present in both frames, in ascending order.
    common_dates = rates1.index.intersection(rates2.index).sort_values()

    # A similarity over fewer than two points is not meaningful.
    if len(common_dates) < 2:
        return None

    # Selecting by the same date index guarantees element i of both vectors
    # refers to the same day.
    traffic_rate_similarity = 1 - cosine(
        rates1.loc[common_dates].to_numpy(),
        rates2.loc[common_dates].to_numpy(),
    )

    return {
        'traffic_rate_similarity': traffic_rate_similarity,
        'common_days': len(common_dates)
    }


In [240]:

# NOTE(review): the two lines below are scratch notes kept from the original
# cell; presumably index combinations that were tried — confirm or remove.
#0 0 1
#1 2 2
# --- Selection: topic cluster, region, and the two countries to compare ---
topic = clusters[6]
region = 'west_asia'
# NOTE(review): `africa` holds the country list of whatever `region` is set to
# (currently 'west_asia'); the name is kept so other cells that use it still work.
africa = countries[region]
country1, country2 = africa[1], africa[2]

# --- Input files: one CSV per country under the region/topic directory ---
labeled_data_dir = f'./output/regions/{region}/genral_labeled_data_with_relative_traffic_rates/{topic}'
file1 = f'{labeled_data_dir}/{country1}_with_relative_traffic_rates.csv'
file2 = f'{labeled_data_dir}/{country2}_with_relative_traffic_rates.csv'

df1, df2 = pd.read_csv(file1), pd.read_csv(file2)

# Labels are taken from the first country only; a label missing from the
# second country yields no result from compute_similarity and is skipped.
categories = df1['general_label'].unique()

# --- Per-label similarity, keeping only labels that produced a result ---
results = {}
for category in categories:
    similarity = compute_similarity(df1, df2, category)
    if not similarity:
        continue
    results[category] = similarity

# --- Report ---
for category, result in results.items():
    print(f"Category: {category}")
    print(f"copmared: {country1}-{country2}")
    print(f"Traffic Rate Similarity: {result['traffic_rate_similarity'] * 100:.2f}%")

Category: Intellectualism
copmared: Philippines-Singapore
Traffic Rate Similarity: 49.23%
