In [210]:
import pandas as pd

In [211]:
def clean_traffic(value):
    """Convert a formatted traffic figure such as ``"200,000+"`` to an ``int``.

    Parameters
    ----------
    value : str or number
        Traffic figure. For strings, every ``,`` and ``+`` is removed and
        surrounding whitespace is stripped before conversion. Values that are
        not strings are passed straight to ``int()``.

    Returns
    -------
    int
        The traffic figure as an integer.

    Raises
    ------
    ValueError
        If the cleaned text is not a valid integer literal, or if a float
        NaN is passed.
    """
    if not isinstance(value, str):
        # Cells may already be numeric (e.g. a CSV column that pandas inferred
        # as int/float); those objects have no .replace() method.
        return int(value)
    return int(value.replace(",", "").replace("+", "").strip())

def calculate_traffic_rate(value, max):
    """Map the ratio ``value / max`` onto one of five discrete rate levels.

    Parameters
    ----------
    value : number
        Traffic amount to rate.
    max : number
        Reference maximum that ``value`` is divided by.

    Returns
    -------
    float
        ``0.0`` when ``max`` is zero or the ratio is at most 1e-9;
        ``0.1`` for ratios below 0.25, ``0.25`` below 0.5, ``0.5`` below 0.75
        (each bound lowered by 1e-9); ``1.0`` for everything else.
    """
    # Guard against division by zero.
    if max == 0:
        return 0.0

    tolerance = 1e-9
    ratio = float(value / max)

    if ratio <= 0 + tolerance:
        return 0.0

    # (exclusive upper bound, level returned below that bound), ascending.
    for upper_bound, level in ((0.25, 0.1), (0.5, 1/4), (0.75, 1/2)):
        if ratio < upper_bound - tolerance:
            return level

    return 1.0

# Category names; one of these is matched against the `general_label` column
# when a single category is filtered out.
clusters = [
    'Economy',
    'Technology and Science',
    'Entertainment',
    'Lifestyle',
    'Accident',
    'Geopolitical',
    'Intellectualism',
]

# Region key -> country names available for that region.
countries = {
    'africa': ['Kenya', 'Nigeria', 'SouthAfrica'],
    'europe': ['Denmark', 'UK', 'Finland'],
    'north_america_australia': ['Australia', 'Canada', 'USA'],
    'west_asia': ['Malaysia', 'Philippines', 'Singapore'],
}

In [212]:
# Region and country to process. `region` must be a key of `countries`; the
# index selects one country from that region's list.
region = 'north_america_australia'
country = countries[region][2]

#--------general labels--------#
# The input path is built from `region`, so the country lookup and the folder
# being read cannot drift apart.
# NOTE(review): earlier commented-out variants read the other regions (africa,
# europe, west_asia) from './classified_data/regions/<region>/...' rather than
# './output/regions/<region>/...' — confirm the base directory before
# switching `region`.
path = f'./output/regions/{region}/clustered_classified_data/{country}_clustered_classified.csv'
data = pd.read_csv(path)

In [213]:
# Build a gap-free daily traffic series for one category and export it to CSV.
# Requires `data`, `country`, `clusters`, `clean_traffic` and
# `calculate_traffic_rate` to be defined by earlier cells.

# --- Data preparation -------------------------------------------------------
df = data[['date', 'traffic', 'general_label']].copy()

# Convert date column to datetime
df['date'] = pd.to_datetime(df['date'])

# Parse the formatted traffic text into integers
df['traffic_numeric'] = df['traffic'].apply(clean_traffic)

# Per-row rate, relative to the largest single traffic value in the file
maxTraffic = df['traffic_numeric'].max()
df['traffic_rate'] = df['traffic_numeric'].apply(lambda x: calculate_traffic_rate(x, maxTraffic))
print("First max traffic:", maxTraffic)

# --- Aggregate by date, label and per-row rate bucket -----------------------
# Because the per-row rate bucket is part of the key, one date can yield
# several rows per label; those are merged again further down.
category_time_distribution = (
    df.groupby(['date', 'general_label', 'traffic_rate'])['traffic_numeric']
    .sum()
    .reset_index(name='total_traffic')
)

# --- Filter for a specific category -----------------------------------------
specific_category = clusters[1]  # Ensure clusters is defined
# .copy() makes the filtered rows an independent frame, so the column
# assignment below no longer triggers pandas' SettingWithCopyWarning.
specific_category_data = category_time_distribution[
    category_time_distribution['general_label'] == specific_category
].copy()

# Recalculate the rate relative to the largest aggregated value in this category
maxTraffic = specific_category_data['total_traffic'].max()
print("Second max traffic:", maxTraffic)
specific_category_data['traffic_rate'] = specific_category_data['total_traffic'].apply(
    lambda x: calculate_traffic_rate(x, maxTraffic)
)

# --- Collapse to one row per date -------------------------------------------
# NOTE(review): the daily rate is the MEAN of the bucketed rates of that day's
# rows, not the bucket of the summed daily total, and the maximum above is
# taken before this merge. Confirm this is the intended definition.
if specific_category_data['date'].duplicated().any():
    print("Duplicates found in 'date'. Aggregating data.")
    specific_category_data = (
        specific_category_data.groupby(['date', 'general_label'], as_index=False)
        .agg({'traffic_rate': 'mean', 'total_traffic': 'sum'})  # Adjust aggregation as needed
    )

# 'date' is unique at this point, which reindex() requires
specific_category_data = specific_category_data.set_index('date')

# --- Expand to the full reporting window ------------------------------------
# NOTE(review): rows dated outside this fixed window are dropped by reindex()
# without any message — confirm the window covers the whole dataset.
start_date = pd.to_datetime('2016-11-28')
end_date = pd.to_datetime('2017-05-04')
date_index = pd.date_range(start_date, end_date)

specific_category_data = (
    specific_category_data
    .reindex(date_index)          # one row per calendar day; missing days become NaN
    .fillna({
        'general_label': specific_category,
        'traffic_rate': 0,
        'total_traffic': 0
    })
    .rename_axis('date')          # name the index so reset_index() yields a 'date' column
    .reset_index()
)

# --- Export -----------------------------------------------------------------
specific_category_data.to_csv(f'./output/regions/north_america_australia/genral_labeled_data_with_relative_traffic_rates/{country}_with_relative_traffic_rates.csv', index=False)

specific_category_data



First max traffic: 10000000
Second max traffic: 5000000
Duplicates found in 'date'. Aggregating data.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  specific_category_data['traffic_rate'] = specific_category_data['total_traffic'].apply(


Unnamed: 0,date,general_label,traffic_rate,total_traffic
0,2016-11-28,Technology and Science,0.1,300000.0
1,2016-11-29,Technology and Science,0.1,600000.0
2,2016-11-30,Technology and Science,0.0,0.0
3,2016-12-01,Technology and Science,0.0,0.0
4,2016-12-02,Technology and Science,0.0,0.0
...,...,...,...,...
153,2017-04-30,Technology and Science,0.0,0.0
154,2017-05-01,Technology and Science,0.0,0.0
155,2017-05-02,Technology and Science,0.0,0.0
156,2017-05-03,Technology and Science,0.1,150000.0
