## Dane dla miast, dokonanie klasteryzacji względem geografii
Poniżej próba podzielenia miast geograficznie i przydzielenia im kodowania one-hot w celu eksploracji
potencjalnego zysku dla funkcji straty.

In [2]:
# importy
from pathlib import Path
import pandas as pd
import folium
from folium.plugins import MarkerCluster
from IPython.display import display
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from collections import defaultdict
from sklearn.metrics.pairwise import haversine_distances
import numpy as np

from typing import Dict, List
import re
import os

In [3]:
# Load the city attribute table; nothing below can run without it, so the
# error is reported and then re-raised.
csv_file = Path("Data//data_raw") / "city_attributes.csv" 
try:
    data = pd.read_csv(csv_file)
except FileNotFoundError:
    print(f"Error: The file {csv_file} was not found.")
    raise

# Stop early when any column used by the map or the clustering is absent.
required_columns = {'City', 'Country', 'Latitude', 'Longitude'}
missing = required_columns - set(data.columns)
if missing:
    print(f"Error: CSV file is missing the following required columns: {missing}")
    raise Exception("Missing required columns in CSV.")

# Centre the map on the mean coordinate of all cities.
avg_lat, avg_lon = (data[column].mean() for column in ('Latitude', 'Longitude'))
m = folium.Map(location=[avg_lat, avg_lon], zoom_start=2)

# Markers are added to a MarkerCluster layer rather than to the map directly.
marker_cluster = MarkerCluster().add_to(m)

for idx, row in data.iterrows():
    city, country = row['City'], row['Country']
    lat, lon = row['Latitude'], row['Longitude']
    popup_text = f"{city}, {country}"
    marker = folium.Marker(
        location=[lat, lon],
        popup=popup_text,
        tooltip=popup_text,
    )
    marker.add_to(marker_cluster)

# Show the map inline, then keep a standalone HTML copy.
display(m)

output_file = 'city_map_with_clusters.html'
m.save(output_file)
print(f"Map has been saved to {output_file}")


Map has been saved to city_map_with_clusters.html


In [4]:
### Cluster creation - below using the DBSCAN algorithm.
# Two variants are computed: Euclidean distance on standardised coordinates
# and haversine distance on coordinates converted to radians.
coords = data[['Latitude', 'Longitude']].values
scaler = StandardScaler()
coords_scaled = scaler.fit_transform(coords)
coords_rad = np.radians(coords)

# --- Variant 1: Euclidean metric on the standardised coordinates ---
db = DBSCAN(eps=0.27, min_samples=2, metric='euclidean').fit(coords_scaled)
# DBSCAN marks outliers with -1; every other label is shifted to start at 1.
data['Cluster'] = [
    'Noise' if label == -1 else f'cluster_{label+1}' for label in db.labels_
]

# Group city names by label, in order of first appearance in the table.
cluster_dict = defaultdict(list)
for cluster, city in zip(data['Cluster'], data['City']):
    cluster_dict[cluster].append(city)
cluster_dict_euclid = dict(cluster_dict)

# --- Variant 2: haversine metric on [latitude, longitude] in radians ---
# NOTE(review): with this metric eps is an angle in radians; 0.06 rad would be
# roughly 380 km on Earth - confirm this is the intended neighbourhood size.
db = DBSCAN(eps=0.06, min_samples=2, metric='haversine').fit(coords_rad)
# The 'Cluster' column is overwritten, so it ends up holding the haversine labels.
data['Cluster'] = [
    'Noise' if label == -1 else f'cluster_{label+1}' for label in db.labels_
]

cluster_dict = defaultdict(list)
for cluster, city in zip(data['Cluster'], data['City']):
    cluster_dict[cluster].append(city)
cluster_dict_haversine = dict(cluster_dict)

In [70]:
# List the Euclidean-DBSCAN grouping, one label per line.
for label in cluster_dict_euclid:
    print(f"{label}: {cluster_dict_euclid[label]}")
# The total counts every dictionary key, so a 'Noise' bucket is included.
print(f"Number of clusters: {len(cluster_dict_euclid)}")

Noise: ['Vancouver', 'Portland', 'San Francisco', 'Seattle', 'Dallas', 'Minneapolis', 'Jacksonville', 'Miami', 'Montreal', 'Eilat']
cluster_1: ['Los Angeles', 'San Diego', 'Phoenix']
cluster_2: ['Las Vegas', 'Albuquerque']
cluster_3: ['Denver', 'Kansas City', 'Saint Louis', 'Indianapolis', 'Pittsburgh', 'Philadelphia', 'New York']
cluster_4: ['San Antonio', 'Houston']
cluster_5: ['Chicago', 'Detroit', 'Toronto', 'Boston']
cluster_6: ['Nashville', 'Atlanta', 'Charlotte']
cluster_7: ['Beersheba', 'Tel Aviv District', 'Haifa', 'Nahariyya', 'Jerusalem']
Number of clusters: 8


In [85]:
# List the haversine-DBSCAN grouping, one label per line.
for label in cluster_dict_haversine:
    print(f"{label}: {cluster_dict_haversine[label]}")
# The total counts every dictionary key, so a 'Noise' bucket is included.
print(f"Number of clusters: {len(cluster_dict_haversine)}")

cluster_1: ['Vancouver', 'Portland', 'Seattle']
Noise: ['San Francisco', 'Phoenix', 'Albuquerque', 'Denver', 'Kansas City', 'Minneapolis', 'Jacksonville', 'Miami', 'Montreal']
cluster_2: ['Los Angeles', 'San Diego', 'Las Vegas']
cluster_3: ['San Antonio', 'Dallas', 'Houston']
cluster_4: ['Saint Louis', 'Chicago', 'Indianapolis']
cluster_5: ['Nashville', 'Atlanta', 'Charlotte']
cluster_6: ['Detroit', 'Pittsburgh', 'Toronto']
cluster_7: ['Philadelphia', 'New York', 'Boston']
cluster_8: ['Beersheba', 'Tel Aviv District', 'Eilat', 'Haifa', 'Nahariyya', 'Jerusalem']
Number of clusters: 9


In [12]:
## Manually assigned clusters
## Because the data set is small, the clusters are assigned "at our own discretion".
cluster_cities_manual = {
    "cluster_1": ["Vancouver", "Seattle", "Portland"],
    "cluster_2": ["San Francisco"],
    "cluster_3": ["Los Angeles", "San Diego"],
    "cluster_4": ["Las Vegas", "Phoenix"],
    "cluster_5": ["Albuquerque"],
    "cluster_6": ["San Antonio", "Dallas", "Houston"],
    "cluster_7": ["Miami", "Jacksonville"],
    "cluster_8": ["Denver"],
    "cluster_9": ["Nashville", "Atlanta", "Charlotte"],
    "cluster_10": ["Pittsburgh"],
    "cluster_11": ["Saint Louis", "Kansas City", "Indianapolis"],
    "cluster_12": ["Detroit", "Chicago"],
    "cluster_13": [
        "Beersheba",
        "Tel Aviv District",
        "Eilat",
        "Haifa",
        "Nahariyya",
        "Jerusalem",
    ],
    "cluster_14": ["Montreal", "Toronto", "Boston"],
    "cluster_15": ["Philadelphia", "New York"],
    "cluster_16": ["Minneapolis"],
}

In [5]:
def preprocess_city_data(
    cluster_dict: Dict[str, List[str]],
    input_dir: str,
    output_dir: str
) -> Dict[str, str]:
    """Append one-hot cluster columns to each city CSV and save a copy.

    Every ``*.csv`` file in ``input_dir`` is expected to be named
    ``<city>_<anything>.csv``; the text before the first underscore is taken
    as the city name. Each file gets one 0/1 column per cluster label and is
    written to ``output_dir`` as
    ``<City_Name>_cluster_<suffix>_preprocessed.csv``.

    All output files share the same set of one-hot columns in the same,
    deterministic order: labels in the order they appear in ``cluster_dict``,
    followed by labels created for cities that are only known from a filename.

    Args:
        cluster_dict: Maps a cluster label to the cities it contains. Cities
            listed under the special ``'Noise'`` key each receive their own
            ``Noise_<City>`` label instead of sharing one.
        input_dir: Directory holding the per-city CSV files (a path-like
            object works as well as a string).
        output_dir: Directory for the augmented copies; created if missing.

    Returns:
        Mapping from every input filename that was saved successfully to the
        cluster label assigned to it. Files that could not be parsed, loaded
        or saved are reported on stdout and left out.
    """
    # Resolve each city to its label; noise cities become singleton clusters.
    city_to_cluster: Dict[str, str] = {}
    for cluster_label, cities in cluster_dict.items():
        for city in cities:
            if cluster_label != 'Noise':
                city_to_cluster[city] = cluster_label
            else:
                city_to_cluster[city] = f'Noise_{city.replace(" ", "_")}'

    city_prefix = re.compile(r'^(.+?)_')
    # Sorted so that processing order does not depend on the file system.
    csv_files = sorted(
        name for name in os.listdir(input_dir) if name.lower().endswith('.csv')
    )

    # First pass: register cities that appear only as filenames *before* the
    # column list is frozen, so every output file gets identical columns.
    file_to_city: Dict[str, str] = {}
    for filename in csv_files:
        match = city_prefix.match(filename)
        if not match:
            continue
        city = match.group(1).replace('_', ' ')
        file_to_city[filename] = city
        if not city_to_cluster.get(city):
            city_to_cluster[city] = f'Noise_{city.replace(" ", "_")}'

    # dict.fromkeys de-duplicates while keeping first-seen order; a plain set
    # would make the column order vary between interpreter runs.
    cluster_columns = list(dict.fromkeys(city_to_cluster.values()))

    os.makedirs(output_dir, exist_ok=True)
    city_cluster_mapping: Dict[str, str] = {}

    # Second pass: load, annotate and save each file.
    for filename in csv_files:
        city = file_to_city.get(filename)
        if city is None:
            print(f"Skipping file '{filename}': Unable to extract city name.")
            continue

        cluster_label = city_to_cluster[city]
        file_path = os.path.join(input_dir, filename)

        # Best effort: one unreadable file must not abort the whole batch.
        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            print(f"Error loading '{filename}': {e}")
            continue

        # Constant columns: 1 for the file's own cluster, 0 for all others.
        for column in cluster_columns:
            df[column] = 1 if column == cluster_label else 0

        # Derive the filename suffix from the label.
        if cluster_label.startswith('cluster_'):
            cluster_num = cluster_label.split('_', 1)[1]
        elif cluster_label.startswith('Noise_'):
            noise_suffix = cluster_label.split('_', 1)[1]
            cluster_num = f'noise_{noise_suffix}'
        else:
            # Labels that follow neither naming scheme.
            cluster_num = 'unknown'

        city_for_filename = city.replace(' ', '_')
        new_filename = f"{city_for_filename}_cluster_{cluster_num}_preprocessed.csv"
        output_path = os.path.join(output_dir, new_filename)

        try:
            df.to_csv(output_path, index=False)
        except Exception as e:
            print(f"Error saving '{new_filename}': {e}")
            continue
        print(f"Processed and saved: '{new_filename}'")

        # Only files that were actually written are reported back.
        city_cluster_mapping[filename] = cluster_label

    return city_cluster_mapping


In [103]:
# One-hot encode the per-city files using the haversine DBSCAN clusters.
preprocess_city_data(
    cluster_dict_haversine,
    Path("Data/data_concat_transformed"),
    Path("Data/data_concat_clustered_haversine"),
)

Processed and saved: 'Albuquerque_cluster_noise_Albuquerque_preprocessed.csv'
Processed and saved: 'Atlanta_cluster_5_preprocessed.csv'
Processed and saved: 'Beersheba_cluster_8_preprocessed.csv'
Processed and saved: 'Boston_cluster_7_preprocessed.csv'
Processed and saved: 'Charlotte_cluster_5_preprocessed.csv'
Processed and saved: 'Chicago_cluster_4_preprocessed.csv'
Processed and saved: 'Dallas_cluster_3_preprocessed.csv'
Processed and saved: 'Denver_cluster_noise_Denver_preprocessed.csv'
Processed and saved: 'Detroit_cluster_6_preprocessed.csv'
Processed and saved: 'Eilat_cluster_8_preprocessed.csv'
Processed and saved: 'Haifa_cluster_8_preprocessed.csv'
Processed and saved: 'Houston_cluster_3_preprocessed.csv'
Processed and saved: 'Indianapolis_cluster_4_preprocessed.csv'
Processed and saved: 'Jacksonville_cluster_noise_Jacksonville_preprocessed.csv'
Processed and saved: 'Jerusalem_cluster_8_preprocessed.csv'
Processed and saved: 'Kansas_City_cluster_noise_Kansas_City_preprocessed.

{'Albuquerque_concatenated_preprocessed.csv': 'Noise_Albuquerque',
 'Atlanta_concatenated_preprocessed.csv': 'cluster_5',
 'Beersheba_concatenated_preprocessed.csv': 'cluster_8',
 'Boston_concatenated_preprocessed.csv': 'cluster_7',
 'Charlotte_concatenated_preprocessed.csv': 'cluster_5',
 'Chicago_concatenated_preprocessed.csv': 'cluster_4',
 'Dallas_concatenated_preprocessed.csv': 'cluster_3',
 'Denver_concatenated_preprocessed.csv': 'Noise_Denver',
 'Detroit_concatenated_preprocessed.csv': 'cluster_6',
 'Eilat_concatenated_preprocessed.csv': 'cluster_8',
 'Haifa_concatenated_preprocessed.csv': 'cluster_8',
 'Houston_concatenated_preprocessed.csv': 'cluster_3',
 'Indianapolis_concatenated_preprocessed.csv': 'cluster_4',
 'Jacksonville_concatenated_preprocessed.csv': 'Noise_Jacksonville',
 'Jerusalem_concatenated_preprocessed.csv': 'cluster_8',
 'Kansas City_concatenated_preprocessed.csv': 'Noise_Kansas_City',
 'Las Vegas_concatenated_preprocessed.csv': 'cluster_2',
 'Los Angeles_conc

In [101]:
# One-hot encode the per-city files using the Euclidean DBSCAN clusters.
preprocess_city_data(
    cluster_dict_euclid,
    Path("Data/data_concat_transformed"),
    Path("Data/data_concat_clustered"),
)

Processed and saved: 'Albuquerque_cluster_2_preprocessed.csv'
Processed and saved: 'Atlanta_cluster_6_preprocessed.csv'
Processed and saved: 'Beersheba_cluster_7_preprocessed.csv'
Processed and saved: 'Boston_cluster_5_preprocessed.csv'
Processed and saved: 'Charlotte_cluster_6_preprocessed.csv'
Processed and saved: 'Chicago_cluster_5_preprocessed.csv'
Processed and saved: 'Dallas_cluster_noise_Dallas_preprocessed.csv'
Processed and saved: 'Denver_cluster_3_preprocessed.csv'
Processed and saved: 'Detroit_cluster_5_preprocessed.csv'
Processed and saved: 'Eilat_cluster_noise_Eilat_preprocessed.csv'
Processed and saved: 'Haifa_cluster_7_preprocessed.csv'
Processed and saved: 'Houston_cluster_4_preprocessed.csv'
Processed and saved: 'Indianapolis_cluster_3_preprocessed.csv'
Processed and saved: 'Jacksonville_cluster_noise_Jacksonville_preprocessed.csv'
Processed and saved: 'Jerusalem_cluster_7_preprocessed.csv'
Processed and saved: 'Kansas_City_cluster_3_preprocessed.csv'
Processed and sav

{'Albuquerque_concatenated_preprocessed.csv': 'cluster_2',
 'Atlanta_concatenated_preprocessed.csv': 'cluster_6',
 'Beersheba_concatenated_preprocessed.csv': 'cluster_7',
 'Boston_concatenated_preprocessed.csv': 'cluster_5',
 'Charlotte_concatenated_preprocessed.csv': 'cluster_6',
 'Chicago_concatenated_preprocessed.csv': 'cluster_5',
 'Dallas_concatenated_preprocessed.csv': 'Noise_Dallas',
 'Denver_concatenated_preprocessed.csv': 'cluster_3',
 'Detroit_concatenated_preprocessed.csv': 'cluster_5',
 'Eilat_concatenated_preprocessed.csv': 'Noise_Eilat',
 'Haifa_concatenated_preprocessed.csv': 'cluster_7',
 'Houston_concatenated_preprocessed.csv': 'cluster_4',
 'Indianapolis_concatenated_preprocessed.csv': 'cluster_3',
 'Jacksonville_concatenated_preprocessed.csv': 'Noise_Jacksonville',
 'Jerusalem_concatenated_preprocessed.csv': 'cluster_7',
 'Kansas City_concatenated_preprocessed.csv': 'cluster_3',
 'Las Vegas_concatenated_preprocessed.csv': 'cluster_2',
 'Los Angeles_concatenated_prepr

In [104]:
# One-hot encode the per-city files using the manually assigned clusters.
preprocess_city_data(
    cluster_cities_manual,
    Path("Data/data_concat_transformed"),
    Path("Data/data_concat_clustered_manual"),
)

Processed and saved: 'Albuquerque_cluster_5_preprocessed.csv'
Processed and saved: 'Atlanta_cluster_9_preprocessed.csv'
Processed and saved: 'Beersheba_cluster_13_preprocessed.csv'
Processed and saved: 'Boston_cluster_14_preprocessed.csv'
Processed and saved: 'Charlotte_cluster_9_preprocessed.csv'
Processed and saved: 'Chicago_cluster_12_preprocessed.csv'
Processed and saved: 'Dallas_cluster_6_preprocessed.csv'
Processed and saved: 'Denver_cluster_8_preprocessed.csv'
Processed and saved: 'Detroit_cluster_12_preprocessed.csv'
Processed and saved: 'Eilat_cluster_13_preprocessed.csv'
Processed and saved: 'Haifa_cluster_13_preprocessed.csv'
Processed and saved: 'Houston_cluster_6_preprocessed.csv'
Processed and saved: 'Indianapolis_cluster_11_preprocessed.csv'
Processed and saved: 'Jacksonville_cluster_7_preprocessed.csv'
Processed and saved: 'Jerusalem_cluster_13_preprocessed.csv'
Processed and saved: 'Kansas_City_cluster_11_preprocessed.csv'
Processed and saved: 'Las_Vegas_cluster_4_prep

{'Albuquerque_concatenated_preprocessed.csv': 'cluster_5',
 'Atlanta_concatenated_preprocessed.csv': 'cluster_9',
 'Beersheba_concatenated_preprocessed.csv': 'cluster_13',
 'Boston_concatenated_preprocessed.csv': 'cluster_14',
 'Charlotte_concatenated_preprocessed.csv': 'cluster_9',
 'Chicago_concatenated_preprocessed.csv': 'cluster_12',
 'Dallas_concatenated_preprocessed.csv': 'cluster_6',
 'Denver_concatenated_preprocessed.csv': 'cluster_8',
 'Detroit_concatenated_preprocessed.csv': 'cluster_12',
 'Eilat_concatenated_preprocessed.csv': 'cluster_13',
 'Haifa_concatenated_preprocessed.csv': 'cluster_13',
 'Houston_concatenated_preprocessed.csv': 'cluster_6',
 'Indianapolis_concatenated_preprocessed.csv': 'cluster_11',
 'Jacksonville_concatenated_preprocessed.csv': 'cluster_7',
 'Jerusalem_concatenated_preprocessed.csv': 'cluster_13',
 'Kansas City_concatenated_preprocessed.csv': 'cluster_11',
 'Las Vegas_concatenated_preprocessed.csv': 'cluster_4',
 'Los Angeles_concatenated_preprocess

In [9]:
def _find_city_file(city: str, filenames: List[str]):
    """Return the first name in ``filenames`` belonging to ``city``, or None.

    A file belongs to a city when it is named ``<city>_<anything>.csv``, where
    the words of a multi-word city may be separated by either a space or an
    underscore (``Kansas City_x.csv`` and ``Kansas_City_x.csv`` both match).
    """
    city_part = '[ _]'.join(re.escape(word) for word in city.split(' '))
    pattern = re.compile(rf'^{city_part}_.*\.csv$')
    for name in filenames:
        if pattern.match(name):
            return name
    return None


def concatenate_clusters(
    cluster_dict: Dict[str, List[str]],
    input_dir,
    output_dir,
    date_key: str = "datetime",
) -> None:
    """Merge per-city CSV files into one date-sorted CSV per cluster.

    For every regular cluster, the files of its cities are concatenated,
    sorted by ``date_key`` and written to ``<cluster>_preprocessed.csv``.
    Cities under the special ``'Noise'`` key are not merged: each one is
    sorted and written on its own as ``<City_Name>_preprocessed.csv``.

    Args:
        cluster_dict: Maps a cluster label to the cities it contains.
        input_dir: Directory holding the per-city CSV files.
        output_dir: Directory for the results; created if missing.
        date_key: Name of the column used for sorting.

    Returns:
        None. The results are written to ``output_dir``; cities without a
        matching input file are reported on stdout and skipped.
    """
    os.makedirs(output_dir, exist_ok=True)
    # Read the directory once; sorting makes "first match wins" deterministic.
    filenames = sorted(os.listdir(input_dir))

    for cluster, cities in cluster_dict.items():
        is_noise = cluster == 'Noise'
        frames = []
        for city in cities:
            filename = _find_city_file(city, filenames)
            if filename is None:
                # Previously such cities vanished from the output unnoticed.
                print(f"No input file found for city '{city}' in '{input_dir}'.")
                continue

            df = pd.read_csv(os.path.join(input_dir, filename))
            if is_noise:
                # Noise cities stay separate: one sorted file per city.
                output_file = f"{city.replace(' ', '_')}_preprocessed.csv"
                df.sort_values(by=date_key).to_csv(
                    os.path.join(output_dir, output_file), index=False
                )
            else:
                frames.append(df)

        # Nothing is written for a cluster none of whose cities has a file.
        if frames:
            concatenated = pd.concat(frames, ignore_index=True).sort_values(by=date_key)
            output_file = f"{cluster}_preprocessed.csv"
            concatenated.to_csv(os.path.join(output_dir, output_file), index=False)

In [110]:
# Merge the transformed per-city files into per-cluster files, once for each
# clustering variant (Euclidean DBSCAN, manual, haversine DBSCAN).
concatenate_clusters(
    cluster_dict_euclid,
    Path("Data/data_concat_transformed"),
    Path("Data/data_concat_clustered_concatenated"),
)
concatenate_clusters(
    cluster_cities_manual,
    Path("Data/data_concat_transformed"),
    Path("Data/data_concat_clustered_concatenated_manual"),
)
concatenate_clusters(
    cluster_dict_haversine,
    Path("Data/data_concat_transformed"),
    Path("Data/data_concat_clustered_concatenated_haversine"),
)

In [10]:
# Same merge as for the transformed data, but reading the files from
# Data/data_concatenated, again once per clustering variant.
concatenate_clusters(
    cluster_dict_euclid,
    Path("Data/data_concatenated"),
    Path("Data/data_clustered_nontransformed"),
)
concatenate_clusters(
    cluster_cities_manual,
    Path("Data/data_concatenated"),
    Path("Data/data_clustered_nontransformed_manual"),
)
concatenate_clusters(
    cluster_dict_haversine,
    Path("Data/data_concatenated"),
    Path("Data/data_clustered_nontransformed_haversine"),
)