# Preparation

## Load libraries

In [None]:
import numpy as np
import pandas as pd

from my_functions.functions_analysis import DataPreparation
from my_functions.functions_analysis import DimensionalityReduction
from my_functions.functions_analysis import Clustering
from my_functions.functions_analysis import Vizualization

# Data Preparation 

## Load data

### Shape files

In [None]:
# Example usage: paths to the pickled state and county shape data.
state_path='data/original_data/pkl/state.pickle'
county_path='data/original_data/pkl/county.pickle'

# The geometry helper returns six objects, unpacked in this order. Several of
# them (the filtered state/county shapes) are reused by the region-assignment
# and map-plotting cells further down.
(
    state_shape,
    county_shape,
    filtered_state_shape_north_america,
    filtered_state_shape_alaska,
    filtered_county_shape_north_america,
    filtered_county_shape_alaska
) = DataPreparation.prepare_us_geometries_with_original_names(state_path, county_path)


### County data

In [None]:
# Load the county master table from its pickle file and preview the first rows.
# `pd` is imported in the notebook's first (import) cell so that all
# dependencies are declared in one place.
# NOTE: unpickling can execute arbitrary code - only load pickle files from a
# trusted source.
county_master = pd.read_pickle('data/original_data/pkl/county_information.pkl')
county_master.head()

### US industry regions

In [None]:
# Assign industry regions using the filtered North America state shapes.
# The helper returns two objects; the second one is printed below as the
# states that were assigned to more than one region.
industry_regions, multi_region_states = DataPreparation.assign_industry_regions(filtered_state_shape_north_america)

print("States assigned to multiple regions:", multi_region_states)

In [None]:
# Compare the county master table with the industry regions.
# `unique_missing` is printed below as the missing states/territories;
# `us_territories` is passed on to the feature-dataframe loader later on.
unique_missing, us_territories = DataPreparation.find_missing_us_territories(county_master, industry_regions)

print("Missing states/territories:")
print(unique_missing)

### Feature Dataframe

In [None]:
# Load the pickled feature dataframe. `us_territories` is handed to the loader
# (presumably so those entries are filtered out - confirm in
# load_and_filter_feature_dataframe).
df_original= DataPreparation.load_and_filter_feature_dataframe(
    'data/processed_data/pkl/feature_df.pkl',
    us_territories
)

# Preview the first rows of the loaded data.
df_original.head()

In [None]:
# Print the structural summary (columns, dtypes, non-null counts) of the feature dataframe.
df_original.info()

# Scaling

In [None]:
# Apply the log10 scaling helper to the original feature dataframe.
# It returns three objects, unpacked here in order; all three are used by
# later cells (scaled data, scaled features only, original features only -
# naming per the variables, confirm exact contents in apply_log10_scaling).
(
    df_scaled_log,
    df_scaled_features_log,
    df_original_features,
) = DataPreparation.apply_log10_scaling(df_original)

In [None]:
# Map every feature column name to a short identifier ("F1", "F2", ...),
# numbered in column order starting at 1.
feature_id_dict = {
    column_name: f"F{position}"
    for position, column_name in enumerate(df_original_features.columns, start=1)
}
feature_id_dict

In [None]:
# Plot the feature distributions of the original data next to the log10-scaled data.
DataPreparation.plot_feature_distributions_before_after_log(df_original, df_scaled_log)

In [None]:
# Plot histograms for the original and the log10-scaled data.
DataPreparation.plot_histograms(df_original, df_scaled_log)

In [None]:
# Optional: human-readable descriptions of the features
# (8 SOC occupation codes followed by 10 NAICS industry codes).
# NOTE(review): the list is passed without keys, so it is presumably matched to
# the feature columns by position - confirm that this order equals the column
# order expected by create_feature_summary_table.
descriptions = [
    "SOC 51-9022, Grinding, Polishing by Hand",
    "SOC 51-4121, Welders, Cutters, Solderers",
    "SOC 49-9041, Industrial Machinery Mechanics",
    "SOC 49-9071, Maintenance and Repair Workers",
    "SOC 51-4033, Grinding, Lapping, Polishing",
    "SOC 51-4035, Milling and Planing Machine Setters",
    "SOC 47-2211, Sheet Metal Workers",
    "SOC 51-2041, Structural Metal Fabricators",
    "NAICS 3315, Foundries", 
    "NAICS 3364, Aerospace",
    "NAICS 3366, Shipbuilding",
    "NAICS 3335, Metalworking Machines Manufacturing", 
    "NAICS 3320A1, Steel forming", 
    "NAICS 3320A2, Structural Metals Manufacturing",
    "NAICS 3327, Machine Shops", 
    "NAICS 3312, Steel Product Manufacturing", 
    "NAICS 3314, Nonferrous Metal Production",
    "NAICS 3361/3362, Automotive"
]

# Build the summary: the helper returns the plain summary dataframe and a styled version.
summary_df, styled_summary = DataPreparation.create_feature_summary_table(df_original, descriptions=descriptions)

# Display the styled table as the cell output (rich rendering in Jupyter).
styled_summary


# Dimensionality Reduction

In [None]:
# Use the log10-scaled data as the working dataset for the following steps.
# These are plain aliases (no copies): both names refer to the same objects.
df_scaled = df_scaled_log
df_scaled_features = df_scaled_features_log

### t-SNE

In [None]:
# Run the t-SNE helper on the scaled features. The returned embedding `X_tsne`
# is reused by the clustering and visualization cells below.
# NOTE(review): t-SNE is stochastic - confirm that plot_tsne_2d fixes a random
# seed, otherwise downstream cluster IDs change between runs.
X_tsne = DimensionalityReduction.plot_tsne_2d(df_scaled_features)

### UMAP

In [None]:
# Run the UMAP helper on the scaled features and keep its return value.
# NOTE(review): UMAP is stochastic as well - confirm plot_umap_2d fixes a random seed.
embedding = DimensionalityReduction.plot_umap_2d(df_scaled_features)

## PCA

In [None]:
# Run the PCA helper; it returns two objects (named here as 2-D and 3-D projections).
# NOTE(review): this call receives `df_scaled_log`, whereas the t-SNE and UMAP
# calls receive `df_scaled_features` - confirm that plot_pca_2d_3d expects the
# full scaled dataframe rather than the feature-only frame.
X_pca_2d, X_pca_3d = DimensionalityReduction.plot_pca_2d_3d(df_scaled_log)

# Clustering

## HDBSCAN on the full feature space

In [None]:
# Hyper-parameter grid for HDBSCAN on the full feature space:
# min_cluster_size = 30, 32, ..., 48 and min_samples = 1, 2, ..., 9.
param_grid = dict(
    min_cluster_size=range(30, 50, 2),
    min_samples=range(1, 10),
)

# Optional: true_labels = df_scaled['true_label']

# Run the grid search on the log10-scaled data; the results are also written
# to the pickle file given by `save_path`, which the selection cell reads back.
results_df = Clustering.run_hdbscan_gridsearch(
    df_scaled=df_scaled_log,
    param_grid=param_grid,
    true_labels=None,  # or pass true_labels, if available
    save_path='data/processed_data/pkl/hdbscan_results_df.pickle',
)
results_df

In [None]:
# Pick the best configuration from the grid-search results stored in the
# pickle file written by the previous cell. The three thresholds are passed
# through to the helper; their exact semantics are defined in
# select_best_hdbscan_result.
best_hdbscan_full_space = Clustering.select_best_hdbscan_result(
    result_path='data/processed_data/pkl/hdbscan_results_df.pickle',
    noise_threshold=0.1,
    min_clusters=2,
    min_persistence=0.5
)

print("Best HDBSCAN Full Space config:\n", best_hdbscan_full_space)


## HDBSCAN on t-SNE

In [None]:
# Hyper-parameter grid for HDBSCAN on the t-SNE embedding
# (same values as for the full feature space):
# min_cluster_size = 30, 32, ..., 48 and min_samples = 1, 2, ..., 9.
param_grid = dict(
    min_cluster_size=range(30, 50, 2),
    min_samples=range(1, 10),
)

# Run the grid search on the t-SNE embedding; results are written to `save_path`.
hdbscan_results_tsne_df = Clustering.run_hdbscan_gridsearch_tsne(
    tsne_embedding=X_tsne,
    param_grid=param_grid,
    save_path='data/processed_data/pkl/hdbscan_results_tsne_df.pickle',
)


In [None]:
# Pick the best configuration from the t-SNE grid-search results stored in the
# pickle file written by the previous cell. The thresholds mirror those used
# for the full feature space; their semantics are defined in
# select_best_hdbscan_reduced_result.
best_hdbscan_tsne = Clustering.select_best_hdbscan_reduced_result(
    result_path='data/processed_data/pkl/hdbscan_results_tsne_df.pickle',
    noise_threshold=0.1,
    min_clusters=2,
    min_persistence=0.5
)

print("Best HDBSCAN (reduced space):")
print(best_hdbscan_tsne)


## DBSCAN on t-SNE components

In [None]:
# Hyper-parameter grid for DBSCAN on the t-SNE embedding.
# `np` is imported in the notebook's first (import) cell so that all
# dependencies are declared in one place.
#   eps:         2.0, 2.5, ..., 5.0  (the stop value 5.5 is exclusive)
#   min_samples: 20, 25, ..., 45     (the stop value 46 is exclusive)
param_grid = {
    'eps': np.arange(2, 5.5, 0.5),
    'min_samples': range(20, 46, 5)
}

# Run the grid search on the t-SNE embedding; results are written to
# `save_path`, which the selection cell reads back.
dbscan_results_df = Clustering.run_dbscan_gridsearch_tsne(
    tsne_data=X_tsne,
    param_grid=param_grid,
    save_path='data/processed_data/pkl/dbscan_results_df.pickle'
)

In [None]:
# Pick the best DBSCAN configuration from the grid-search results stored in
# the pickle file written by the previous cell. The thresholds are passed
# through to the helper; their semantics are defined in select_best_dbscan_result.
best_dbscan_result = Clustering.select_best_dbscan_result(
    result_path='data/processed_data/pkl/dbscan_results_df.pickle',
    min_clusters=2,
    noise_threshold=0.2,
    strict_noise_preference=0.05
)

print("Best DBSCAN result:\n", best_dbscan_result)


## Visualization of clustering methods

In [None]:
# Apply the three selected clustering configurations and attach colors.
# The helper returns three dataframes, unpacked in this order:
# HDBSCAN on the full feature space, DBSCAN on t-SNE, HDBSCAN on t-SNE.
# NOTE(review): confirm that the helper does not add columns to `df_scaled`
# in place - later cells slice `df_scaled` by column position.
df_hdbscan_full, df_dbscan_tsne, df_hdbscan_tsne = Clustering.apply_and_color_clusters(
    df_scaled=df_scaled,
    X_tsne=X_tsne,
    hdbscan_full_space_best_result=best_hdbscan_full_space,
    hdbscan_reduced_space_best_result=best_hdbscan_tsne,
    dbscan_best_params=best_dbscan_result  # or determine dynamically
)


In [None]:
# Plot the three clustering results against the same t-SNE embedding for comparison.
Clustering.plot_cluster_comparison_tsne(
    X_tsne=X_tsne,
    df_dbscan_tsne=df_dbscan_tsne,
    df_hdbscan_tsne=df_hdbscan_tsne,
    df_hdbscan_full=df_hdbscan_full
)


## Metrics

In [None]:
# Cluster labels per method, taken from the 'cluster_id' column of each result frame.
labels_dict = {
    method_name: clustered_df['cluster_id'].values
    for method_name, clustered_df in (
        ('HDBSCAN (full feature set)', df_hdbscan_full),
        ('HDBSCAN (t-SNE)', df_hdbscan_tsne),
        ('DBSCAN (t-SNE)', df_dbscan_tsne),
    )
}

# Data each method was clustered on, keyed identically to `labels_dict`.
# The full-space entry drops the first column of `df_scaled` (the ID column,
# per the original note).
# NOTE(review): this positional slice assumes every remaining column is a
# feature - confirm, or consider `df_scaled_features` instead.
data_dict = dict([
    ('HDBSCAN (full feature set)', df_scaled.iloc[:, 1:]),
    ('HDBSCAN (t-SNE)', X_tsne),
    ('DBSCAN (t-SNE)', X_tsne),
])

# Evaluate: the helper returns the internal metrics and the pairwise comparison.
df_metrics, pairwise_metrics = Clustering.evaluate_clusterings(labels_dict, data_dict)

# Show both result tables.
print("Internal clustering metrics:")
display(df_metrics)

print("\nPairwise comparison (ARI/NMI):")
display(pairwise_metrics)


# Visualizations

## Feature values in t-SNE plot

In [None]:
# Plot the scaled feature values on top of the t-SNE embedding.
# `save_path=None` is passed here; a directory such as the one in the inline
# comment can be supplied instead.
Vizualization.plot_tsne_feature_maps(
    X_tsne=X_tsne,
    df_features=df_scaled_features,
    feature_id_prefix="F",
    save_path=None  # or e.g. 'output/tsne_feature_plots'
)


In [None]:
# Subset of feature columns to show in the grid plot
# (one occupation feature and three NAICS features, by column name).
features_to_plot = [
    'total_emp_occu_51-4033',
    'total_emp_naics_3335',
    'total_emp_naics_3364',
    'total_emp_naics_3366'
]

# Plot the selected features on the t-SNE embedding; `feature_id_dict` supplies
# the short "F<n>" identifiers and the figure path is given by `save_path`
# (relative to the current working directory).
Vizualization.plot_tsne_feature_grid(
    X_tsne=X_tsne,
    df_features=df_scaled_features,
    features_to_plot=features_to_plot,
    feature_id_dict=feature_id_dict,
    save_path='tsne_feature_values.png'
)


## Clusters on map

In [None]:
# Compute cluster rankings from the scaled data and the HDBSCAN (t-SNE)
# clustering result. The helper returns the ranked dataframe and a per-cluster
# summary.
# NOTE(review): `exclude_columns=3` is a positional count - confirm that it
# matches the number of non-feature columns the helper has to skip.
df_scaled_ranked, cluster_summary = Vizualization.compute_cluster_rankings(
    df_scaled=df_scaled,
    cluster_df=df_hdbscan_tsne,  # or df_hdbscan_full
    cluster_col='cluster_id',
    color_col='color',
    exclude_columns=3  # e.g. 'cluster_id', 'color', 'rank'
)

# Inspect the output
display(cluster_summary.head())
display(df_scaled_ranked.head())


In [None]:
# Plot the clustered counties on the map. The helper receives the county
# shapes (North America and Alaska), the state shapes and the ranked cluster
# data, and returns two merged county-shape objects that are kept for the
# next cell.
filtered_county_shape_north_america_merged, filtered_county_shape_alaska_merged = Vizualization.plot_clustered_us_counties(
    filtered_county_shape_north_america,
    filtered_county_shape_alaska,
    filtered_state_shape_north_america,
    df_scaled_ranked
)

In [None]:
# Plot the top-ranked and noise counties, using the merged North America
# county shapes from the previous cell and the state shapes.
Vizualization.plot_top_ranked_and_noise_counties(
    filtered_county_shape_north_america_merged,
    filtered_state_shape_north_america
)


## Feature importance HDBSCAN (reduced space) 

In [None]:
# Draw the feature-importance heatmap from the ranked cluster data and the
# original feature columns; the figure file name is given by `output_file`
# (relative to the current working directory).
Vizualization.plot_feature_importance_heatmap(
    df_scaled=df_scaled_ranked,
    df_original_features=df_original_features,
    output_file="feature_importance_heatmap.png"
)


## Cluster feature values bar plot

In [None]:
# Bar plots of feature values for a hand-picked set of clusters.
# NOTE(review): the cluster IDs in `selected_clusters` are hard-coded; they
# refer to one specific clustering result and need to be re-checked whenever
# the clustering above is re-run with different data, parameters or seeds.
# `occupations` maps SOC codes and `naics` maps NAICS keys to display labels.
# NOTE(review): the "Automotive" key is not a numeric NAICS code like the
# others - confirm that it matches what plot_cluster_feature_bars looks up.
Vizualization.plot_cluster_feature_bars(
    df_original=df_original,
    df_scaled=df_scaled_ranked,
    selected_clusters=[14, 13, 12, 10, 3],
    occupations={
        "51-9022": "Grinding, Polishing by Hand",
        "51-4121": "Welders, Cutters, Solderers",
        "49-9041": "Industrial Machinery Mechanics",
        "49-9071": "Maintenance and Repair Workers",
        "51-4033": "Grinding, Lapping, Polishing",
        "51-4035": "Milling and Planing Machine Setters",
        "47-2211": "Sheet Metal Workers",
        "51-2041": "Structural Metal Fabricators"
    },
    naics={
        "3315": "Foundries",
        "Automotive": "Automotive",
        "3364": "Aerospace",
        "3366": "Shipbuilding",
        "3335": "Metalworking Machines Manufacturing",
        "3320A1": "Steel forming",
        "3320A2": "Structural Metals Manufacturing",
        "3327": "Machine Shops",
        "3312": "Steel Product Manufacturing",
        "3314": "Nonferrous Metal Production"
    }
)
