In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


In [None]:
# Load the prepared dataset.
# The path must be a raw string: in a normal string literal "\U" (as in
# "C:\Users") starts an 8-hex-digit unicode escape, which makes the original
# literal a SyntaxError in Python 3.
# NOTE(review): this is still a machine-specific absolute path; consider a
# path relative to the notebook so the analysis runs on other machines.
df = pd.read_csv(r"C:\Users\meria\Downloads\ML\MLproject\prepared_data.csv")

In [None]:
# Structural overview of the loaded frame (columns, dtypes, non-null counts)
# before selecting the clustering features.
df.info()

In [None]:
# Columns fed to the clustering, grouped by kind. The concatenation order
# below defines the column order of X / X_scaled.

# Numeric job descriptors and numerically encoded attributes.
numeric_features = [
    "Median Salary (USD)",
    "Experience Required (Years)",
    "Job Openings (2024)",
    "Projected Openings (2030)",
    "Remote Work Ratio (%)",
    "Automation Risk (%)",
    "Gender Diversity (%)",
    "Education_Num",
    "AI_Impact_Num",
    "Job_Status_Num",
]

# Industry indicator columns (presumably one-hot encoded upstream; verify
# against the data-preparation step).
industry_features = [
    "Industry_education",
    "Industry_entertainment",
    "Industry_finance",
    "Industry_healthcare",
    "Industry_it",
    "Industry_manufacturing",
    "Industry_retail",
    "Industry_transportation",
]

# Location indicator columns (same assumption as for the industry columns).
location_features = [
    "Location_australia",
    "Location_brazil",
    "Location_canada",
    "Location_china",
    "Location_germany",
    "Location_india",
    "Location_uk",
    "Location_usa",
]

features = numeric_features + industry_features + location_features

# Standardize every selected column so no feature dominates the Euclidean
# distances used by the clustering purely because of its scale.
X = df[features]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

# Agglomerative clustering of the standardized features with Ward linkage.
linkage_matrix = linkage(X_scaled, method="ward")

# Draw a truncated dendrogram (truncate_mode="lastp", p=10) so the tree stays
# readable; show_contracted marks where lower merges were collapsed.
fig, ax = plt.subplots(figsize=(14, 6))
dendrogram(
    linkage_matrix,
    truncate_mode="lastp",
    p=10,
    leaf_rotation=45.0,
    leaf_font_size=12.0,
    show_contracted=True,
    ax=ax,
)
ax.set_title("Dendrogramme du Clustering Hiérarchique")
ax.set_xlabel("Clusters")
ax.set_ylabel("Distance (Ward)")
plt.show()


In [None]:
# Number of flat clusters to cut the hierarchy into. Kept as a named constant
# so the cut can be changed in one place; note that the written interpretation
# at the end of the notebook describes exactly four clusters.
N_CLUSTERS = 4

# criterion='maxclust' cuts the tree so that no more than N_CLUSTERS flat
# clusters are formed; the resulting label of each row is stored on the frame.
clusters = fcluster(linkage_matrix, t=N_CLUSTERS, criterion='maxclust')
df["Cluster"] = clusters

In [None]:
# Profile each cluster by the mean of every clustering feature, computed on
# the columns of df (not on the standardized X_scaled array).
rows_by_cluster = df.groupby("Cluster")
cluster_summary = rows_by_cluster[features].mean()
display(cluster_summary)


In [None]:
# Reduce the standardized features to two principal components, used only to
# visualize the cluster assignment in 2D.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(X_scaled)

fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=pca_result[:, 0],
    y=pca_result[:, 1],
    hue=df["Cluster"],
    palette="Set2",
    ax=ax,
)
ax.set_title("Visualisation des Clusters (PCA)")
ax.set_xlabel("Composante principale 1")
ax.set_ylabel("Composante principale 2")
plt.show()


In [None]:
# Evaluation: Calinski–Harabasz index of the cluster assignment, computed on
# the same standardized features that were clustered.
from sklearn.metrics import calinski_harabasz_score

cluster_labels = df["Cluster"]
ch_index = calinski_harabasz_score(X_scaled, cluster_labels)
print(f"Calinski–Harabasz Index: {ch_index:.3f}")


In [None]:
# Hand-written interpretation of the four clusters. The text is static (it is
# not derived from cluster_summary at runtime), so it must be revised manually
# if the data or the number of clusters changes.
interpretation_lines = (
    " Interprétation des Clusters (Hierarchical Clustering) \n",

    "Cluster 1 : Experts IA / Data Science",
    "  - Salaire : élevé",
    "  - Risque d’automatisation : faible",
    "  - Éducation : forte (masters, doctorats)",
    "  - Interprétation : Métiers experts de l’IA, data scientists, chercheurs IA\n",

    "Cluster 2 : Développeurs / Ingénieurs logiciels",
    "  - Salaire : moyen",
    "  - Risque d’automatisation : modéré",
    "  - Secteur : principalement IT, finance, retail",
    "  - Interprétation : Métiers techniques intermédiaires du numérique\n",

    "Cluster 3 : Support technique / postes répétitifs",
    "  - Salaire : faible",
    "  - Risque d’automatisation : élevé",
    "  - Secteurs : manufacturing, retail, transport",
    "  - Interprétation : Métiers vulnérables à la robotisation, nécessitant reconversion\n",

    "Cluster 4 : Métiers hybrides et émergents",
    "  - Salaire : élevé",
    "  - Risque d’automatisation : faible",
    "  - Caractéristiques : interdisciplinaires (IA + santé / finance / éthique / éducation)",
    "  - Interprétation : Nouveaux métiers hybrides de l’IA (AI Policy, Data Ethics, Healthcare Analyst)",
)

# One line per entry; entries ending in "\n" produce the blank separator lines.
print(*interpretation_lines, sep="\n")
