## Prepping Data for Clustering

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.sparse import hstack
from joblib import parallel_backend

from nltk.corpus import stopwords
import nltk
#nltk.download('stopwords')
from pattern.nl import lemma
#nltk.download('punkt')


import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering

# Loading datasets

In [None]:
#load all datasets separately


In [None]:
# Sheets that were structured visually (merged cells) rather than practically leave a lot of
# empty fields after loading, so they need a small repair.

# First remove all rows that reference neither a Beheersmaatregel nor a Gebeurtenis.
Project_A = Project_A[Project_A['Beheersmaatregel'].notna() | Project_A['Gebeurtenis'].notna()].reset_index(drop=True)

# Then make sure the project info is present on every remaining row: a row without a
# 'Risico code' takes columns 0-16 and 21-end from the row directly above it. Because the
# rows are processed top to bottom, the values propagate down through consecutive gaps.
# Columns 17-20 are left as they are (presumably the measure-specific fields -- TODO confirm).
# The loop starts at 1: row 0 has no predecessor, and `iloc[-1]` would silently wrap around
# and copy the LAST row of the sheet into the first one.
for i in range(1, len(Project_A)):
    if pd.isnull(Project_A.loc[i, 'Risico code']):
        Project_A.iloc[i, 0:17] = Project_A.iloc[i - 1, 0:17]
        Project_A.iloc[i, 21:] = Project_A.iloc[i - 1, 21:]
    


Different types of things to vectorise on
- name + description
- Cause and effect + Measures taken and Measure type
- Category + Allocatie 

# Choosing standard column names and converting everything

In [None]:
# Collect every DataFrame that currently lives in the notebook namespace, together with the
# name of the variable that holds it (the name is used later as the project label).
projects = []
names = []
# Iterate over a snapshot: the loop variables themselves are added to globals() while looping.
for name, obj in list(globals().items()):
    # Names starting with an underscore are skipped: IPython keeps previous cell outputs in
    # `_`, `__`, `_12`, ... so a DataFrame that was merely displayed would otherwise be
    # picked up as an extra "project".
    if name.startswith('_') or not isinstance(obj, pd.DataFrame):
        continue
    projects.append(obj)
    names.append(name)
# NOTE(review): any other DataFrame already present in the kernel (for example frames left
# over from running later cells) is still collected -- run this cell on a fresh kernel.

# Standard column set that every project is converted to
wanted_columns = ['Naam', 'Omschrijving', 'Oorzaak', 'Gevolg', 'Maatregel', 'Type Maatregel', 'Categorie', 'Allocatie', 'Kans I', 'Tijd I', 'Geld I', 'Project']


In [None]:
# Tag every frame with the name of the variable it was loaded into, stored in a 'Project' column
for frame, project_name in zip(projects, names):
    frame['Project'] = project_name

In [None]:
# Making column names consistent: list every (project, column) pair for which a standard
# column is not present yet
for project_name, frame in zip(names, projects):
    for col_name in wanted_columns:
        if col_name not in frame.columns:
            print(project_name, col_name)


In [None]:
#rename columns which had wrong names. I had 27 datasets which had up to four wrong column names


In [None]:
# Some projects lack certain standard columns; add empty (NaN) placeholders so the later
# steps do not fail on a missing column
for column in ('Maatregel', 'Type Maatregel'):
    Project_A[column] = np.nan
for column in ('Omschrijving', 'Type Maatregel', 'Categorie'):
    Project_B[column] = np.nan

In [None]:
# Check the column names again: nothing is printed once every project has all standard columns
for frame, project_name in zip(projects, names):
    still_missing = [col_name for col_name in wanted_columns if col_name not in frame.columns]
    for col_name in still_missing:
        print(project_name, col_name)

In [None]:
# Stack all project frames into one dataset with a fresh 0..n-1 index
combined_data = pd.concat(projects, ignore_index=True)


In [None]:
# Keep only the standard columns and turn NaNs into the textual value 'None', because a
# missing value can also have a meaning.
# One DataFrame-level fillna replaces the former per-column `fillna(..., inplace=True)`:
# calling an in-place method on a selected column is deprecated in recent pandas and no
# longer updates the DataFrame once copy-on-write is active.
combined_data = combined_data[wanted_columns].fillna('None')


# Checking the values within the columns

In [None]:
# Inspect the distinct values to make sure all factor labels are consistent
for column in ('Type Maatregel', 'Categorie'):
    print(combined_data[column].unique())
# For 'Allocatie' only the number of distinct values is shown
print(len(combined_data['Allocatie'].unique()))


In [None]:
# 'Type Maatregel' (the first column inspected) needed no changes.

# 'Categorie' required changes: map every project-specific label onto one of the categories
# used in the rest of the notebook. The two lists are matched by position.
# The result is assigned back to the column rather than calling `replace(..., inplace=True)`
# on the selected Series, which is deprecated in recent pandas and does not update the
# DataFrame under copy-on-write.
combined_data['Categorie'] = combined_data['Categorie'].replace(
    to_replace=['Techniek', 'Veiligheid', 'Tijd&Geld', 'Omgeving', 'Organisatie', 'Kwaliteit', 'Vergunningen', 'Category RIS', 'Contract', 'Mileu', 'Uitvoering', 'Ontwerp', 'Vergunning', 'Planning', 'K&L',
                "SLOT's", 'Proces', 'Wetgeving', 'GWW', 'Civiel', 'Opdrachtgever', 'Verkeersmaatregelen', 'Planning en fasering', 'Contractueel', 'Technisch Realisatie', 'Technisch Ontwerp', 'Beheer en onderhoud',
                'Geografisch', 'Project Afronding Beheersing', 'Werkvoorbereiding', 'Realisatie', 'Politiek / bestuurlijk', 'Milieu'],
    value=['Technisch', 'Ruimtelijk', 'Financieel', 'Ruimtelijk', 'Organisatorisch', 'Ruimtelijk', 'Politiek', 'Juridisch', 'Juridisch', 'Ruimtelijk', 'Organisatorisch', 'Technisch', 'Politiek', 'Organisatorisch', 'Organisatorisch',
           'Ruimtelijk', 'Organisatorisch', 'Juridisch', 'Technisch', 'Technisch' , 'Organisatorisch', 'Maatschappelijk', 'Organisatorisch', 'Juridisch', 'Technisch', 'Technisch', 'Technisch',
           'Ruimtelijk', 'Organisatorisch', 'Organisatorisch', 'Technisch', 'Politiek', 'Ruimtelijk' ])

# 'Allocatie' requires only a small change: collapse the spelling variants onto 'ON', 'OG' and 'ON/OG'
combined_data['Allocatie'] = combined_data['Allocatie'].replace(
    to_replace=['OG/ON', 'ON / OG', 'ON2', 'OG3', 'ON1', 'OG4', 'OG/ ON', 'OG (met impact ON)', 'ON (met impact OG)', 'OG / ON'],
    value=['ON/OG', 'ON/OG', 'ON', 'OG', 'ON', 'OG', 'ON/OG', 'ON/OG', 'ON/OG', 'ON/OG'])

#

In [None]:
# The following columns still hold textual labels and need to be converted to numerical factors
for column in ('Kans I', 'Tijd I', 'Geld I'):
    print(combined_data[column].unique())

In [None]:
# Start converting: map the textual probability labels in 'Kans I' to the numbers 0-5.
# Original note (translated from Dutch): "Dijk Zwolle not yet, because 7 not 5. Moerdijk 6"
#   -- presumably those projects use 7- and 6-level scales; TODO confirm.
# `kans_labels` and `kans_scores` are matched by position; each row of scores belongs to the
# row of labels at the same place.
kans_labels = ['Zeer onwaarschijnlijk', 'Kleine kans', 'Kans bestaat, niet groot', 'Reële kans', 'Grote kans', 'None',
               'Zeer onwaarschijnlijk (RE)', 'Kleine kans (RE)', 'Kans bestaat, niet groot (RE)', 'Reële kans (RE)', 'Grote kans (RE)', 'Zeer grote kans',
               'Onwaarschijnlijk', 'Komt zelden voor', 'Zeer onwaarschijnlijk', 'Kleine kans', 'Geen', 'Zeer groot', 'Zeker', 'Onzekerheid',
               'Zeer klein (0-5%)', 'Klein (5-13%)', 'Redelijk (13-25%)', 'Groot (25-50%)', 'Zeer groot (50-100%)', 'Geen (0%)',
               'Vrijwel zeker', 'Groot', 'Geringe kans (0-5%)', 'Kleine kans (5-10%)', "Redelijke kans (10-25%)", "Grote kans (25-50%)", "Vrijwel zeker (50-100%)",
               'Kan niet optreden', '2) Onwaarschijnlijk', '3) Kans bestaat, niet groot', '4) Reële kans', 'Er is een reëele kans', 'Grote kans, waarschijnlijk',
               '0-1%', '1-10%', '10-25%', '25-50%', '50-100%',
               'Bijna onmogelijk (0-1%)', 'Onwaarschijnlijk (1-10%)', 'Reële kans (10-25%)', 'Grote kans (25-50%)', 'Grote kans (>50%)']
kans_scores = [1, 2, 3, 4, 5, 0,
               1, 2, 3, 4, 5, 5,
               1, 2, 1, 2, 0, 5, 4, 2,
               1, 2, 3, 4, 5, 0,
               5, 4, 1, 2, 3, 4, 5,
               0, 2, 3, 4, 4, 5,
               1, 2, 3, 4, 5,
               1, 2, 3, 4, 5]

# Assign the result back instead of using `inplace=True` on the selected column: the in-place
# form is deprecated in recent pandas and does not update the DataFrame under copy-on-write.
combined_data['Kans I'] = combined_data['Kans I'].replace(to_replace=kans_labels, value=kans_scores)


# Map the textual time-impact labels in 'Tijd I' to numbers. `tijd_labels` and `tijd_scores`
# are matched by position, row by row. Labels containing 'kans' receive negative numbers
# (presumably because they describe opportunities rather than risks -- TODO confirm).
tijd_labels = ['Zeer klein', 'Klein', 'Matig', 'Gemiddeld', 'Groot', 'Zeer groot', 'None',
               'Zeer klein risico', 'Klein risico', 'Gemiddeld risico', 'Groot risico', 'Zeer groot risico', 'Geen',
               'Zeer laag risico', 'Laag risico', 'Hoog risico', 'Extreem hoog risico', 'Extreem risico', 'Time', 'Extreem laag risico',
               'Laag', 'Medium', 'Hoog', 'Extreem', 'Kleine kans', 'Gemiddelde kans', 'Grote kans', 'Zeer kleine kans', 'Ernstig', 'Kleine kans (- 1-2 weken)',
               '< 1 mnd', '1 - 3 mnd', '3 - 6 mnd', '6 -12 mnd',
               '0) Geen', '1) < 1 week', '2) 1 week - 1 maand', '3) 1 - 3 maanden', '4) > 3 maanden',
               '0 tot 0 wkn' ,'0 tot 1 wkn', '1 tot 3 wkn', '3 tot 9 wkn', '9 tot 27 wkn', '27 tot 81 wkn',
               '+ 0 − 1mnd', '+ 1 − 3mnd', '+ 3 − 6mnd', '+ 6 −12 mnd', '+ 6 − 12mnd', '0',
               'Geen vertraging', 'Zeer klein risico (+ < 1 week)', 'Klein risico (+ 1- 2 weken)', 'Middelgroot risico (+ 2-4 weken)', 'Groot risico (+ 4-8 weken)', 'Zeer groot risico (+ >= 8 weken)']
tijd_scores = [1, 2, 3, 3, 4, 5, 0,
               1, 2, 3, 4, 5, 0,
               1, 2, 4, 5, 5, 0, 1,
               1, 2, 3, 5, -2, -3, -4, -1, 4, -2,
               1, 2, 3, 4,
               0, 1, 2, 3, 4,
               0, 1, 2, 3, 4, 5,
               1, 2, 3, 4, 4, 0,
               0, 1, 2, 3, 4, 5]

# Assign the result back instead of using `inplace=True` on the selected column: the in-place
# form is deprecated in recent pandas and does not update the DataFrame under copy-on-write.
combined_data['Tijd I'] = combined_data['Tijd I'].replace(to_replace=tijd_labels, value=tijd_scores)

# Map the textual cost-impact labels in 'Geld I' to numbers. `geld_labels` and `geld_scores`
# are matched by position, row by row. Labels containing 'kans' receive negative numbers
# (presumably because they describe opportunities rather than risks -- TODO confirm).
geld_labels = ['Zeer klein', 'Klein', 'Matig', 'Gemiddeld', 'Groot', 'Zeer groot', 'Zeer Groot', 'None',
               'Zeer klein risico', 'Klein risico', 'klein risico', 'Gemiddeld risico', 'Groot risico', 'Zeer groot risico', 'Geen',
               'Extreem laag risico', 'Laag risico', 'Hoog risico', 'Zeer hoog risico', 'Extreem risico', 'Heel hoog risico',
               'Geen extra kosten', 'Lage extra kosten', 'Marginale extra kosten', 'Redelijke extra kosten', 'Hoge extra kosten', 'Zeer hoge extra kosten', 'Time',
               'Laag', 'Medium', 'Hoog', 'Extreem', 'Kleine kans', 'Gemiddelde kans', 'Grote kans', 'Extreme kans',
               '< 0.25 mln', '0.25-0.5 mln', '0.5-1 mln', '> 2 mln', 'Zeer kleine kans',
               '0) Geen', '1) 0 - 185.000', '2) 185.000 - 925.000', '3) 925.000 - 4.625.000', '4) 4.625.000 - 13.875.000',
               '€ 0 tot € 10.000', '€ 10.000 tot € 75.000', '€ 75.000 tot € 200.000', '€ 200.000 tot € 750.000', '€ 750.000 tot € 2.000.000',
               '0 − 100k', '100k − 250k', '250k − 500k', '500k − 1mln', '1-2 mln', '0',
               '€ 0 - € 100.000 (risico)', '€ 100.000 - € 250.000 (risico)', '€ 250.000 - € 750.000 (risico)', '€ 750.000 - € 1.500.000 (risico)', '> € 1.500.000 (risico)', '€ 250.000 - € 750.000 (kans)']
geld_scores = [1, 2, 3, 3, 4, 5, 5, 0,
               1, 2, 2, 3, 4, 5, 0,
               1, 2, 4, 5, 5, 5,
               0, 1, 2, 3, 4, 5, 0,
               1, 2, 3, 5, -1, -2, -4, -5,
               1, 2, 3, 5, -1,
               0, 1, 2, 3, 4,
               1, 2, 3, 4, 5,
               1, 2, 3, 4, 5, 0,
               1, 2, 3, 4, 5, -3]

# Assign the result back instead of using `inplace=True` on the selected column: the in-place
# form is deprecated in recent pandas and does not update the DataFrame under copy-on-write.
combined_data['Geld I'] = combined_data['Geld I'].replace(to_replace=geld_labels, value=geld_scores)

# Cast the three score columns to integers; this raises if a column still holds a value
# that cannot be converted to int
for score_column in ('Kans I', 'Tijd I', 'Geld I'):
    combined_data[score_column] = combined_data[score_column].astype(int)

In [None]:
# Creating dimensions: join the free-text columns, separated by single spaces, into one
# 'text_variables' string per row
text_columns = ['Naam', 'Omschrijving', 'Oorzaak', 'Gevolg', 'Maatregel', 'Type Maatregel']
joined_text = combined_data[text_columns[0]]
for text_column in text_columns[1:]:
    joined_text = joined_text + ' ' + combined_data[text_column]
combined_data['text_variables'] = joined_text

# Checking for NaNs in the joined text
print(combined_data['text_variables'].isna().sum())


In [None]:
# Remove duplicates for dimension 1: keep only the first row for every distinct text
is_repeated_text = combined_data['text_variables'].duplicated()
combined_data = combined_data[~is_repeated_text]

In [None]:
# Show the distinct project labels that are present in the combined data
combined_data['Project'].unique()

# Vectorisation

# Loading the updated data

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.sparse import hstack
from joblib import parallel_backend

from nltk.corpus import stopwords
import nltk
#nltk.download('stopwords')
from pattern.nl import lemma
#nltk.download('punkt')


import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Load the prepared input sheet. The relative path uses a forward slash so that it resolves
# on Windows, macOS and Linux alike (the former "\\" separator only worked on Windows).
combined_data = pd.read_excel("Info sheets/input_sub_category_final.xlsx")

In [None]:
# Remove duplicates for dimension 1: keep only the first row for every distinct text
combined_data = combined_data[~combined_data['text_variables'].duplicated()]

# Exclude the rows whose 'Risico of Kans' is 'Kans'
# NOTE(review): the earlier comment also mentioned excluding status 'Concept', but no such
# filter is applied here -- confirm whether it is still needed.
combined_data = combined_data[combined_data['Risico of Kans'] != 'Kans'].reset_index(drop=True)

# Select the wanted columns
combined_data = combined_data[['Naam', 'Omschrijving', 'Oorzaak', 'Gevolg', 'Maatregel', 'Type Maatregel', 'Categorie', 'Subcategorie', 'Allocatie', 'Project', 'text_variables']]

# Keep only the rows with fewer than 5 missing values, i.e. drop rows with 5 or more
# NOTE(review): the earlier comment said "more than 5"; the threshold in the code is kept as is.
combined_data = combined_data[combined_data.isna().sum(axis=1) < 5].reset_index(drop=True)

# Turn Contract into Juridisch. Assigned back rather than replaced in place on the selected
# column, which is deprecated in recent pandas and ineffective under copy-on-write.
combined_data['Categorie'] = combined_data['Categorie'].replace(to_replace='Contract', value='Juridisch')

# Replace the question-mark markers in Subcategorie by the text 'None': first '? ' (with the
# trailing space), then any remaining '?'. `regex=False` makes the literal match explicit;
# as a regular expression a leading '?' is invalid, and the default differs between pandas
# versions.
# NOTE(review): the markers are substituted inside the value, so '? X' becomes 'NoneX' --
# confirm this is the intended result for values that contain more than the marker.
combined_data['Subcategorie'] = (
    combined_data['Subcategorie']
    .str.replace('? ', 'None', regex=False)
    .str.replace('?', 'None', regex=False)
)

# Turn the remaining NAs into the text 'None' in every column
combined_data = combined_data.fillna('None')


In [None]:
# Number of rows whose 'Gevolg' holds the 'None' placeholder
combined_data['Gevolg'].eq('None').sum()

In [None]:
# Share of 'None' values in each column, printed as a fraction between 0 and 1
n_rows = len(combined_data)
for column in combined_data.columns:
    none_count = combined_data[column].eq('None').sum()
    print(column, none_count/n_rows)



In [None]:
# Two bar charts side by side: observations per allocation (left) and per category (right)
fig, (ax_allocation, ax_category) = plt.subplots(1, 2, figsize=(20, 10))

# Plot for Allocation
combined_data['Allocatie'].value_counts().plot(kind='bar', fontsize=15, ax=ax_allocation)
ax_allocation.set_xlabel('Allocation', fontsize=15)
ax_allocation.set_ylabel('Number of observations', fontsize=17)

# Plot for Category; the slice leaves out the last, i.e. least frequent, category
# NOTE(review): presumably meant to hide the 'None' placeholder -- confirm it is really the rarest value
category_counts_shown = combined_data['Categorie'].value_counts().iloc[:-1]
category_counts_shown.plot(kind='bar', fontsize=15, ax=ax_category)
ax_category.set_xlabel('Category', fontsize=17)

# Adding (a) and (b) titles below the graphs
fig.text(0.25, 0.01, '(a) Observations by allocation', ha='center', fontsize=20)
fig.text(0.75, 0.01, '(b) Observations by category', ha='center', fontsize=20)

# Leave the bottom 3% of the figure free for the captions
fig.tight_layout(rect=[0, 0.03, 1, 1])
plt.show()



In [None]:
# Percentage of all observations that falls in the categories Technisch and Financieel.
# The message now names the category that is actually counted; both lines used to be
# labelled "category None".
category_sizes = combined_data['Categorie'].value_counts()
for category in ('Technisch', 'Financieel'):
    share = (category_sizes[category] / len(combined_data)) * 100
    print(f"Percentage of observations in category {category}: {round(share, 2)}%")

In [None]:
# Number of distinct subcategories per category (one row per category, sorted by category)
subcat_counts_per_category = (
    combined_data.groupby('Categorie')['Subcategorie']
    .nunique()
    .reset_index(name='subcat_counts')
)
# Number of observations for every (category, subcategory) pair
category_counts = (
    combined_data.groupby(['Categorie', 'Subcategorie'])
    .size()
    .reset_index(name='counts')
)

# Wide layout: one row per category, one column per subcategory, 0 where a pair does not occur
pivot_table = category_counts.pivot(index='Categorie', columns='Subcategorie', values='counts').fillna(0)

# Stacked bar chart, one bar per category, without a legend
fig, ax = plt.subplots(figsize=(12, 8))
pivot_table.plot(kind='bar', stacked=True, ax=ax, colormap='tab20', legend=False)

# Annotate every bar with its number of subcategories, 1 unit above the top of the stack.
# The row number of subcat_counts_per_category doubles as the bar position.
bar_totals = pivot_table.sum(axis=1)
for bar_position, categorie, n_subcats in subcat_counts_per_category.itertuples(name=None):
    ax.text(bar_position, bar_totals.loc[categorie] + 1, f"{n_subcats} subcategories",
            ha='center', va='bottom', fontsize=10)

# Axis labels and title
ax.set(
    xlabel='Categorie',
    ylabel='Number of Observations',
    title='Number of Observations for Each Category and Subcategory (Ordered by Frequency)',
)

# Show the plot
plt.tight_layout()
plt.show()

In [None]:
# Number of subcategories per category, shown as a table

# Count the distinct 'Subcategorie' values within each 'Categorie'; the count is placed in a column named 'subcat_counts'
combined_data.groupby('Categorie')['Subcategorie'].nunique().reset_index(name='subcat_counts')


In [None]:
# Number of observations per subcategory (value_counts lists the most frequent first)
combined_data['Subcategorie'].value_counts()

## Vectorising

Split according to categories

In [None]:
# List the distinct category labels; the data is split on these values in the next step
combined_data['Categorie'].unique()

Maybe one hot encode on sub category and allocation

In [None]:
# Splitting the dataset according to category: one subset per category, plus `unknown_data`
# for the rows whose category is the 'None' placeholder
categorie = combined_data['Categorie']
Maatschappelijk_data = combined_data.loc[categorie.eq('Maatschappelijk')]
Organisatorisch_data = combined_data.loc[categorie.eq('Organisatorisch')]
Politiek_data = combined_data.loc[categorie.eq('Politiek')]
Ruimtelijk_data = combined_data.loc[categorie.eq('Ruimtelijk')]
Technisch_data = combined_data.loc[categorie.eq('Technisch')]
Financieel_data = combined_data.loc[categorie.eq('Financieel')]
Juridisch_data = combined_data.loc[categorie.eq('Juridisch')]
unknown_data = combined_data.loc[categorie.eq('None')]


In [None]:
def add_one_hot_columns(frame):
    """Return `frame` with 0/1 dummy columns for 'Subcategorie' and 'Allocatie' appended.

    The 'Subcategorie_*' columns come first, followed by the 'Allocatie_*' columns,
    all placed after the existing columns. Dummy columns left over from an earlier
    execution (names starting with 'Subcategorie_' or 'Allocatie_') are dropped
    first, so running the cell again does not append the same indicators twice.
    """
    dummy_prefixes = ('Subcategorie_', 'Allocatie_')
    base = frame[[col for col in frame.columns if not str(col).startswith(dummy_prefixes)]]
    subcategorie_dummies = pd.get_dummies(base['Subcategorie'], prefix='Subcategorie').astype(int)
    allocatie_dummies = pd.get_dummies(base['Allocatie'], prefix='Allocatie').astype(int)
    return pd.concat([base, subcategorie_dummies, allocatie_dummies], axis=1)


# One-hot encode the Subcategorie and Allocatie columns of every category subset.
# The dummies are derived per subset, so each subset only gets columns for the values it contains.
Maatschappelijk_data = add_one_hot_columns(Maatschappelijk_data)
Organisatorisch_data = add_one_hot_columns(Organisatorisch_data)
Politiek_data = add_one_hot_columns(Politiek_data)
Ruimtelijk_data = add_one_hot_columns(Ruimtelijk_data)
Technisch_data = add_one_hot_columns(Technisch_data)
Financieel_data = add_one_hot_columns(Financieel_data)
Juridisch_data = add_one_hot_columns(Juridisch_data)
unknown_data = add_one_hot_columns(unknown_data)

In [None]:
# Inspect the columns from position 11 onward: combined_data was reduced to 11 columns
# before the split, so these are the dummy columns appended by the one-hot encoding
Maatschappelijk_data.iloc[:,11:]

Setting tokenizer and hyperparameters

In [None]:
from joblib import parallel_backend
# Dutch stop words from NLTK, kept in a set for fast membership tests
stop_words = set(stopwords.words('dutch'))

def tokenize_and_lemmatize_dutch(text):
    """Turn a Dutch text into the list of tokens used by the TF-IDF vectoriser.

    '&' is replaced by 'en', the text is split with NLTK's word tokenizer and
    lower-cased, stop words and tokens that are not purely alphabetic are
    discarded, and every remaining token is passed through pattern.nl's `lemma`.
    """
    lowered = (raw.lower() for raw in nltk.word_tokenize(text.replace('&', 'en')))
    return [lemma(word) for word in lowered if word not in stop_words and word.isalpha()]

# Search space for the TF-IDF vectoriser; the clustering cells try every combination
hyperparameters_grid = dict(
    max_df=np.linspace(0.02, 0.15, 5),    # 5 evenly spaced document-frequency upper bounds
    min_df=np.linspace(0.0, 0.015, 3),    # 3 evenly spaced document-frequency lower bounds
    ngram_range=[(1, 1), (1, 2), (2, 2)],
    norm=['l2', 'l1', None],
    sublinear_tf=[True, False],
)



#### K means for every category

In [None]:
from sklearn.decomposition import PCA
# Project the combined feature matrix onto its first two principal components before clustering.
# The seed (same value as the KMeans runs) keeps the projection reproducible when scikit-learn
# selects its randomized SVD solver for large inputs.
pca = PCA(n_components=2, random_state=7)

Maatschappelijk

In [None]:
# The dummy columns (position 11 onward) as a NumPy array -- the form in which they are
# stacked next to the TF-IDF matrix in the clustering cells
Maatschappelijk_data.iloc[:,11:].values

In [None]:
from itertools import product
from joblib import parallel_backend

# Work on a copy: as a plain alias, adding 'Cluster_Labels' below would also add the column
# to Maatschappelijk_data, and a re-run of this cell would then feed the previous labels
# back in as a feature through `.iloc[:, 11:]`.
knn_pca_maatschappelijk = Maatschappelijk_data.copy()

# Silhouette scores lie in [-1, 1]; starting below that range guarantees that the first
# evaluated clustering is recorded. best_k is reset too, so nothing is carried over from
# another category's cell.
best_score = -np.inf
best_hyperparameters = None
best_cluster_labels = None
best_k = None

# The dummy columns do not depend on the TF-IDF settings, so they are extracted once.
dummy_features = Maatschappelijk_data.iloc[:, 11:].values
# silhouette_score accepts at most n_samples - 1 clusters; cap the range for small categories.
k_values = range(10, min(40, len(Maatschappelijk_data)))

# Iterate over each hyperparameter combination (same order as nested loops over the grid)
grid_keys = ['max_df', 'min_df', 'ngram_range', 'norm', 'sublinear_tf']
for combination in product(*(hyperparameters_grid[key] for key in grid_keys)):
    tfidf_params = dict(zip(grid_keys, combination))

    # Fit and transform the text data
    tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_and_lemmatize_dutch, **tfidf_params)
    tfidf_matrix = tfidf_vectorizer.fit_transform(Maatschappelijk_data['text_variables'])

    # Combine the TF-IDF matrix with the dummy variables created for subcategory and allocation
    combined_matrix = hstack([tfidf_matrix, dummy_features])
    # Apply PCA (requires a dense array)
    clustering_matrix = pca.fit_transform(combined_matrix.toarray())

    # Perform KMeans clustering for every candidate number of clusters
    with parallel_backend('threading', n_jobs=-2):
        for k in k_values:
            kmeans = KMeans(n_clusters=k, random_state=7, n_init=10, max_iter=100)
            cluster_labels = kmeans.fit_predict(clustering_matrix)
            silhouette_avg = silhouette_score(clustering_matrix, cluster_labels)

            # Keep the best-scoring configuration seen so far
            if silhouette_avg > best_score:
                best_score = silhouette_avg
                best_hyperparameters = tfidf_params
                best_cluster_labels = cluster_labels
                best_k = k

if best_cluster_labels is None:
    raise ValueError("No clustering could be evaluated for Maatschappelijk: at least 11 observations are required.")

# Print the best hyperparameters and clustering score
print(f"Best hyperparameters: {best_hyperparameters} and best number of clusters: {best_k}")
print(f"Best silhouette score: {best_score}")

# Add the best cluster labels to the DataFrame
knn_pca_maatschappelijk['Cluster_Labels'] = best_cluster_labels

# Print the number of clusters and number of elements assigned to each cluster
cluster_size_counts = pd.Series(best_cluster_labels).value_counts()
cluster_ids, cluster_sizes = cluster_size_counts.index, cluster_size_counts.values
print(f"Number of clusters: {len(cluster_ids)} and number of elements assigned to each cluster: {cluster_sizes}")


Politiek

In [None]:
from itertools import product

# Work on a copy: as a plain alias, adding 'Cluster_Labels' below would also add the column
# to Politiek_data, and a re-run of this cell would then feed the previous labels back in
# as a feature through `.iloc[:, 11:]`.
knn_pca_politiek = Politiek_data.copy()

# Silhouette scores lie in [-1, 1]; starting below that range guarantees that the first
# evaluated clustering is recorded. best_k is reset too, so nothing is carried over from
# another category's cell.
best_score = -np.inf
best_hyperparameters = None
best_cluster_labels = None
best_k = None

# The dummy columns do not depend on the TF-IDF settings, so they are extracted once.
dummy_features = Politiek_data.iloc[:, 11:].values
# silhouette_score accepts at most n_samples - 1 clusters; cap the range for small categories.
k_values = range(10, min(40, len(Politiek_data)))

# Iterate over each hyperparameter combination (same order as nested loops over the grid)
grid_keys = ['max_df', 'min_df', 'ngram_range', 'norm', 'sublinear_tf']
for combination in product(*(hyperparameters_grid[key] for key in grid_keys)):
    tfidf_params = dict(zip(grid_keys, combination))

    # Fit and transform the text data
    tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_and_lemmatize_dutch, **tfidf_params)
    tfidf_matrix = tfidf_vectorizer.fit_transform(Politiek_data['text_variables'])

    # Combine the TF-IDF matrix with the dummy variables created for subcategory and allocation
    combined_matrix = hstack([tfidf_matrix, dummy_features])
    # Apply PCA (requires a dense array)
    clustering_matrix = pca.fit_transform(combined_matrix.toarray())

    # Perform KMeans clustering for every candidate number of clusters
    with parallel_backend('threading', n_jobs=-2):
        for k in k_values:
            kmeans = KMeans(n_clusters=k, random_state=7, n_init=10, max_iter=100)
            cluster_labels = kmeans.fit_predict(clustering_matrix)
            silhouette_avg = silhouette_score(clustering_matrix, cluster_labels)

            # Keep the best-scoring configuration seen so far
            if silhouette_avg > best_score:
                best_score = silhouette_avg
                best_hyperparameters = tfidf_params
                best_cluster_labels = cluster_labels
                best_k = k

if best_cluster_labels is None:
    raise ValueError("No clustering could be evaluated for Politiek: at least 11 observations are required.")

# Print the best hyperparameters and clustering score
print(f"Best hyperparameters: {best_hyperparameters} and best number of clusters: {best_k}")
print(f"Best silhouette score: {best_score}")

# Add the best cluster labels to the DataFrame
knn_pca_politiek['Cluster_Labels'] = best_cluster_labels

# Print the number of clusters and number of elements assigned to each cluster
cluster_size_counts = pd.Series(best_cluster_labels).value_counts()
cluster_ids, cluster_sizes = cluster_size_counts.index, cluster_size_counts.values
print(f"Number of clusters: {len(cluster_ids)} and number of elements assigned to each cluster: {cluster_sizes}")


Ruimtelijk

In [None]:
from itertools import product

# Work on a copy: as a plain alias, adding 'Cluster_Labels' below would also add the column
# to Ruimtelijk_data, and a re-run of this cell would then feed the previous labels back in
# as a feature through `.iloc[:, 11:]`.
knn_pca_ruimtelijk = Ruimtelijk_data.copy()

# Silhouette scores lie in [-1, 1]; starting below that range guarantees that the first
# evaluated clustering is recorded. best_k is reset too, so nothing is carried over from
# another category's cell.
best_score = -np.inf
best_hyperparameters = None
best_cluster_labels = None
best_k = None

# The dummy columns do not depend on the TF-IDF settings, so they are extracted once.
dummy_features = Ruimtelijk_data.iloc[:, 11:].values
# silhouette_score accepts at most n_samples - 1 clusters; cap the range for small categories.
k_values = range(10, min(40, len(Ruimtelijk_data)))

# Iterate over each hyperparameter combination (same order as nested loops over the grid)
grid_keys = ['max_df', 'min_df', 'ngram_range', 'norm', 'sublinear_tf']
for combination in product(*(hyperparameters_grid[key] for key in grid_keys)):
    tfidf_params = dict(zip(grid_keys, combination))

    # Fit and transform the text data
    tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_and_lemmatize_dutch, **tfidf_params)
    tfidf_matrix = tfidf_vectorizer.fit_transform(Ruimtelijk_data['text_variables'])

    # Combine the TF-IDF matrix with the dummy variables created for subcategory and allocation
    combined_matrix = hstack([tfidf_matrix, dummy_features])
    # Apply PCA (requires a dense array)
    clustering_matrix = pca.fit_transform(combined_matrix.toarray())

    # Perform KMeans clustering for every candidate number of clusters
    with parallel_backend('threading', n_jobs=-2):
        for k in k_values:
            kmeans = KMeans(n_clusters=k, random_state=7, n_init=10, max_iter=100)
            cluster_labels = kmeans.fit_predict(clustering_matrix)
            silhouette_avg = silhouette_score(clustering_matrix, cluster_labels)

            # Keep the best-scoring configuration seen so far
            if silhouette_avg > best_score:
                best_score = silhouette_avg
                best_hyperparameters = tfidf_params
                best_cluster_labels = cluster_labels
                best_k = k

if best_cluster_labels is None:
    raise ValueError("No clustering could be evaluated for Ruimtelijk: at least 11 observations are required.")

# Print the best hyperparameters and clustering score
print(f"Best hyperparameters: {best_hyperparameters} and best number of clusters: {best_k}")
print(f"Best silhouette score: {best_score}")

# Add the best cluster labels to the DataFrame
knn_pca_ruimtelijk['Cluster_Labels'] = best_cluster_labels

# Print the number of clusters and number of elements assigned to each cluster
cluster_size_counts = pd.Series(best_cluster_labels).value_counts()
cluster_ids, cluster_sizes = cluster_size_counts.index, cluster_size_counts.values
print(f"Number of clusters: {len(cluster_ids)} and number of elements assigned to each cluster: {cluster_sizes}")


Organisatorisch

In [None]:
from itertools import product

# Work on a copy: as a plain alias, adding 'Cluster_Labels' below would also add the column
# to Organisatorisch_data, and a re-run of this cell would then feed the previous labels
# back in as a feature through `.iloc[:, 11:]`.
knn_pca_organisatorisch = Organisatorisch_data.copy()

# Silhouette scores lie in [-1, 1]; starting below that range guarantees that the first
# evaluated clustering is recorded. best_k is reset too, so nothing is carried over from
# another category's cell.
best_score = -np.inf
best_hyperparameters = None
best_cluster_labels = None
best_k = None

# The dummy columns do not depend on the TF-IDF settings, so they are extracted once.
dummy_features = Organisatorisch_data.iloc[:, 11:].values
# silhouette_score accepts at most n_samples - 1 clusters; cap the range for small categories.
k_values = range(10, min(40, len(Organisatorisch_data)))

# Iterate over each hyperparameter combination (same order as nested loops over the grid)
grid_keys = ['max_df', 'min_df', 'ngram_range', 'norm', 'sublinear_tf']
for combination in product(*(hyperparameters_grid[key] for key in grid_keys)):
    tfidf_params = dict(zip(grid_keys, combination))

    # Fit and transform the text data
    tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_and_lemmatize_dutch, **tfidf_params)
    tfidf_matrix = tfidf_vectorizer.fit_transform(Organisatorisch_data['text_variables'])

    # Combine the TF-IDF matrix with the dummy variables created for subcategory and allocation
    combined_matrix = hstack([tfidf_matrix, dummy_features])
    # Apply PCA (requires a dense array)
    clustering_matrix = pca.fit_transform(combined_matrix.toarray())

    # Perform KMeans clustering for every candidate number of clusters
    with parallel_backend('threading', n_jobs=-2):
        for k in k_values:
            kmeans = KMeans(n_clusters=k, random_state=7, n_init=10, max_iter=100)
            cluster_labels = kmeans.fit_predict(clustering_matrix)
            silhouette_avg = silhouette_score(clustering_matrix, cluster_labels)

            # Keep the best-scoring configuration seen so far
            if silhouette_avg > best_score:
                best_score = silhouette_avg
                best_hyperparameters = tfidf_params
                best_cluster_labels = cluster_labels
                best_k = k

if best_cluster_labels is None:
    raise ValueError("No clustering could be evaluated for Organisatorisch: at least 11 observations are required.")

# Print the best hyperparameters and clustering score
print(f"Best hyperparameters: {best_hyperparameters} and best number of clusters: {best_k}")
print(f"Best silhouette score: {best_score}")

# Add the best cluster labels to the DataFrame
knn_pca_organisatorisch['Cluster_Labels'] = best_cluster_labels

# Print the number of clusters and number of elements assigned to each cluster
cluster_size_counts = pd.Series(best_cluster_labels).value_counts()
cluster_ids, cluster_sizes = cluster_size_counts.index, cluster_size_counts.values
print(f"Number of clusters: {len(cluster_ids)} and number of elements assigned to each cluster: {cluster_sizes}")


Juridisch

In [None]:
# Grid-search the TF-IDF settings and the number of KMeans clusters for the
# Juridisch risks. Every candidate is scored with the silhouette coefficient on
# the PCA-reduced feature matrix and the best-scoring labelling is kept.
#
# NOTE: this is an alias, not a copy -- the 'Cluster_Labels' column written at the
# end of this cell is therefore also added to Juridisch_data.
knn_pca_juridisch = Juridisch_data

# Reset every "best so far" tracker so nothing leaks in from a previously run cell.
# Silhouette scores lie in [-1, 1]; starting from 0 would reject every candidate
# whenever all scores are negative and leave the labels as None.
best_score = -1
best_hyperparameters = None
best_cluster_labels = None
best_k = None

# Dummy variables created from category and allocatie (columns 11 onwards).
# They do not depend on the TF-IDF settings, so they are extracted once.
# 'Cluster_Labels' is dropped if present: an earlier run of this cell adds it to
# the very frame being sliced, and old labels must never be fed back in as features.
dummy_features = Juridisch_data.iloc[:, 11:].drop(columns=['Cluster_Labels'], errors='ignore').values

# All TF-IDF hyperparameter combinations, in the same order nested loops would visit them
param_combinations = (
    (max_df, min_df, ngram_range, norm, sublinear_tf)
    for max_df in hyperparameters_grid['max_df']
    for min_df in hyperparameters_grid['min_df']
    for ngram_range in hyperparameters_grid['ngram_range']
    for norm in hyperparameters_grid['norm']
    for sublinear_tf in hyperparameters_grid['sublinear_tf']
)

for max_df, min_df, ngram_range, norm, sublinear_tf in param_combinations:
    # Vectorise the free-text column with the current TF-IDF settings
    tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_and_lemmatize_dutch, ngram_range=ngram_range, max_df=max_df, min_df=min_df, norm=norm, sublinear_tf=sublinear_tf)
    tfidf_matrix = tfidf_vectorizer.fit_transform(Juridisch_data['text_variables'])

    # Combine the TF-IDF matrix with the dummy variables, then reduce with PCA
    # (the sparse matrix is densified before it is handed to `pca`)
    combined_matrix = hstack([tfidf_matrix, dummy_features])
    clustering_matrix = pca.fit_transform(combined_matrix.toarray())

    # Run the cluster-count sweep under joblib's threading backend
    with parallel_backend('threading', n_jobs=-2):
        for k in range(10, 40):
            kmeans = KMeans(n_clusters=k, random_state=7, n_init=10, max_iter=100)
            cluster_labels = kmeans.fit_predict(clustering_matrix)
            silhouette_avg = silhouette_score(clustering_matrix, cluster_labels)

            # Keep this candidate if it beats the best silhouette seen so far
            if silhouette_avg > best_score:
                best_score = silhouette_avg
                best_hyperparameters = {'max_df': max_df, 'min_df': min_df, 'ngram_range': ngram_range, 'norm': norm, 'sublinear_tf': sublinear_tf}
                best_cluster_labels = cluster_labels
                best_k = k

# Print the best hyperparameters and clustering score
print(f"Best hyperparameters: {best_hyperparameters} and best number of clusters: {best_k}") 
print(f"Best silhouette score: {best_score}") 

# Add the best cluster labels to the DataFrame
knn_pca_juridisch['Cluster_Labels'] = best_cluster_labels

# Print the number of clusters and number of elements assigned to each cluster
cluster_size_counts = pd.Series(best_cluster_labels).value_counts()
cluster_ids, cluster_sizes = cluster_size_counts.index, cluster_size_counts.values
print(f"Number of clusters: {len(cluster_ids)} and number of elements assigned to each cluster: {cluster_sizes}")


Technisch

In [None]:
# Grid-search the TF-IDF settings and the number of KMeans clusters for the
# Technisch risks. Every candidate is scored with the silhouette coefficient on
# the PCA-reduced feature matrix and the best-scoring labelling is kept.
#
# NOTE: this is an alias, not a copy -- the 'Cluster_Labels' column written at the
# end of this cell is therefore also added to Technisch_data.
knn_pca_technisch = Technisch_data

# Reset every "best so far" tracker so nothing leaks in from a previously run cell.
# Silhouette scores lie in [-1, 1]; starting from 0 would reject every candidate
# whenever all scores are negative and leave the labels as None.
best_score = -1
best_hyperparameters = None
best_cluster_labels = None
best_k = None

# Dummy variables created from category and allocatie (columns 11 onwards).
# They do not depend on the TF-IDF settings, so they are extracted once.
# 'Cluster_Labels' is dropped if present: an earlier run of this cell adds it to
# the very frame being sliced, and old labels must never be fed back in as features.
dummy_features = Technisch_data.iloc[:, 11:].drop(columns=['Cluster_Labels'], errors='ignore').values

# All TF-IDF hyperparameter combinations, in the same order nested loops would visit them
param_combinations = (
    (max_df, min_df, ngram_range, norm, sublinear_tf)
    for max_df in hyperparameters_grid['max_df']
    for min_df in hyperparameters_grid['min_df']
    for ngram_range in hyperparameters_grid['ngram_range']
    for norm in hyperparameters_grid['norm']
    for sublinear_tf in hyperparameters_grid['sublinear_tf']
)

for max_df, min_df, ngram_range, norm, sublinear_tf in param_combinations:
    # Vectorise the free-text column with the current TF-IDF settings
    tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_and_lemmatize_dutch, ngram_range=ngram_range, max_df=max_df, min_df=min_df, norm=norm, sublinear_tf=sublinear_tf)
    tfidf_matrix = tfidf_vectorizer.fit_transform(Technisch_data['text_variables'])

    # Combine the TF-IDF matrix with the dummy variables, then reduce with PCA
    # (the sparse matrix is densified before it is handed to `pca`)
    combined_matrix = hstack([tfidf_matrix, dummy_features])
    clustering_matrix = pca.fit_transform(combined_matrix.toarray())

    # Run the cluster-count sweep under joblib's threading backend
    with parallel_backend('threading', n_jobs=-2):
        for k in range(10, 40):
            kmeans = KMeans(n_clusters=k, random_state=7, n_init=10, max_iter=100)
            cluster_labels = kmeans.fit_predict(clustering_matrix)
            silhouette_avg = silhouette_score(clustering_matrix, cluster_labels)

            # Keep this candidate if it beats the best silhouette seen so far
            if silhouette_avg > best_score:
                best_score = silhouette_avg
                best_hyperparameters = {'max_df': max_df, 'min_df': min_df, 'ngram_range': ngram_range, 'norm': norm, 'sublinear_tf': sublinear_tf}
                best_cluster_labels = cluster_labels
                best_k = k

# Print the best hyperparameters and clustering score
print(f"Best hyperparameters: {best_hyperparameters} and best number of clusters: {best_k}") 
print(f"Best silhouette score: {best_score}") 

# Add the best cluster labels to the DataFrame
knn_pca_technisch['Cluster_Labels'] = best_cluster_labels

# Print the number of clusters and number of elements assigned to each cluster
cluster_size_counts = pd.Series(best_cluster_labels).value_counts()
cluster_ids, cluster_sizes = cluster_size_counts.index, cluster_size_counts.values
print(f"Number of clusters: {len(cluster_ids)} and number of elements assigned to each cluster: {cluster_sizes}")


Financieel

In [None]:
# Grid-search the TF-IDF settings and the number of KMeans clusters for the
# Financieel risks. Every candidate is scored with the silhouette coefficient on
# the PCA-reduced feature matrix and the best-scoring labelling is kept.
#
# NOTE: this is an alias, not a copy -- the 'Cluster_Labels' column written at the
# end of this cell is therefore also added to Financieel_data.
knn_pca_financieel = Financieel_data

# Reset every "best so far" tracker so nothing leaks in from a previously run cell.
# Silhouette scores lie in [-1, 1]; starting from 0 would reject every candidate
# whenever all scores are negative and leave the labels as None.
best_score = -1
best_hyperparameters = None
best_cluster_labels = None
best_k = None

# Dummy variables created from category and allocatie (columns 11 onwards).
# They do not depend on the TF-IDF settings, so they are extracted once.
# 'Cluster_Labels' is dropped if present: an earlier run of this cell adds it to
# the very frame being sliced, and old labels must never be fed back in as features.
dummy_features = Financieel_data.iloc[:, 11:].drop(columns=['Cluster_Labels'], errors='ignore').values

# All TF-IDF hyperparameter combinations, in the same order nested loops would visit them
param_combinations = (
    (max_df, min_df, ngram_range, norm, sublinear_tf)
    for max_df in hyperparameters_grid['max_df']
    for min_df in hyperparameters_grid['min_df']
    for ngram_range in hyperparameters_grid['ngram_range']
    for norm in hyperparameters_grid['norm']
    for sublinear_tf in hyperparameters_grid['sublinear_tf']
)

for max_df, min_df, ngram_range, norm, sublinear_tf in param_combinations:
    # Vectorise the free-text column with the current TF-IDF settings
    tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_and_lemmatize_dutch, ngram_range=ngram_range, max_df=max_df, min_df=min_df, norm=norm, sublinear_tf=sublinear_tf)
    tfidf_matrix = tfidf_vectorizer.fit_transform(Financieel_data['text_variables'])

    # Combine the TF-IDF matrix with the dummy variables, then reduce with PCA
    # (the sparse matrix is densified before it is handed to `pca`)
    combined_matrix = hstack([tfidf_matrix, dummy_features])
    clustering_matrix = pca.fit_transform(combined_matrix.toarray())

    # Run the cluster-count sweep under joblib's threading backend
    with parallel_backend('threading', n_jobs=-2):
        for k in range(10, 40):
            kmeans = KMeans(n_clusters=k, random_state=7, n_init=10, max_iter=100)
            cluster_labels = kmeans.fit_predict(clustering_matrix)
            silhouette_avg = silhouette_score(clustering_matrix, cluster_labels)

            # Keep this candidate if it beats the best silhouette seen so far
            if silhouette_avg > best_score:
                best_score = silhouette_avg
                best_hyperparameters = {'max_df': max_df, 'min_df': min_df, 'ngram_range': ngram_range, 'norm': norm, 'sublinear_tf': sublinear_tf}
                best_cluster_labels = cluster_labels
                best_k = k

# Print the best hyperparameters and clustering score
print(f"Best hyperparameters: {best_hyperparameters} and best number of clusters: {best_k}") 
print(f"Best silhouette score: {best_score}") 

# Add the best cluster labels to the DataFrame
knn_pca_financieel['Cluster_Labels'] = best_cluster_labels

# Print the number of clusters and number of elements assigned to each cluster
cluster_size_counts = pd.Series(best_cluster_labels).value_counts()
cluster_ids, cluster_sizes = cluster_size_counts.index, cluster_size_counts.values
print(f"Number of clusters: {len(cluster_ids)} and number of elements assigned to each cluster: {cluster_sizes}")


Unknown

In [None]:
# Grid-search the TF-IDF settings and the number of KMeans clusters for the
# risks in unknown_data. Every candidate is scored with the silhouette coefficient
# on the PCA-reduced feature matrix and the best-scoring labelling is kept.
#
# NOTE: this is an alias, not a copy -- the 'Cluster_Labels' column written at the
# end of this cell is therefore also added to unknown_data.
knn_pca_unknown = unknown_data

# Reset every "best so far" tracker so nothing leaks in from a previously run cell.
# Silhouette scores lie in [-1, 1]; starting from 0 would reject every candidate
# whenever all scores are negative and leave the labels as None.
best_score = -1
best_hyperparameters = None
best_cluster_labels = None
best_k = None

# Dummy variables created from category and allocatie (columns 11 onwards).
# They do not depend on the TF-IDF settings, so they are extracted once.
# 'Cluster_Labels' is dropped if present: an earlier run of this cell adds it to
# the very frame being sliced, and old labels must never be fed back in as features.
dummy_features = unknown_data.iloc[:, 11:].drop(columns=['Cluster_Labels'], errors='ignore').values

# All TF-IDF hyperparameter combinations, in the same order nested loops would visit them
param_combinations = (
    (max_df, min_df, ngram_range, norm, sublinear_tf)
    for max_df in hyperparameters_grid['max_df']
    for min_df in hyperparameters_grid['min_df']
    for ngram_range in hyperparameters_grid['ngram_range']
    for norm in hyperparameters_grid['norm']
    for sublinear_tf in hyperparameters_grid['sublinear_tf']
)

for max_df, min_df, ngram_range, norm, sublinear_tf in param_combinations:
    # Vectorise the free-text column with the current TF-IDF settings
    tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_and_lemmatize_dutch, ngram_range=ngram_range, max_df=max_df, min_df=min_df, norm=norm, sublinear_tf=sublinear_tf)
    tfidf_matrix = tfidf_vectorizer.fit_transform(unknown_data['text_variables'])

    # Combine the TF-IDF matrix with the dummy variables, then reduce with PCA
    # (the sparse matrix is densified before it is handed to `pca`)
    combined_matrix = hstack([tfidf_matrix, dummy_features])
    clustering_matrix = pca.fit_transform(combined_matrix.toarray())

    # Run the cluster-count sweep under joblib's threading backend
    with parallel_backend('threading', n_jobs=-2):
        for k in range(10, 40):
            kmeans = KMeans(n_clusters=k, random_state=7, n_init=10, max_iter=100)
            cluster_labels = kmeans.fit_predict(clustering_matrix)
            silhouette_avg = silhouette_score(clustering_matrix, cluster_labels)

            # Keep this candidate if it beats the best silhouette seen so far
            if silhouette_avg > best_score:
                best_score = silhouette_avg
                best_hyperparameters = {'max_df': max_df, 'min_df': min_df, 'ngram_range': ngram_range, 'norm': norm, 'sublinear_tf': sublinear_tf}
                best_cluster_labels = cluster_labels
                best_k = k

# Print the best hyperparameters and clustering score
print(f"Best hyperparameters: {best_hyperparameters} and best number of clusters: {best_k}") 
print(f"Best silhouette score: {best_score}") 

# Add the best cluster labels to the DataFrame
knn_pca_unknown['Cluster_Labels'] = best_cluster_labels

# Print the number of clusters and number of elements assigned to each cluster
cluster_size_counts = pd.Series(best_cluster_labels).value_counts()
cluster_ids, cluster_sizes = cluster_size_counts.index, cluster_size_counts.values
print(f"Number of clusters: {len(cluster_ids)} and number of elements assigned to each cluster: {cluster_sizes}")


In [None]:
# Give every category's KMeans labels a category prefix so they stay unique once
# all frames are stacked, then stack the frames vertically into `knn_all`.
#
# The prefix is written back into each frame in place (as before), but only onto
# labels that do not carry it yet: re-running this cell therefore no longer
# produces doubled prefixes such as 'Juridisch_Juridisch_3'.
knn_frames_by_prefix = {
    'Maatschappelijk_': knn_pca_maatschappelijk,
    'Politiek_': knn_pca_politiek,
    'Ruimtelijk_': knn_pca_ruimtelijk,
    'Organisatorisch_': knn_pca_organisatorisch,
    'Juridisch_': knn_pca_juridisch,
    'Technisch_': knn_pca_technisch,
    'Financieel_': knn_pca_financieel,
    'Unknown_': knn_pca_unknown,
}

for label_prefix, category_frame in knn_frames_by_prefix.items():
    current_labels = category_frame['Cluster_Labels'].astype(str)
    # Keep labels that are already prefixed; prefix all the others
    category_frame['Cluster_Labels'] = current_labels.where(
        current_labels.str.startswith(label_prefix),
        label_prefix + current_labels,
    )

# Vertically stack all of them (dict insertion order preserves the original frame order)
knn_all = pd.concat(list(knn_frames_by_prefix.values()), axis=0)

In [None]:
# Save the stacked KMeans results to Excel without the index column.
# A forward slash is used as the path separator: it is accepted on Windows as
# well, whereas a backslash is treated as part of the file name on Linux/macOS.
knn_all.to_excel('Info sheets/knn_all.xlsx', index=False)

# Trying DBSCAN

In [None]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

from sklearn.decomposition import PCA

# NLTK's Dutch stop-word list, held in a set for fast membership tests
stop_words = set(stopwords.words('dutch'))

def tokenize_and_lemmatize_dutch(text):
    """Turn a Dutch text into a list of lemmatized tokens.

    Steps: replace '&' with 'en', tokenize with NLTK, lowercase every token,
    drop stop words and tokens that are not purely alphabetic, and pass each
    remaining token through `lemma` (imported from pattern.nl).

    Args:
        text: The string to tokenize.

    Returns:
        The list of lemmatized tokens, in their original order.
    """
    lowered_tokens = (raw_token.lower() for raw_token in nltk.word_tokenize(text.replace('&', 'en')))
    return [
        lemma(word)
        for word in lowered_tokens
        if word not in stop_words and word.isalpha()
    ]

# TF-IDF settings explored by the grid searches in the DBSCAN cells
hyperparameters_grid = dict(
    max_df=np.linspace(0.02, 0.15, 5),
    min_df=np.linspace(0.0, 0.015, 3),
    ngram_range=[(1, 1), (1, 2), (2, 2)],
    norm=['l2', 'l1', None],
    sublinear_tf=[True, False],
)

scaler = StandardScaler()

# Candidate DBSCAN parameters: neighbourhood radius (eps) and the
# min_samples value passed to DBSCAN. Adjust the ranges as needed.
eps_range = np.linspace(0.1, 2, 10)
min_samples_range = [1, 2, 3, 5, 7, 10]

# "Best so far" trackers; -1 is the starting silhouette score to beat
best_eps = best_min_samples = best_cluster_labels = None
best_score = -1

Maatschappelijk

In [None]:
# Grid-search the TF-IDF settings together with DBSCAN's eps / min_samples for
# the Maatschappelijk risks. Every candidate is scored with the silhouette
# coefficient on the PCA-reduced feature matrix and the best labelling is kept.
#
# NOTE: this is an alias, not a copy -- the 'DBSCAN_Cluster_Labels' column written
# at the end of this cell is therefore also added to Maatschappelijk_data.
db_pca_maatschappelijk = Maatschappelijk_data

# Reset every "best so far" tracker so nothing leaks in from a previously run cell
best_eps = None
best_min_samples = None
best_score = -1
best_cluster_labels = None
best_hyperparameters = None
best_pca = None

# Dummy variables created from category and allocatie: columns 11 up to (but not
# including) the last two. They do not depend on the TF-IDF settings, so they are
# extracted once. 'DBSCAN_Cluster_Labels' is removed first if an earlier run of
# this cell already appended it, so the positional slice picks the same columns.
# NOTE(review): the slice assumes the last two columns are non-feature columns -- confirm.
dummy_features = (
    Maatschappelijk_data
    .drop(columns=['DBSCAN_Cluster_Labels'], errors='ignore')
    .iloc[:, 11:-2]
    .values
)

# All TF-IDF hyperparameter combinations, in the same order nested loops would visit them
param_combinations = (
    (max_df, min_df, ngram_range, norm, sublinear_tf)
    for max_df in hyperparameters_grid['max_df']
    for min_df in hyperparameters_grid['min_df']
    for ngram_range in hyperparameters_grid['ngram_range']
    for norm in hyperparameters_grid['norm']
    for sublinear_tf in hyperparameters_grid['sublinear_tf']
)

for max_df, min_df, ngram_range, norm, sublinear_tf in param_combinations:
    # Vectorise the free-text column with the current TF-IDF settings
    tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_and_lemmatize_dutch, ngram_range=ngram_range, max_df=max_df, min_df=min_df, norm=norm, sublinear_tf=sublinear_tf)
    tfidf_matrix = tfidf_vectorizer.fit_transform(Maatschappelijk_data['text_variables'])

    # Combine the TF-IDF matrix with the dummy variables
    clustering_matrix = hstack((tfidf_matrix, dummy_features))
    # Reduce dimensionality with PCA; the reduced data is what gets clustered.
    # The sparse matrix is densified first, as in the KMeans cells, because PCA
    # rejects sparse input in older scikit-learn releases.
    reduced_data = pca.fit_transform(clustering_matrix.toarray())

    # Run the eps / min_samples sweep under joblib's threading backend
    with parallel_backend('threading', n_jobs=-2):
        for eps in eps_range:
            for min_samples in min_samples_range:
                dbscan = DBSCAN(eps=eps, min_samples=min_samples)
                cluster_labels = dbscan.fit_predict(reduced_data)

                # silhouette_score needs 2 <= n_labels <= n_samples - 1; skip
                # labellings with a single label or with one label per point.
                # Note: DBSCAN's noise label (-1) is counted as a label here.
                n_labels = len(set(cluster_labels))
                if not 1 < n_labels < len(cluster_labels):
                    continue

                silhouette_avg = silhouette_score(reduced_data, cluster_labels)

                # Keep this candidate if it beats the best silhouette seen so far
                if silhouette_avg > best_score:
                    best_score = silhouette_avg
                    best_eps = eps
                    best_min_samples = min_samples
                    best_cluster_labels = cluster_labels
                    best_hyperparameters = {'max_df': max_df, 'min_df': min_df, 'ngram_range': ngram_range, 'norm': norm, 'sublinear_tf': sublinear_tf}
                    best_pca = reduced_data


# Print the best parameters and clustering score
print(f"Best hyperparameters: {best_hyperparameters}, Best eps: {best_eps}, Best min_samples: {best_min_samples}")
print(f"Best silhouette score: {best_score}")

# Fail with a clear message instead of an opaque TypeError when nothing was scorable
if best_cluster_labels is None:
    raise RuntimeError("No DBSCAN parameter combination produced a clustering that could be scored")

# Add the best cluster labels to the DataFrame
prefix = "Maatschappelijk_"
db_pca_maatschappelijk['DBSCAN_Cluster_Labels'] = [f"{prefix}{label}" for label in best_cluster_labels]

# Print the number of clusters and number of elements assigned to each cluster
unique_labels, counts = np.unique(best_cluster_labels, return_counts=True)
print(f"Number of clusters: {len(unique_labels)} and number of elements assigned to each cluster: {counts}")


Politiek

In [None]:
# Grid-search the TF-IDF settings together with DBSCAN's eps / min_samples for
# the Politiek risks. Every candidate is scored with the silhouette coefficient
# on the PCA-reduced feature matrix and the best labelling is kept.
#
# NOTE: this is an alias, not a copy -- the 'DBSCAN_Cluster_Labels' column written
# at the end of this cell is therefore also added to Politiek_data.
db_pca_politiek = Politiek_data

# Reset every "best so far" tracker so nothing leaks in from a previously run cell
best_eps = None
best_min_samples = None
best_score = -1
best_cluster_labels = None
best_hyperparameters = None
best_pca = None

# Dummy variables created from category and allocatie: columns 11 up to (but not
# including) the last two. They do not depend on the TF-IDF settings, so they are
# extracted once. 'DBSCAN_Cluster_Labels' is removed first if an earlier run of
# this cell already appended it, so the positional slice picks the same columns.
# NOTE(review): the slice assumes the last two columns are non-feature columns -- confirm.
dummy_features = (
    Politiek_data
    .drop(columns=['DBSCAN_Cluster_Labels'], errors='ignore')
    .iloc[:, 11:-2]
    .values
)

# All TF-IDF hyperparameter combinations, in the same order nested loops would visit them
param_combinations = (
    (max_df, min_df, ngram_range, norm, sublinear_tf)
    for max_df in hyperparameters_grid['max_df']
    for min_df in hyperparameters_grid['min_df']
    for ngram_range in hyperparameters_grid['ngram_range']
    for norm in hyperparameters_grid['norm']
    for sublinear_tf in hyperparameters_grid['sublinear_tf']
)

for max_df, min_df, ngram_range, norm, sublinear_tf in param_combinations:
    # Vectorise the free-text column with the current TF-IDF settings
    tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_and_lemmatize_dutch, ngram_range=ngram_range, max_df=max_df, min_df=min_df, norm=norm, sublinear_tf=sublinear_tf)
    tfidf_matrix = tfidf_vectorizer.fit_transform(Politiek_data['text_variables'])

    # Combine the TF-IDF matrix with the dummy variables
    clustering_matrix = hstack((tfidf_matrix, dummy_features))
    # Reduce dimensionality with PCA; the reduced data is what gets clustered.
    # The sparse matrix is densified first, as in the KMeans cells, because PCA
    # rejects sparse input in older scikit-learn releases.
    reduced_data = pca.fit_transform(clustering_matrix.toarray())

    # Run the eps / min_samples sweep under joblib's threading backend
    with parallel_backend('threading', n_jobs=-2):
        for eps in eps_range:
            for min_samples in min_samples_range:
                dbscan = DBSCAN(eps=eps, min_samples=min_samples)
                cluster_labels = dbscan.fit_predict(reduced_data)

                # silhouette_score needs 2 <= n_labels <= n_samples - 1; skip
                # labellings with a single label or with one label per point.
                # Note: DBSCAN's noise label (-1) is counted as a label here.
                n_labels = len(set(cluster_labels))
                if not 1 < n_labels < len(cluster_labels):
                    continue

                silhouette_avg = silhouette_score(reduced_data, cluster_labels)

                # Keep this candidate if it beats the best silhouette seen so far
                if silhouette_avg > best_score:
                    best_score = silhouette_avg
                    best_eps = eps
                    best_min_samples = min_samples
                    best_cluster_labels = cluster_labels
                    best_hyperparameters = {'max_df': max_df, 'min_df': min_df, 'ngram_range': ngram_range, 'norm': norm, 'sublinear_tf': sublinear_tf}
                    best_pca = reduced_data


# Print the best parameters and clustering score
print(f"Best hyperparameters: {best_hyperparameters}, Best eps: {best_eps}, Best min_samples: {best_min_samples}")
print(f"Best silhouette score: {best_score}")

# Fail with a clear message instead of an opaque TypeError when nothing was scorable
if best_cluster_labels is None:
    raise RuntimeError("No DBSCAN parameter combination produced a clustering that could be scored")

# Add the best cluster labels to the DataFrame
prefix = "Politiek_"
db_pca_politiek['DBSCAN_Cluster_Labels'] = [f"{prefix}{label}" for label in best_cluster_labels]

# Print the number of clusters and number of elements assigned to each cluster
unique_labels, counts = np.unique(best_cluster_labels, return_counts=True)
print(f"Number of clusters: {len(unique_labels)} and number of elements assigned to each cluster: {counts}")


Ruimtelijk

In [None]:
# Grid-search the TF-IDF settings together with DBSCAN's eps / min_samples for
# the Ruimtelijk risks. Every candidate is scored with the silhouette coefficient
# on the PCA-reduced feature matrix and the best labelling is kept.
#
# NOTE: this is an alias, not a copy -- the 'DBSCAN_Cluster_Labels' column written
# at the end of this cell is therefore also added to Ruimtelijk_data.
db_pca_ruimtelijk = Ruimtelijk_data

# Reset every "best so far" tracker so nothing leaks in from a previously run cell
best_eps = None
best_min_samples = None
best_score = -1
best_cluster_labels = None
best_hyperparameters = None
best_pca = None

# Dummy variables created from category and allocatie: columns 11 up to (but not
# including) the last two. They do not depend on the TF-IDF settings, so they are
# extracted once. 'DBSCAN_Cluster_Labels' is removed first if an earlier run of
# this cell already appended it, so the positional slice picks the same columns.
# NOTE(review): the slice assumes the last two columns are non-feature columns -- confirm.
dummy_features = (
    Ruimtelijk_data
    .drop(columns=['DBSCAN_Cluster_Labels'], errors='ignore')
    .iloc[:, 11:-2]
    .values
)

# All TF-IDF hyperparameter combinations, in the same order nested loops would visit them
param_combinations = (
    (max_df, min_df, ngram_range, norm, sublinear_tf)
    for max_df in hyperparameters_grid['max_df']
    for min_df in hyperparameters_grid['min_df']
    for ngram_range in hyperparameters_grid['ngram_range']
    for norm in hyperparameters_grid['norm']
    for sublinear_tf in hyperparameters_grid['sublinear_tf']
)

for max_df, min_df, ngram_range, norm, sublinear_tf in param_combinations:
    # Vectorise the free-text column with the current TF-IDF settings
    tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_and_lemmatize_dutch, ngram_range=ngram_range, max_df=max_df, min_df=min_df, norm=norm, sublinear_tf=sublinear_tf)
    tfidf_matrix = tfidf_vectorizer.fit_transform(Ruimtelijk_data['text_variables'])

    # Combine the TF-IDF matrix with the dummy variables
    clustering_matrix = hstack((tfidf_matrix, dummy_features))
    # Reduce dimensionality with PCA; the reduced data is what gets clustered.
    # The sparse matrix is densified first, as in the KMeans cells, because PCA
    # rejects sparse input in older scikit-learn releases.
    reduced_data = pca.fit_transform(clustering_matrix.toarray())

    # Run the eps / min_samples sweep under joblib's threading backend
    with parallel_backend('threading', n_jobs=-2):
        for eps in eps_range:
            for min_samples in min_samples_range:
                dbscan = DBSCAN(eps=eps, min_samples=min_samples)
                cluster_labels = dbscan.fit_predict(reduced_data)

                # silhouette_score needs 2 <= n_labels <= n_samples - 1; skip
                # labellings with a single label or with one label per point.
                # Note: DBSCAN's noise label (-1) is counted as a label here.
                n_labels = len(set(cluster_labels))
                if not 1 < n_labels < len(cluster_labels):
                    continue

                silhouette_avg = silhouette_score(reduced_data, cluster_labels)

                # Keep this candidate if it beats the best silhouette seen so far
                if silhouette_avg > best_score:
                    best_score = silhouette_avg
                    best_eps = eps
                    best_min_samples = min_samples
                    best_cluster_labels = cluster_labels
                    best_hyperparameters = {'max_df': max_df, 'min_df': min_df, 'ngram_range': ngram_range, 'norm': norm, 'sublinear_tf': sublinear_tf}
                    best_pca = reduced_data


# Print the best parameters and clustering score
print(f"Best hyperparameters: {best_hyperparameters}, Best eps: {best_eps}, Best min_samples: {best_min_samples}")
print(f"Best silhouette score: {best_score}")

# Fail with a clear message instead of an opaque TypeError when nothing was scorable
if best_cluster_labels is None:
    raise RuntimeError("No DBSCAN parameter combination produced a clustering that could be scored")

# Add the best cluster labels to the DataFrame
prefix = "Ruimtelijk_"
db_pca_ruimtelijk['DBSCAN_Cluster_Labels'] = [f"{prefix}{label}" for label in best_cluster_labels]

# Print the number of clusters and number of elements assigned to each cluster
unique_labels, counts = np.unique(best_cluster_labels, return_counts=True)
print(f"Number of clusters: {len(unique_labels)} and number of elements assigned to each cluster: {counts}")


Organisatorisch

In [None]:
# Grid-search the TF-IDF settings together with DBSCAN's eps / min_samples for
# the Organisatorisch risks. Every candidate is scored with the silhouette
# coefficient on the PCA-reduced feature matrix and the best labelling is kept.
#
# NOTE: this is an alias, not a copy -- the 'DBSCAN_Cluster_Labels' column written
# at the end of this cell is therefore also added to Organisatorisch_data.
db_pca_organisatorisch = Organisatorisch_data

# Reset every "best so far" tracker so nothing leaks in from a previously run cell
best_eps = None
best_min_samples = None
best_score = -1
best_cluster_labels = None
best_hyperparameters = None
best_pca = None

# Dummy variables created from category and allocatie: columns 11 up to (but not
# including) the last two. They do not depend on the TF-IDF settings, so they are
# extracted once. 'DBSCAN_Cluster_Labels' is removed first if an earlier run of
# this cell already appended it, so the positional slice picks the same columns.
# NOTE(review): the slice assumes the last two columns are non-feature columns -- confirm.
dummy_features = (
    Organisatorisch_data
    .drop(columns=['DBSCAN_Cluster_Labels'], errors='ignore')
    .iloc[:, 11:-2]
    .values
)

# All TF-IDF hyperparameter combinations, in the same order nested loops would visit them
param_combinations = (
    (max_df, min_df, ngram_range, norm, sublinear_tf)
    for max_df in hyperparameters_grid['max_df']
    for min_df in hyperparameters_grid['min_df']
    for ngram_range in hyperparameters_grid['ngram_range']
    for norm in hyperparameters_grid['norm']
    for sublinear_tf in hyperparameters_grid['sublinear_tf']
)

for max_df, min_df, ngram_range, norm, sublinear_tf in param_combinations:
    # Vectorise the free-text column with the current TF-IDF settings
    tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_and_lemmatize_dutch, ngram_range=ngram_range, max_df=max_df, min_df=min_df, norm=norm, sublinear_tf=sublinear_tf)
    tfidf_matrix = tfidf_vectorizer.fit_transform(Organisatorisch_data['text_variables'])

    # Combine the TF-IDF matrix with the dummy variables
    clustering_matrix = hstack((tfidf_matrix, dummy_features))
    # Reduce dimensionality with PCA; the reduced data is what gets clustered.
    # The sparse matrix is densified first, as in the KMeans cells, because PCA
    # rejects sparse input in older scikit-learn releases.
    reduced_data = pca.fit_transform(clustering_matrix.toarray())

    # Run the eps / min_samples sweep under joblib's threading backend
    with parallel_backend('threading', n_jobs=-2):
        for eps in eps_range:
            for min_samples in min_samples_range:
                dbscan = DBSCAN(eps=eps, min_samples=min_samples)
                cluster_labels = dbscan.fit_predict(reduced_data)

                # silhouette_score needs 2 <= n_labels <= n_samples - 1; skip
                # labellings with a single label or with one label per point.
                # Note: DBSCAN's noise label (-1) is counted as a label here.
                n_labels = len(set(cluster_labels))
                if not 1 < n_labels < len(cluster_labels):
                    continue

                silhouette_avg = silhouette_score(reduced_data, cluster_labels)

                # Keep this candidate if it beats the best silhouette seen so far
                if silhouette_avg > best_score:
                    best_score = silhouette_avg
                    best_eps = eps
                    best_min_samples = min_samples
                    best_cluster_labels = cluster_labels
                    best_hyperparameters = {'max_df': max_df, 'min_df': min_df, 'ngram_range': ngram_range, 'norm': norm, 'sublinear_tf': sublinear_tf}
                    best_pca = reduced_data


# Print the best parameters and clustering score
print(f"Best hyperparameters: {best_hyperparameters}, Best eps: {best_eps}, Best min_samples: {best_min_samples}")
print(f"Best silhouette score: {best_score}")

# Fail with a clear message instead of an opaque TypeError when nothing was scorable
if best_cluster_labels is None:
    raise RuntimeError("No DBSCAN parameter combination produced a clustering that could be scored")

# Add the best cluster labels to the DataFrame
prefix = "Organisatorisch_"
db_pca_organisatorisch['DBSCAN_Cluster_Labels'] = [f"{prefix}{label}" for label in best_cluster_labels]

# Print the number of clusters and number of elements assigned to each cluster
unique_labels, counts = np.unique(best_cluster_labels, return_counts=True)
print(f"Number of clusters: {len(unique_labels)} and number of elements assigned to each cluster: {counts}")


Juridisch

In [None]:
# Grid-search the TF-IDF settings together with DBSCAN's eps / min_samples for
# the Juridisch risks. Every candidate is scored with the silhouette coefficient
# on the PCA-reduced feature matrix and the best labelling is kept.
#
# NOTE: this is an alias, not a copy -- the 'DBSCAN_Cluster_Labels' column written
# at the end of this cell is therefore also added to Juridisch_data.
db_pca_juridisch = Juridisch_data

# Reset every "best so far" tracker so nothing leaks in from a previously run cell
best_eps = None
best_min_samples = None
best_score = -1
best_cluster_labels = None
best_hyperparameters = None
best_pca = None

# Dummy variables created from category and allocatie: columns 11 up to (but not
# including) the last two. They do not depend on the TF-IDF settings, so they are
# extracted once. 'DBSCAN_Cluster_Labels' is removed first if an earlier run of
# this cell already appended it, so the positional slice picks the same columns.
# NOTE(review): the slice assumes the last two columns are non-feature columns -- confirm.
dummy_features = (
    Juridisch_data
    .drop(columns=['DBSCAN_Cluster_Labels'], errors='ignore')
    .iloc[:, 11:-2]
    .values
)

# All TF-IDF hyperparameter combinations, in the same order nested loops would visit them
param_combinations = (
    (max_df, min_df, ngram_range, norm, sublinear_tf)
    for max_df in hyperparameters_grid['max_df']
    for min_df in hyperparameters_grid['min_df']
    for ngram_range in hyperparameters_grid['ngram_range']
    for norm in hyperparameters_grid['norm']
    for sublinear_tf in hyperparameters_grid['sublinear_tf']
)

for max_df, min_df, ngram_range, norm, sublinear_tf in param_combinations:
    # Vectorise the free-text column with the current TF-IDF settings
    tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_and_lemmatize_dutch, ngram_range=ngram_range, max_df=max_df, min_df=min_df, norm=norm, sublinear_tf=sublinear_tf)
    tfidf_matrix = tfidf_vectorizer.fit_transform(Juridisch_data['text_variables'])

    # Combine the TF-IDF matrix with the dummy variables
    clustering_matrix = hstack((tfidf_matrix, dummy_features))
    # Reduce dimensionality with PCA; the reduced data is what gets clustered.
    # The sparse matrix is densified first, as in the KMeans cells, because PCA
    # rejects sparse input in older scikit-learn releases.
    reduced_data = pca.fit_transform(clustering_matrix.toarray())

    # Run the eps / min_samples sweep under joblib's threading backend
    with parallel_backend('threading', n_jobs=-2):
        for eps in eps_range:
            for min_samples in min_samples_range:
                dbscan = DBSCAN(eps=eps, min_samples=min_samples)
                cluster_labels = dbscan.fit_predict(reduced_data)

                # silhouette_score needs 2 <= n_labels <= n_samples - 1; skip
                # labellings with a single label or with one label per point.
                # Note: DBSCAN's noise label (-1) is counted as a label here.
                n_labels = len(set(cluster_labels))
                if not 1 < n_labels < len(cluster_labels):
                    continue

                silhouette_avg = silhouette_score(reduced_data, cluster_labels)

                # Keep this candidate if it beats the best silhouette seen so far
                if silhouette_avg > best_score:
                    best_score = silhouette_avg
                    best_eps = eps
                    best_min_samples = min_samples
                    best_cluster_labels = cluster_labels
                    best_hyperparameters = {'max_df': max_df, 'min_df': min_df, 'ngram_range': ngram_range, 'norm': norm, 'sublinear_tf': sublinear_tf}
                    best_pca = reduced_data


# Print the best parameters and clustering score
print(f"Best hyperparameters: {best_hyperparameters}, Best eps: {best_eps}, Best min_samples: {best_min_samples}")
print(f"Best silhouette score: {best_score}")

# Fail with a clear message instead of an opaque TypeError when nothing was scorable
if best_cluster_labels is None:
    raise RuntimeError("No DBSCAN parameter combination produced a clustering that could be scored")

# Add the best cluster labels to the DataFrame
prefix = "Juridisch_"
db_pca_juridisch['DBSCAN_Cluster_Labels'] = [f"{prefix}{label}" for label in best_cluster_labels]

# Print the number of clusters and number of elements assigned to each cluster
unique_labels, counts = np.unique(best_cluster_labels, return_counts=True)
print(f"Number of clusters: {len(unique_labels)} and number of elements assigned to each cluster: {counts}")


Technisch

In [None]:
# Grid-search TF-IDF + DBSCAN settings for the Technisch category and keep the
# combination with the highest silhouette score.
# NOTE(review): this binds a second name to Technisch_data instead of copying it, so the
# label column added at the end of this cell also lands on Technisch_data. If that column
# did not exist yet, a re-run shifts the positional 11:-2 slice below. Consider .copy()
# once the downstream positional slices have been confirmed.
db_pca_technisch = Technisch_data

# Reset every "best_*" tracker so nothing leaks in from a previous category's cell
best_eps = None
best_min_samples = None
best_score = -1
best_cluster_labels = None
best_hyperparameters = None
best_pca = None

# Iterate over each hyperparameter combination
for max_df in hyperparameters_grid['max_df']:
    for min_df in hyperparameters_grid['min_df']:
        for ngram_range in hyperparameters_grid['ngram_range']:
            for norm in hyperparameters_grid['norm']:
                for sublinear_tf in hyperparameters_grid['sublinear_tf']:
                    # Initialize TfidfVectorizer
                    tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_and_lemmatize_dutch, ngram_range=ngram_range, max_df=max_df, min_df=min_df, norm=norm, sublinear_tf=sublinear_tf)
                    # Fit and transform the text data
                    tfidf_matrix = tfidf_vectorizer.fit_transform(Technisch_data['text_variables'])
                    # Combine the TF-IDF matrix with the dummy variables created from category and allocatie
                    clustering_matrix = hstack((tfidf_matrix, Technisch_data.iloc[:,11:-2].values))
                    # Reduce dimensionality; DBSCAN is run on this reduced representation
                    reduced_data = pca.fit_transform(clustering_matrix)
                    # Perform DBSCAN clustering with parallel processing
                    with parallel_backend('threading', n_jobs=-2):
                        # Iterate over each combination of eps and min_samples
                        for eps in eps_range:
                            for min_samples in min_samples_range:
                                # Perform DBSCAN clustering
                                dbscan = DBSCAN(eps=eps, min_samples=min_samples)
                                cluster_labels = dbscan.fit_predict(reduced_data)

                                # silhouette_score requires 2 <= n_labels <= n_samples - 1.
                                # DBSCAN's noise label (-1) is counted as a label here.
                                n_labels = len(set(cluster_labels))
                                if 1 < n_labels < len(cluster_labels):
                                    # Evaluate clustering performance using silhouette score
                                    silhouette_avg = silhouette_score(reduced_data, cluster_labels)

                                    # Update the best score and parameters if the current score is better
                                    if silhouette_avg > best_score:
                                        best_score = silhouette_avg
                                        best_eps = eps
                                        best_min_samples = min_samples
                                        best_cluster_labels = cluster_labels
                                        best_hyperparameters = {'max_df': max_df, 'min_df': min_df, 'ngram_range': ngram_range, 'norm': norm, 'sublinear_tf': sublinear_tf}
                                        best_pca = reduced_data

# Fail with a clear message instead of a TypeError when nothing could be scored
if best_cluster_labels is None:
    raise ValueError("No DBSCAN parameter combination produced a scorable clustering for Technisch")

# Print the best parameters and clustering score
print(f"Best hyperparameters: {best_hyperparameters}, Best eps: {best_eps}, Best min_samples: {best_min_samples}")
print(f"Best silhouette score: {best_score}")

# Add the best cluster labels to the DataFrame
prefix = "Technisch_"
db_pca_technisch['DBSCAN_Cluster_Labels'] = [f"{prefix}{label}" for label in best_cluster_labels]

# Print the number of distinct labels (noise label -1 included) and the size of each
unique_labels, counts = np.unique(best_cluster_labels, return_counts=True)
print(f"Number of clusters: {len(unique_labels)} and number of elements assigned to each cluster: {counts}")


Financieel

In [None]:
# Grid-search TF-IDF + DBSCAN settings for the Financieel category and keep the
# combination with the highest silhouette score.
# NOTE(review): this binds a second name to Financieel_data instead of copying it, so the
# label column added at the end of this cell also lands on Financieel_data. If that column
# did not exist yet, a re-run shifts the positional 11:-2 slice below. Consider .copy()
# once the downstream positional slices have been confirmed.
db_pca_financieel = Financieel_data

# Reset every "best_*" tracker so nothing leaks in from a previous category's cell
best_eps = None
best_min_samples = None
best_score = -1
best_cluster_labels = None
best_hyperparameters = None
best_pca = None

# Iterate over each hyperparameter combination
for max_df in hyperparameters_grid['max_df']:
    for min_df in hyperparameters_grid['min_df']:
        for ngram_range in hyperparameters_grid['ngram_range']:
            for norm in hyperparameters_grid['norm']:
                for sublinear_tf in hyperparameters_grid['sublinear_tf']:
                    # Initialize TfidfVectorizer
                    tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_and_lemmatize_dutch, ngram_range=ngram_range, max_df=max_df, min_df=min_df, norm=norm, sublinear_tf=sublinear_tf)
                    # Fit and transform the text data
                    tfidf_matrix = tfidf_vectorizer.fit_transform(Financieel_data['text_variables'])
                    # Combine the TF-IDF matrix with the dummy variables created from category and allocatie
                    clustering_matrix = hstack((tfidf_matrix, Financieel_data.iloc[:,11:-2].values))
                    # Reduce dimensionality; DBSCAN is run on this reduced representation
                    reduced_data = pca.fit_transform(clustering_matrix)
                    # Perform DBSCAN clustering with parallel processing
                    with parallel_backend('threading', n_jobs=-2):
                        # Iterate over each combination of eps and min_samples
                        for eps in eps_range:
                            for min_samples in min_samples_range:
                                # Perform DBSCAN clustering
                                dbscan = DBSCAN(eps=eps, min_samples=min_samples)
                                cluster_labels = dbscan.fit_predict(reduced_data)

                                # silhouette_score requires 2 <= n_labels <= n_samples - 1.
                                # DBSCAN's noise label (-1) is counted as a label here.
                                n_labels = len(set(cluster_labels))
                                if 1 < n_labels < len(cluster_labels):
                                    # Evaluate clustering performance using silhouette score
                                    silhouette_avg = silhouette_score(reduced_data, cluster_labels)

                                    # Update the best score and parameters if the current score is better
                                    if silhouette_avg > best_score:
                                        best_score = silhouette_avg
                                        best_eps = eps
                                        best_min_samples = min_samples
                                        best_cluster_labels = cluster_labels
                                        best_hyperparameters = {'max_df': max_df, 'min_df': min_df, 'ngram_range': ngram_range, 'norm': norm, 'sublinear_tf': sublinear_tf}
                                        best_pca = reduced_data

# Fail with a clear message instead of a TypeError when nothing could be scored
if best_cluster_labels is None:
    raise ValueError("No DBSCAN parameter combination produced a scorable clustering for Financieel")

# Print the best parameters and clustering score
print(f"Best hyperparameters: {best_hyperparameters}, Best eps: {best_eps}, Best min_samples: {best_min_samples}")
print(f"Best silhouette score: {best_score}")

# Add the best cluster labels to the DataFrame
prefix = "Financieel_"
db_pca_financieel['DBSCAN_Cluster_Labels'] = [f"{prefix}{label}" for label in best_cluster_labels]

# Print the number of distinct labels (noise label -1 included) and the size of each
unique_labels, counts = np.unique(best_cluster_labels, return_counts=True)
print(f"Number of clusters: {len(unique_labels)} and number of elements assigned to each cluster: {counts}")


unknown

In [None]:
# Grid-search TF-IDF + DBSCAN settings for the "unknown" category and keep the
# combination with the highest silhouette score.
# NOTE(review): this binds a second name to unknown_data instead of copying it, so the
# label column added at the end of this cell also lands on unknown_data. If that column
# did not exist yet, a re-run shifts the positional 11:-2 slice below. Consider .copy()
# once the downstream positional slices have been confirmed.
db_pca_unknown = unknown_data

# Reset every "best_*" tracker so nothing leaks in from a previous category's cell
best_eps = None
best_min_samples = None
best_score = -1
best_cluster_labels = None
best_hyperparameters = None
best_pca = None

# Iterate over each hyperparameter combination
for max_df in hyperparameters_grid['max_df']:
    for min_df in hyperparameters_grid['min_df']:
        for ngram_range in hyperparameters_grid['ngram_range']:
            for norm in hyperparameters_grid['norm']:
                for sublinear_tf in hyperparameters_grid['sublinear_tf']:
                    # Initialize TfidfVectorizer
                    tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_and_lemmatize_dutch, ngram_range=ngram_range, max_df=max_df, min_df=min_df, norm=norm, sublinear_tf=sublinear_tf)
                    # Fit and transform the text data
                    tfidf_matrix = tfidf_vectorizer.fit_transform(unknown_data['text_variables'])
                    # Combine the TF-IDF matrix with the dummy variables created from category and allocatie
                    clustering_matrix = hstack((tfidf_matrix, unknown_data.iloc[:,11:-2].values))
                    # Reduce dimensionality; DBSCAN is run on this reduced representation
                    reduced_data = pca.fit_transform(clustering_matrix)
                    # Perform DBSCAN clustering with parallel processing
                    with parallel_backend('threading', n_jobs=-2):
                        # Iterate over each combination of eps and min_samples
                        for eps in eps_range:
                            for min_samples in min_samples_range:
                                # Perform DBSCAN clustering
                                dbscan = DBSCAN(eps=eps, min_samples=min_samples)
                                cluster_labels = dbscan.fit_predict(reduced_data)

                                # silhouette_score requires 2 <= n_labels <= n_samples - 1.
                                # DBSCAN's noise label (-1) is counted as a label here.
                                n_labels = len(set(cluster_labels))
                                if 1 < n_labels < len(cluster_labels):
                                    # Evaluate clustering performance using silhouette score
                                    silhouette_avg = silhouette_score(reduced_data, cluster_labels)

                                    # Update the best score and parameters if the current score is better
                                    if silhouette_avg > best_score:
                                        best_score = silhouette_avg
                                        best_eps = eps
                                        best_min_samples = min_samples
                                        best_cluster_labels = cluster_labels
                                        best_hyperparameters = {'max_df': max_df, 'min_df': min_df, 'ngram_range': ngram_range, 'norm': norm, 'sublinear_tf': sublinear_tf}
                                        best_pca = reduced_data

# Fail with a clear message instead of a TypeError when nothing could be scored
if best_cluster_labels is None:
    raise ValueError("No DBSCAN parameter combination produced a scorable clustering for unknown")

# Print the best parameters and clustering score
print(f"Best hyperparameters: {best_hyperparameters}, Best eps: {best_eps}, Best min_samples: {best_min_samples}")
print(f"Best silhouette score: {best_score}")

# Add the best cluster labels to the DataFrame
prefix = "unknown_"
db_pca_unknown['DBSCAN_Cluster_Labels'] = [f"{prefix}{label}" for label in best_cluster_labels]

# Print the number of distinct labels (noise label -1 included) and the size of each
unique_labels, counts = np.unique(best_cluster_labels, return_counts=True)
print(f"Number of clusters: {len(unique_labels)} and number of elements assigned to each cluster: {counts}")


In [None]:
# Stack the per-category DBSCAN result frames row-wise into one DataFrame
db_all = pd.concat([db_pca_maatschappelijk, db_pca_politiek, db_pca_ruimtelijk, db_pca_organisatorisch, db_pca_juridisch, db_pca_technisch, db_pca_financieel, db_pca_unknown], axis=0)

# Save the DataFrame. A forward slash is a valid path separator on Windows and POSIX,
# whereas the previous backslash only worked on Windows.
db_all.to_excel('Info sheets/db_all.xlsx', index=False)

distances

In [None]:
# Plot the best clustering's PCA projection with the most extreme points hidden.
# Number of points farthest from the origin to leave out of the plot
n_outliers = 13

# Calculate the distance from the origin for each point
distances = np.linalg.norm(best_pca, axis=1)

# Identify the n_outliers points that are farthest from the origin
outlier_indices = np.argsort(distances)[-n_outliers:]

# Filter out the outliers
filtered_indices = np.setdiff1d(np.arange(best_pca.shape[0]), outlier_indices)
# Take the coordinates from best_pca: distances and best_cluster_labels belong to that
# projection, while reduced_data only holds whatever the last fitted PCA produced.
filtered_data = best_pca[filtered_indices]
filtered_labels = np.array(best_cluster_labels)[filtered_indices]


# Plot the filtered data
plt.figure(figsize=(10, 6))
unique_labels = np.unique(filtered_labels)
# pyplot.get_cmap replaces matplotlib.cm.get_cmap, which newer matplotlib no longer provides
colors = plt.get_cmap('viridis', len(unique_labels))

for i, label in enumerate(unique_labels):
    # Boolean mask selecting the points that carry this label
    label_mask = filtered_labels == label
    plt.scatter(filtered_data[label_mask, 0],
                filtered_data[label_mask, 1],
                c=[colors(i)], label=f"Cluster {label}")

plt.title('PCA Visualization of Clusters (Filtered)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()

In [None]:
#sort the data by cluster
#combined_data.sort_values(by='Cluster_Labels', inplace=True)

# Embeddings

In [None]:
!C:\Users\leen\AppData\Local\anaconda3\python.exe -m pip install sentence_transformers


In [None]:
from sentence_transformers import SentenceTransformer

# Name of the pretrained checkpoint handed to SentenceTransformer
embedding_model_name = 'paraphrase-multilingual-MiniLM-L12-v2'
model = SentenceTransformer(embedding_model_name)

# Rebind the notebook-wide `pca` to a two-component PCA; the embedding cells below
# call pca.fit_transform on their clustering matrices.
n_pca_components = 2
pca = PCA(n_components=n_pca_components)

Maatschappelijk

In [None]:
# Cluster the Maatschappelijk rows with KMeans on PCA-reduced sentence embeddings and
# keep the k with the highest silhouette score.
embed_pca_db_maatschappelijk = Maatschappelijk_data.copy()

# Generate sentence embeddings for the text variables
sentence_embeddings_maatschappelijk = model.encode(Maatschappelijk_data['text_variables'].tolist())

# Combine the sentence embeddings with the other features (assuming they start from column 11)
# NOTE(review): the slice is positional; the Financieel/unknown cells of this section use
# 11:-2 instead of 11:-1 - confirm which trailing columns must be excluded.
clustering_matrix = np.concatenate((sentence_embeddings_maatschappelijk, Maatschappelijk_data.iloc[:,11:-1].values), axis=1)

# Reduce dimensionality; KMeans is run on this reduced representation
reduced_data = pca.fit_transform(clustering_matrix)

best_k = None
best_hyperparameters = None
best_score = -1
best_cluster_labels = None

# KMeans needs k <= n_samples and silhouette_score needs k <= n_samples - 1,
# so the search stops at whichever comes first: 39 or n_samples - 1.
k_stop = min(40, reduced_data.shape[0])

with parallel_backend('threading', n_jobs=-2):
    for k in range(10, k_stop):
        # Settings actually used for this fit; the TF-IDF grid variables do not apply here
        kmeans_params = {'n_clusters': k, 'random_state': 7, 'n_init': 10, 'max_iter': 100}
        kmeans = KMeans(**kmeans_params)
        cluster_labels = kmeans.fit_predict(reduced_data)
        silhouette_avg = silhouette_score(reduced_data, cluster_labels)

        # Update the best score and hyperparameters if the current score is better
        if silhouette_avg > best_score:
            best_score = silhouette_avg
            best_hyperparameters = kmeans_params
            best_cluster_labels = cluster_labels
            best_k = k

# Fail with a clear message instead of a TypeError when the search range was empty
if best_cluster_labels is None:
    raise ValueError("Maatschappelijk has too few rows to search k in range(10, 40)")

# Print the best hyperparameters and clustering score
print(f"Best hyperparameters: {best_hyperparameters} and best number of clusters: {best_k}")
print(f"Best silhouette score: {best_score}")

# Add the best cluster labels to the DataFrame
prefix = "Maatschappelijk_"
embed_pca_db_maatschappelijk['embed_Cluster_Labels'] = [f"{prefix}{label}" for label in best_cluster_labels]

# Print the number of clusters and number of elements assigned to each cluster
unique_labels, counts = np.unique(best_cluster_labels, return_counts=True)
print(f"Number of clusters: {len(unique_labels)} and number of elements assigned to each cluster: {counts}")

In [None]:
# Plot the current PCA-reduced data as a plain scatter (no cluster colouring, no filtering)
plt.figure(figsize=(10, 6))

plt.scatter(reduced_data[:,0], reduced_data[:,1])

# Title describes what is drawn: every point, unfiltered and uncoloured
plt.title('PCA Visualization of Reduced Data')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
# No plt.legend(): the scatter has no labelled artists, so a legend would be empty
plt.show()

Politiek

In [None]:
# Cluster the Politiek rows with KMeans on PCA-reduced sentence embeddings and
# keep the k with the highest silhouette score.
embed_pca_db_politiek = Politiek_data.copy()

# Generate sentence embeddings for the text variables
sentence_embeddings_politiek = model.encode(Politiek_data['text_variables'].tolist())

# Combine the sentence embeddings with the other features (assuming they start from column 11)
# NOTE(review): the slice is positional; the Financieel/unknown cells of this section use
# 11:-2 instead of 11:-1 - confirm which trailing columns must be excluded.
clustering_matrix = np.concatenate((sentence_embeddings_politiek, Politiek_data.iloc[:,11:-1].values), axis=1)

# Reduce dimensionality; KMeans is run on this reduced representation
reduced_data = pca.fit_transform(clustering_matrix)

best_k = None
best_hyperparameters = None
best_score = -1
best_cluster_labels = None

# KMeans needs k <= n_samples and silhouette_score needs k <= n_samples - 1,
# so the search stops at whichever comes first: 39 or n_samples - 1.
k_stop = min(40, reduced_data.shape[0])

with parallel_backend('threading', n_jobs=-2):
    for k in range(10, k_stop):
        # Settings actually used for this fit; the TF-IDF grid variables do not apply here
        kmeans_params = {'n_clusters': k, 'random_state': 7, 'n_init': 10, 'max_iter': 100}
        kmeans = KMeans(**kmeans_params)
        cluster_labels = kmeans.fit_predict(reduced_data)
        silhouette_avg = silhouette_score(reduced_data, cluster_labels)

        # Update the best score and hyperparameters if the current score is better
        if silhouette_avg > best_score:
            best_score = silhouette_avg
            best_hyperparameters = kmeans_params
            best_cluster_labels = cluster_labels
            best_k = k

# Fail with a clear message instead of a TypeError when the search range was empty
if best_cluster_labels is None:
    raise ValueError("Politiek has too few rows to search k in range(10, 40)")

# Print the best hyperparameters and clustering score
print(f"Best hyperparameters: {best_hyperparameters} and best number of clusters: {best_k}")
print(f"Best silhouette score: {best_score}")

# Add the best cluster labels to the DataFrame
prefix = "Politiek_"
embed_pca_db_politiek['embed_Cluster_Labels'] = [f"{prefix}{label}" for label in best_cluster_labels]

# Print the number of clusters and number of elements assigned to each cluster
unique_labels, counts = np.unique(best_cluster_labels, return_counts=True)
print(f"Number of clusters: {len(unique_labels)} and number of elements assigned to each cluster: {counts}")

Ruimtelijk

In [None]:
# Cluster the Ruimtelijk rows with KMeans on PCA-reduced sentence embeddings and
# keep the k with the highest silhouette score.
embed_pca_db_ruimtelijk = Ruimtelijk_data.copy()

# Generate sentence embeddings for the text variables
sentence_embeddings_ruimtelijk = model.encode(Ruimtelijk_data['text_variables'].tolist())

# Combine the sentence embeddings with the other features (assuming they start from column 11)
# NOTE(review): the slice is positional; the Financieel/unknown cells of this section use
# 11:-2 instead of 11:-1 - confirm which trailing columns must be excluded.
clustering_matrix = np.concatenate((sentence_embeddings_ruimtelijk, Ruimtelijk_data.iloc[:,11:-1].values), axis=1)

# Reduce dimensionality; KMeans is run on this reduced representation
reduced_data = pca.fit_transform(clustering_matrix)

best_k = None
best_hyperparameters = None
best_score = -1
best_cluster_labels = None

# KMeans needs k <= n_samples and silhouette_score needs k <= n_samples - 1,
# so the search stops at whichever comes first: 39 or n_samples - 1.
k_stop = min(40, reduced_data.shape[0])

with parallel_backend('threading', n_jobs=-2):
    for k in range(10, k_stop):
        # Settings actually used for this fit; the TF-IDF grid variables do not apply here
        kmeans_params = {'n_clusters': k, 'random_state': 7, 'n_init': 10, 'max_iter': 100}
        kmeans = KMeans(**kmeans_params)
        cluster_labels = kmeans.fit_predict(reduced_data)
        silhouette_avg = silhouette_score(reduced_data, cluster_labels)

        # Update the best score and hyperparameters if the current score is better
        if silhouette_avg > best_score:
            best_score = silhouette_avg
            best_hyperparameters = kmeans_params
            best_cluster_labels = cluster_labels
            best_k = k

# Fail with a clear message instead of a TypeError when the search range was empty
if best_cluster_labels is None:
    raise ValueError("Ruimtelijk has too few rows to search k in range(10, 40)")

# Print the best hyperparameters and clustering score
print(f"Best hyperparameters: {best_hyperparameters} and best number of clusters: {best_k}")
print(f"Best silhouette score: {best_score}")

# Add the best cluster labels to the DataFrame
prefix = "Ruimtelijk_"
embed_pca_db_ruimtelijk['embed_Cluster_Labels'] = [f"{prefix}{label}" for label in best_cluster_labels]

# Print the number of clusters and number of elements assigned to each cluster
unique_labels, counts = np.unique(best_cluster_labels, return_counts=True)
print(f"Number of clusters: {len(unique_labels)} and number of elements assigned to each cluster: {counts}")

Organisatorisch

In [None]:
# Cluster the Organisatorisch rows with KMeans on PCA-reduced sentence embeddings and
# keep the k with the highest silhouette score.
embed_pca_db_organisatorisch = Organisatorisch_data.copy()

# Generate sentence embeddings for the text variables
sentence_embeddings_organisatorisch = model.encode(Organisatorisch_data['text_variables'].tolist())

# Combine the sentence embeddings with the other features (assuming they start from column 11)
# NOTE(review): the slice is positional; the Financieel/unknown cells of this section use
# 11:-2 instead of 11:-1 - confirm which trailing columns must be excluded.
clustering_matrix = np.concatenate((sentence_embeddings_organisatorisch, Organisatorisch_data.iloc[:,11:-1].values), axis=1)

# Reduce dimensionality; KMeans is run on this reduced representation
reduced_data = pca.fit_transform(clustering_matrix)

# Single reset block: the DBSCAN-only trackers (best_eps, best_min_samples) and the
# duplicated best_score / best_cluster_labels resets were copy-paste leftovers.
best_k = None
best_hyperparameters = None
best_score = -1
best_cluster_labels = None

# KMeans needs k <= n_samples and silhouette_score needs k <= n_samples - 1,
# so the search stops at whichever comes first: 39 or n_samples - 1.
k_stop = min(40, reduced_data.shape[0])

with parallel_backend('threading', n_jobs=-2):
    for k in range(10, k_stop):
        # Settings actually used for this fit; the TF-IDF grid variables do not apply here
        kmeans_params = {'n_clusters': k, 'random_state': 7, 'n_init': 10, 'max_iter': 100}
        kmeans = KMeans(**kmeans_params)
        cluster_labels = kmeans.fit_predict(reduced_data)
        silhouette_avg = silhouette_score(reduced_data, cluster_labels)

        # Update the best score and hyperparameters if the current score is better
        if silhouette_avg > best_score:
            best_score = silhouette_avg
            best_hyperparameters = kmeans_params
            best_cluster_labels = cluster_labels
            best_k = k

# Fail with a clear message instead of a TypeError when the search range was empty
if best_cluster_labels is None:
    raise ValueError("Organisatorisch has too few rows to search k in range(10, 40)")

# Print the best hyperparameters and clustering score
print(f"Best hyperparameters: {best_hyperparameters} and best number of clusters: {best_k}")
print(f"Best silhouette score: {best_score}")

# Add the best cluster labels to the DataFrame
prefix = "Organisatorisch_"
embed_pca_db_organisatorisch['embed_Cluster_Labels'] = [f"{prefix}{label}" for label in best_cluster_labels]

# Print the number of clusters and number of elements assigned to each cluster
unique_labels, counts = np.unique(best_cluster_labels, return_counts=True)
print(f"Number of clusters: {len(unique_labels)} and number of elements assigned to each cluster: {counts}")

Juridisch

In [None]:
# Cluster the Juridisch rows with KMeans on PCA-reduced sentence embeddings and
# keep the k with the highest silhouette score.
embed_pca_db_juridisch = Juridisch_data.copy()

# Generate sentence embeddings for the text variables
sentence_embeddings_juridisch = model.encode(Juridisch_data['text_variables'].tolist())

# Combine the sentence embeddings with the other features (assuming they start from column 11)
# NOTE(review): the slice is positional; the Financieel/unknown cells of this section use
# 11:-2 instead of 11:-1 - confirm which trailing columns must be excluded.
clustering_matrix = np.concatenate((sentence_embeddings_juridisch, Juridisch_data.iloc[:,11:-1].values), axis=1)

# Reduce dimensionality; KMeans is run on this reduced representation
reduced_data = pca.fit_transform(clustering_matrix)

best_k = None
best_hyperparameters = None
best_score = -1
best_cluster_labels = None

# KMeans needs k <= n_samples and silhouette_score needs k <= n_samples - 1,
# so the search stops at whichever comes first: 39 or n_samples - 1.
k_stop = min(40, reduced_data.shape[0])

with parallel_backend('threading', n_jobs=-2):
    for k in range(10, k_stop):
        # Settings actually used for this fit; the TF-IDF grid variables do not apply here
        kmeans_params = {'n_clusters': k, 'random_state': 7, 'n_init': 10, 'max_iter': 100}
        kmeans = KMeans(**kmeans_params)
        cluster_labels = kmeans.fit_predict(reduced_data)
        silhouette_avg = silhouette_score(reduced_data, cluster_labels)

        # Update the best score and hyperparameters if the current score is better
        if silhouette_avg > best_score:
            best_score = silhouette_avg
            best_hyperparameters = kmeans_params
            best_cluster_labels = cluster_labels
            best_k = k

# Fail with a clear message instead of a TypeError when the search range was empty
if best_cluster_labels is None:
    raise ValueError("Juridisch has too few rows to search k in range(10, 40)")

# Print the best hyperparameters and clustering score
print(f"Best hyperparameters: {best_hyperparameters} and best number of clusters: {best_k}")
print(f"Best silhouette score: {best_score}")

# Add the best cluster labels to the DataFrame
prefix = "Juridisch_"
embed_pca_db_juridisch['embed_Cluster_Labels'] = [f"{prefix}{label}" for label in best_cluster_labels]

# Print the number of clusters and number of elements assigned to each cluster
unique_labels, counts = np.unique(best_cluster_labels, return_counts=True)
print(f"Number of clusters: {len(unique_labels)} and number of elements assigned to each cluster: {counts}")

Technisch

In [None]:
# Cluster the Technisch rows with KMeans on PCA-reduced sentence embeddings and
# keep the k with the highest silhouette score.
embed_pca_db_technisch = Technisch_data.copy()

# Generate sentence embeddings for the text variables
sentence_embeddings_technisch = model.encode(Technisch_data['text_variables'].tolist())

# Combine the sentence embeddings with the other features (assuming they start from column 11)
# NOTE(review): the slice is positional; the Financieel/unknown cells of this section use
# 11:-2 instead of 11:-1 - confirm which trailing columns must be excluded.
clustering_matrix = np.concatenate((sentence_embeddings_technisch, Technisch_data.iloc[:,11:-1].values), axis=1)

# Reduce dimensionality; KMeans is run on this reduced representation
reduced_data = pca.fit_transform(clustering_matrix)

best_k = None
best_hyperparameters = None
best_score = -1
best_cluster_labels = None

# KMeans needs k <= n_samples and silhouette_score needs k <= n_samples - 1,
# so the search stops at whichever comes first: 39 or n_samples - 1.
k_stop = min(40, reduced_data.shape[0])

with parallel_backend('threading', n_jobs=-2):
    for k in range(10, k_stop):
        # Settings actually used for this fit; the TF-IDF grid variables do not apply here
        kmeans_params = {'n_clusters': k, 'random_state': 7, 'n_init': 10, 'max_iter': 100}
        kmeans = KMeans(**kmeans_params)
        cluster_labels = kmeans.fit_predict(reduced_data)
        silhouette_avg = silhouette_score(reduced_data, cluster_labels)

        # Update the best score and hyperparameters if the current score is better
        if silhouette_avg > best_score:
            best_score = silhouette_avg
            best_hyperparameters = kmeans_params
            best_cluster_labels = cluster_labels
            best_k = k

# Fail with a clear message instead of a TypeError when the search range was empty
if best_cluster_labels is None:
    raise ValueError("Technisch has too few rows to search k in range(10, 40)")

# Print the best hyperparameters and clustering score
print(f"Best hyperparameters: {best_hyperparameters} and best number of clusters: {best_k}")
print(f"Best silhouette score: {best_score}")

# Add the best cluster labels to the DataFrame
prefix = "Technisch_"
embed_pca_db_technisch['embed_Cluster_Labels'] = [f"{prefix}{label}" for label in best_cluster_labels]

# Print the number of clusters and number of elements assigned to each cluster
unique_labels, counts = np.unique(best_cluster_labels, return_counts=True)
print(f"Number of clusters: {len(unique_labels)} and number of elements assigned to each cluster: {counts}")

Financieel

In [None]:
# Cluster the Financieel rows with KMeans on PCA-reduced sentence embeddings and
# keep the k with the highest silhouette score.
embed_pca_db_financieel = Financieel_data.copy()

# Generate sentence embeddings for the text variables
sentence_embeddings_financieel = model.encode(Financieel_data['text_variables'].tolist())

# Combine the sentence embeddings with the other features (assuming they start from column 11)
# NOTE(review): the slice is positional and ends at -2 here, whereas the Maatschappelijk
# through Technisch cells of this section end at -1 - confirm which trailing columns
# must be excluded.
clustering_matrix = np.concatenate((sentence_embeddings_financieel, Financieel_data.iloc[:,11:-2].values), axis=1)

# Reduce dimensionality; KMeans is run on this reduced representation
reduced_data = pca.fit_transform(clustering_matrix)

best_k = None
best_hyperparameters = None
best_score = -1
best_cluster_labels = None

# KMeans needs k <= n_samples and silhouette_score needs k <= n_samples - 1,
# so the search stops at whichever comes first: 39 or n_samples - 1.
k_stop = min(40, reduced_data.shape[0])

with parallel_backend('threading', n_jobs=-2):
    for k in range(10, k_stop):
        # Settings actually used for this fit; the TF-IDF grid variables do not apply here
        kmeans_params = {'n_clusters': k, 'random_state': 7, 'n_init': 10, 'max_iter': 100}
        kmeans = KMeans(**kmeans_params)
        cluster_labels = kmeans.fit_predict(reduced_data)
        silhouette_avg = silhouette_score(reduced_data, cluster_labels)

        # Update the best score and hyperparameters if the current score is better
        if silhouette_avg > best_score:
            best_score = silhouette_avg
            best_hyperparameters = kmeans_params
            best_cluster_labels = cluster_labels
            best_k = k

# Fail with a clear message instead of a TypeError when the search range was empty
if best_cluster_labels is None:
    raise ValueError("Financieel has too few rows to search k in range(10, 40)")

# Print the best hyperparameters and clustering score
print(f"Best hyperparameters: {best_hyperparameters} and best number of clusters: {best_k}")
print(f"Best silhouette score: {best_score}")

# Add the best cluster labels to the DataFrame
prefix = "Financieel_"
embed_pca_db_financieel['embed_Cluster_Labels'] = [f"{prefix}{label}" for label in best_cluster_labels]

# Print the number of clusters and number of elements assigned to each cluster
unique_labels, counts = np.unique(best_cluster_labels, return_counts=True)
print(f"Number of clusters: {len(unique_labels)} and number of elements assigned to each cluster: {counts}")

unknown

In [None]:
# Cluster the "unknown" rows with KMeans on PCA-reduced sentence embeddings and
# keep the k with the highest silhouette score.
embed_pca_db_unknown = unknown_data.copy()

# Generate sentence embeddings for the text variables
sentence_embeddings_unknown = model.encode(unknown_data['text_variables'].tolist())

# Combine the sentence embeddings with the other features (assuming they start from column 11)
# NOTE(review): the slice is positional and ends at -2 here, whereas the Maatschappelijk
# through Technisch cells of this section end at -1 - confirm which trailing columns
# must be excluded.
clustering_matrix = np.concatenate((sentence_embeddings_unknown, unknown_data.iloc[:,11:-2].values), axis=1)

# Reduce dimensionality; KMeans is run on this reduced representation
reduced_data = pca.fit_transform(clustering_matrix)

best_k = None
best_hyperparameters = None
best_score = -1
best_cluster_labels = None

# KMeans needs k <= n_samples and silhouette_score needs k <= n_samples - 1,
# so the search stops at whichever comes first: 39 or n_samples - 1.
k_stop = min(40, reduced_data.shape[0])

with parallel_backend('threading', n_jobs=-2):
    for k in range(10, k_stop):
        # Settings actually used for this fit; the TF-IDF grid variables do not apply here
        kmeans_params = {'n_clusters': k, 'random_state': 7, 'n_init': 10, 'max_iter': 100}
        kmeans = KMeans(**kmeans_params)
        cluster_labels = kmeans.fit_predict(reduced_data)
        silhouette_avg = silhouette_score(reduced_data, cluster_labels)

        # Update the best score and hyperparameters if the current score is better
        if silhouette_avg > best_score:
            best_score = silhouette_avg
            best_hyperparameters = kmeans_params
            best_cluster_labels = cluster_labels
            best_k = k

# Fail with a clear message instead of a TypeError when the search range was empty
if best_cluster_labels is None:
    raise ValueError("unknown has too few rows to search k in range(10, 40)")

# Print the best hyperparameters and clustering score
print(f"Best hyperparameters: {best_hyperparameters} and best number of clusters: {best_k}")
print(f"Best silhouette score: {best_score}")

# Add the best cluster labels to the DataFrame
prefix = "unknown_"
embed_pca_db_unknown['embed_Cluster_Labels'] = [f"{prefix}{label}" for label in best_cluster_labels]

# Print the number of clusters and number of elements assigned to each cluster
unique_labels, counts = np.unique(best_cluster_labels, return_counts=True)
print(f"Number of clusters: {len(unique_labels)} and number of elements assigned to each cluster: {counts}")

In [None]:
# Stack the per-category embedding-cluster frames row-wise into one DataFrame
embed_frames = [
    embed_pca_db_maatschappelijk,
    embed_pca_db_politiek,
    embed_pca_db_ruimtelijk,
    embed_pca_db_organisatorisch,
    embed_pca_db_juridisch,
    embed_pca_db_technisch,
    embed_pca_db_financieel,
    embed_pca_db_unknown,
]
embed_all = pd.concat(embed_frames, axis=0)

In [None]:
# Write the merged embedding-cluster results to Excel. A forward slash is a valid path
# separator on Windows and POSIX, whereas the previous backslash only worked on Windows.
embed_all.to_excel('Info sheets/embed_all.xlsx', index=False)

# Saving the data

In [None]:
import pickle

# `kmeans` holds the estimator from the last loop iteration (the largest k tried), not
# the best-scoring one. Refit with the winning k on the same reduced data before saving;
# the fixed random_state should make this reproduce the best run.
# NOTE(review): best_k and reduced_data come from whichever clustering cell ran last -
# confirm that this is the category whose model should be persisted.
if best_k is None:
    raise ValueError("best_k is not set; run a KMeans clustering cell before saving the model")
best_kmeans = KMeans(n_clusters=best_k, random_state=7, n_init=10, max_iter=100).fit(reduced_data)

# Save the model to a pickle file
with open('kmeans_model.pkl', 'wb') as file:
    pickle.dump(best_kmeans, file)