<a href="https://colab.research.google.com/github/MarinaOhm/master_thesis/blob/main/RICE_CLUSTERING_SCRIPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Master thesis
## *Clustering for product portfolio management*
*Developed by Max Hedeman Gueniau, Niklas Madsen, and Marina Ohm*

# Install and import libraries

## Installing libraries

In [None]:
!pip install fuzzy-c-means
!pip install --upgrade scikit-learn
!pip install hdbscan
!pip install umap
!pip install umap-learn
!pip install Counter

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fuzzy-c-means
  Downloading fuzzy_c_means-1.7.0-py3-none-any.whl (9.0 kB)
Collecting typer<0.5.0,>=0.4.0 (from fuzzy-c-means)
  Downloading typer-0.4.2-py3-none-any.whl (27 kB)
Installing collected packages: typer, fuzzy-c-means
  Attempting uninstall: typer
    Found existing installation: typer 0.7.0
    Uninstalling typer-0.7.0:
      Successfully uninstalled typer-0.7.0
Successfully installed fuzzy-c-means-1.7.0 typer-0.4.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hdbscan
  Downloading hdbscan-0.8.29.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirem

## Importing libraries

In [None]:
import pandas as pd
import numpy as np
import random
import hdbscan
import plotly.express as px
import umap
import nltk
import string
import collections
import matplotlib.pyplot as plt
import plotly.graph_objs as go

from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering, DBSCAN
from sklearn.model_selection import cross_val_score
from sklearn.metrics import silhouette_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from wordcloud import WordCloud
from plotly.subplots import make_subplots
from google.colab import drive
from fcmeans import FCM
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Clustering with PCA


In [None]:
def main():
    """
    Main function for the PCA-based clustering analysis, including all steps necessary.
    Change path to data and allow access to Gdrive for the function to execute.

    :parameters:
    - None

    :return:
    - hca_means: Cluster means from HCA clustering
    - hca_df: DataFrame with the HCA cluster label attached to each item
    - hca_ss: HCA clustering silhouette score
    - hdbscan_means: Cluster means from HDBSCAN clustering
    - hdbscan_df: DataFrame with the HDBSCAN cluster label attached to each item
    - hdbscan_ss: HDBSCAN clustering silhouette score
    """

    # Setting seeds for reproducibility. The numerical libraries used downstream
    # draw from NumPy's global RNG, which Python's `random` module does not control,
    # so both generators are seeded.
    random.seed(42)
    np.random.seed(42)

    # Load data from Gdrive. Path to folder should be adjusted
    drive.mount('/content/drive')
    df = pd.read_csv('/content/drive/MyDrive/Master Thesis/final_updated_clustering_data.csv')

    # Preprocess data using the dedicated preprocess function
    df_preprocessed, scaler = preprocess(df)

    # Find PCA components that encapsulate >=90% of the variance in the data and PCA transform accordingly
    df_transformed, pca = perform_pca(df_preprocessed)

    # Perform HCA clustering on PCA transformed data
    hca_means, hca_df, hca_ss = perform_hca_clustering(df_transformed, pca, scaler, df_preprocessed)

    # Perform HDBSCAN clustering on PCA transformed data
    hdbscan_means, hdbscan_df, hdbscan_ss = perform_hdbscan_clustering(df_transformed, pca, scaler, df_preprocessed)

    # Chosen model only: Examine output dataframe through descriptive statistics
    descriptive_clust(hdbscan_df)

    return hca_means, hca_df, hca_ss, hdbscan_means, hdbscan_df, hdbscan_ss


def text_preprocess(df):
    """
    Preprocessing function for textual data to extract additional information through clustering.

    The '_description' column is tokenized (punctuation stripped, lower-cased, English stopwords
    removed, Porter-stemmed), TF-IDF weighted, embedded with UMAP and clustered with HDBSCAN.
    UMAP's n_neighbors and HDBSCAN's min_cluster_size are tuned on the silhouette score of the
    embedding. The top words and a word cloud are shown for every resulting cluster.

    :parameters:
    - Dataframe with column '_description' containing textual data

    :return:
    - Copy of the DataFrame where '_description' holds the token lists and an additional
      column 'cluster' holds the text cluster labels (the input DataFrame is left untouched)

    :raises:
    - ValueError if no hyperparameter combination produces at least two distinct labels
    """

    # Define constants
    min_cluster_sizes = range(10, 15) # hp range to tune
    n_neighbors = range(10, 15) # hp range to tune
    best_params = {} # for storing best hp
    best_score = -1 # for storing the silhouette score for the best hp combination
    best_labels = None # for storing the labels belonging to the best hp combination

    # Initiate objects to be used in the function
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    punctuation_table = str.maketrans('', '', string.punctuation)

    # Work on a copy so the caller's frame (often a filtered slice) is never written to
    df = df.copy()

    def _tokenize(text):
        # Strip punctuation, lower-case, split on whitespace, drop stopwords, then stem
        tokens = text.translate(punctuation_table).lower().split()
        return [stemmer.stem(word) for word in tokens if word not in stop_words]

    # Missing descriptions become empty strings so they yield an empty token list instead of crashing
    df['_description'] = df['_description'].fillna('').astype(str).apply(_tokenize)

    # Perform TF-IDF weighting on the text columns to weight words according to their appearances
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_data = tfidf_vectorizer.fit_transform(df['_description'].apply(" ".join))

    # The UMAP embedding depends only on n_neighbors (the seed is fixed), so it is computed once
    # per value instead of once per (min_cluster_size, n_neighbors) combination
    embeddings = {
        n: umap.UMAP(n_neighbors=n, min_dist=0.5, metric='euclidean', random_state=42).fit_transform(tfidf_data)
        for n in n_neighbors
    }

    # Iterate over all hp combinations (same order as before, so ties resolve identically)
    for min_cluster_size in min_cluster_sizes:
        for n in n_neighbors:
            umap_data = embeddings[n]

            # Clustering using HDBSCAN
            clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
            cluster_labels = clusterer.fit_predict(umap_data)

            # silhouette_score needs at least two distinct labels; skip degenerate results
            if len(set(cluster_labels)) < 2:
                continue

            # Calculate silhouette score (noise points, label -1, are scored as their own group)
            score = silhouette_score(umap_data, cluster_labels)

            # Compare scores and update parameters if score is higher than current
            if score > best_score:
                best_score = score
                best_params['min_cluster_size'] = min_cluster_size
                best_params['n_neighbors'] = n
                best_labels = cluster_labels

    if best_labels is None:
        raise ValueError('No hyperparameter combination produced at least two text clusters')

    # Attach the labels of the best combination
    cluster_labels = best_labels
    df['cluster'] = cluster_labels

    # Get number of observations in each cluster
    cluster_counts = pd.Series(cluster_labels).value_counts()
    print(cluster_counts)

    # Extract 10 most common words for each cluster + create wordcloud for each cluster with 50 most common words
    for label in df['cluster'].unique():

        print(f"Cluster {label}:")
        cluster_df = df.loc[df['cluster'] == label]
        word_counts = collections.Counter(word for doc in cluster_df['_description'] for word in doc)

        # Find 10 most common words
        top_10_words = word_counts.most_common(10)
        print('Top 10 words in clusters:')
        print(top_10_words)

        # Create word cloud
        print(f'Wordcloud for cluster {label}')
        if not word_counts:
            # WordCloud cannot be generated from an empty frequency table
            print('No words available for this cluster, skipping word cloud')
            continue
        top_words_dict = dict(word_counts.most_common(50))
        wordcloud = WordCloud(width = 800, height = 800, background_color ='white', stopwords = stop_words, min_font_size = 10).generate_from_frequencies(top_words_dict)

        # Plot word cloud
        plt.figure(figsize = (8, 8), facecolor = None)
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.tight_layout(pad = 0)
        plt.show()

    return df


def preprocess(df):
    """
    Prepare the raw item table for clustering: filter rows, encode categorical and
    textual information, and scale the numerical metrics.

    :parameters:
    - DataFrame used for clustering

    :return:
    - Preprocessed DataFrame indexed by '_ItemNumber'
    - Scaler object from MinMaxScaler (fitted on the numerical metric columns) for later inverting
    """

    # Column groups referenced below
    numeric_columns = [
        'average_monthly_sales', 'average_order_size', 'average_value_by_order',
        'unique_customer_per_item', 'average_monthly_quantity', 'profit_margin',
        'Net Weight', 'Reorder Quantity', 'Unit Cost', 'First Purch Order Quantum',
        'average_days_between_sales', 'Customer_life_time',
        'average_monthly_sales_growth', 'sales_variability',
    ]
    redundant_columns = ["Unnamed: 0", "BDLRI Sales Status"]
    origin_column = 'Country_Region of Origin Code'

    # Keep only rows with a finite, non-negative profit margin (others are perceived data errors)
    margin = df['profit_margin']
    valid_margin = (margin >= 0) & (margin != float('inf')) & (margin != float('-inf'))
    df = df[valid_margin]

    # Discard columns that carry no information for the clustering
    keep_mask = ~df.columns.isin(redundant_columns)
    df = df.loc[:, keep_mask]

    # One-hot encode the country of origin and remove the raw column
    country_dummies = pd.get_dummies(df[origin_column], prefix='Country')
    df = pd.concat([df, country_dummies], axis=1).drop(origin_column, axis=1)

    # Cluster the item descriptions; this adds a 'cluster' column with text cluster labels
    df = text_preprocess(df)

    # One-hot encode the text cluster labels and remove the raw label column
    text_cluster_dummies = pd.get_dummies(df['cluster'], prefix='TC')
    df = pd.concat([df, text_cluster_dummies], axis=1).drop('cluster', axis=1)

    # Min-max scale the numerical metrics
    scaler = MinMaxScaler()
    df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

    # Drop rows without an item number, zero-fill remaining gaps and index by item number
    df = df.dropna(subset=['_ItemNumber']).fillna(0).set_index('_ItemNumber')
    df.columns = df.columns.astype(str)

    # The description text is no longer needed once its cluster labels are encoded
    df = df.drop(columns=['_description'])

    return df, scaler


def perform_pca(df_scaled):
    """
    Perform PCA on a preprocessed DataFrame and return PCA transformed DataFrame and PCA model.

    A full PCA is fitted first to find the smallest number of components whose cumulative
    explained variance reaches 90%; a second PCA with that many components is then fitted and
    used for the transformation. Explained variance, the strongest loadings per component and
    a feature-importance bar chart are printed/shown along the way.

    :parameters:
    - Preprocessed DataFrame

    :return:
    - DataFrame with transformed data (columns 'PC1'..'PCn', default positional index)
    - PCA model fitted with the selected number of components

    """

    # Fit PCA model and plot cumulative explained variance
    full_pca = PCA().fit(df_scaled)
    cumulative_variance = np.cumsum(full_pca.explained_variance_ratio_)
    plt.plot(cumulative_variance)
    plt.xlabel('Number of components')
    plt.ylabel('Cumulative explained variance')
    plt.show()

    # Find the appropriate number of components that explains at least 90% of variance
    # (argmax returns the position of the first True, hence the +1 to get a count)
    n_components = int(np.argmax(cumulative_variance >= 0.90)) + 1
    component_names = [f'PC{i}' for i in range(1, n_components+1)]

    # Fit PCA model with optimal n_components to transform the data.
    # random_state pins the result should scikit-learn pick a randomized SVD solver.
    pca = PCA(n_components=n_components, random_state=42).fit(df_scaled)
    pca_transformed_data = pca.transform(df_scaled)
    df_transformed = pd.DataFrame(pca_transformed_data, columns=component_names)

    # Print the explained variance for all principal components
    print('Explained variance for all principal components:')
    print(pca.explained_variance_ratio_)

    # Print the cumulative explained variance for all principal components
    print('Cumulative explained variance for all principal components:')
    print(np.cumsum(pca.explained_variance_ratio_))

    # Calculate loadings and feature importance for each principal component
    loadings = pd.DataFrame(pca.components_.T, columns=component_names, index=df_scaled.columns)
    abs_sum_loadings = loadings.abs().sum(axis=1).sort_values(ascending=False)

    # Define the color to use in chart
    color = 'rgba(128, 128, 128, 1)'

    # Print the top three important features for each PC
    for name in component_names:
        print(f'Top three important features for {name}:')
        print(loadings[name].abs().nlargest(3))

    # Create a horizontal bar chart; values are reversed so the most important feature ends up on top
    fig = go.Figure(go.Bar(
        x=abs_sum_loadings.values[::-1],
        y=abs_sum_loadings.index[::-1],
        orientation='h',
        marker_color=color
    ))

    # Set the chart title and axis labels
    fig.update_layout(
        title='Feature Importance',
        xaxis_title='Absolute Sum of Loadings',
        yaxis_title='Features',
        plot_bgcolor='white',
        font=dict(size=11, color='black', family='Arial'),
        width=1200,
        height=600
    )

    fig.show()

    return df_transformed, pca


def perform_hca_clustering(df_transformed, pca, scaler, original_df):
    """
    Perform HCA on a preprocessed and PCA transformed DataFrame. After clustering, PCA transformation
    and scaling are reverted so cluster means can be read in the original units. Note that the
    values are reconstructed from the retained principal components only.

    :parameters:
    - Preprocessed and PCA transformed dataframe
    - PCA model
    - MinMaxScaler object
    - Original (preprocessed, pre-PCA) DataFrame; provides the column names and the item-number index

    :return:
    - HCA_means: Cluster means from HCA clustering
    - HCA_df: DataFrame with the reconstructed metrics, the HCA cluster label and the item number
    - hca_silhouette_score: Silhouette score of the selected HCA clustering
    """
    print('\n')
    print(' --------------- HIERARCHICAL CLUSTERING ---------------')

    # Set seed for reproducibility
    random.seed(42)

    # Define range of clusters to evaluate
    n_clusters_range = range(2, 6)

    # Silhouette score and labels for each candidate number of clusters
    scores = []
    labels_per_candidate = []

    # Loop over each number of clusters, perform HCA, and keep its silhouette score and labels
    for n_clusters in n_clusters_range:
        hca = AgglomerativeClustering(n_clusters=n_clusters)
        hca.fit(df_transformed)
        scores.append(silhouette_score(df_transformed, hca.labels_))
        labels_per_candidate.append(hca.labels_)

    # Plot the silhouette scores across number of clusters to examine closer
    plt.plot(n_clusters_range, scores)
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette score')
    plt.show()

    # Choose the number of clusters with the highest silhouette score
    best_position = int(np.argmax(scores))
    best_n_clusters = n_clusters_range[best_position]
    print(f"Best number of clusters: {best_n_clusters}")

    # Agglomerative clustering is deterministic, so the labels and score from the search are reused
    best_labels = labels_per_candidate[best_position]
    hca_silhouette_score = scores[best_position]

    # Remove PCA transformation to interpret means of clusters
    df_inv = pca.inverse_transform(df_transformed)

    # Create a dataframe with the inverse PCA transformation
    df_inv_pca = pd.DataFrame(df_inv, columns=original_df.columns)

    # Revert MinMax scaling on the relevant columns
    columns_of_interest = ['average_monthly_sales','average_order_size','average_value_by_order','unique_customer_per_item','average_monthly_quantity','profit_margin','Net Weight','Reorder Quantity','Unit Cost','First Purch Order Quantum','average_days_between_sales','Customer_life_time','average_monthly_sales_growth','sales_variability']
    df_inv_pca[columns_of_interest] = scaler.inverse_transform(df_inv_pca[columns_of_interest])

    # Creating a dataframe with the cluster numbers attached and selecting columns of interest.
    # Item numbers are attached by position (rows are in the same order as original_df).
    HCA_df = df_inv_pca[columns_of_interest].copy()
    HCA_df['cluster'] = best_labels
    HCA_df['item_number'] = original_df.index

    # Group by cluster and compute mean values for all columns of interest
    HCA_means = HCA_df.groupby('cluster')[columns_of_interest].mean()

    # Print the cluster means and HCA silhouette score
    print('\n')
    print('Cluster means:')
    print(HCA_means)
    print('\n')
    print('Silhouette score:')
    print(hca_silhouette_score)

    return HCA_means, HCA_df, hca_silhouette_score



def perform_hdbscan_clustering(df_transformed, pca, scaler, original_df):
    """
    Perform HDBSCAN on a preprocessed and PCA transformed DataFrame. After clustering, PCA transformation
    and scaling are reverted so cluster means can be read in the original units. Note that the
    values are reconstructed from the retained principal components only.

    :parameters:
    - Preprocessed and PCA transformed dataframe
    - PCA model
    - MinMaxScaler object
    - Original (preprocessed, pre-PCA) DataFrame; provides the column names and the item-number index

    :return:
    - hdbscan_means: Cluster means from HDBSCAN clustering (label -1 is HDBSCAN's noise group)
    - hdbscan_df: DataFrame with the reconstructed metrics, the HDBSCAN cluster label and the item number
    - hdbscan_silhouette_score: Silhouette score of the selected HDBSCAN clustering
      (NaN when fewer than two distinct labels were produced)
    """

    print('\n')
    print(' --------------- HDBSCAN ---------------')

    # Set seed for reproducibility
    random.seed(42)

    # Tune over the optimal min_cluster_size and min_samples parameters using the silhouette score as metric
    range_min_cluster_size = range(20, 50)
    range_min_samples = range(5, 20)
    scores = []

    # Looping over ranges of hyperparameters, perform HDBSCAN and attach silhouette score to scores object
    for min_cluster_size in range_min_cluster_size:
        for min_samples in range_min_samples:
            hdbscan_clustering = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples)
            hdbscan_clustering.fit(df_transformed)
            labels = hdbscan_clustering.labels_
            # Count real clusters only (noise, label -1, excluded); the score itself is still
            # computed over all points, with noise treated as its own label
            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            if n_clusters > 1:
                score = silhouette_score(df_transformed, labels)
                scores.append((min_cluster_size, min_samples, score))
            else:
                scores.append((min_cluster_size, min_samples, -1))

    # Extract best params separately to use in HDBSCAN function
    best_min_cluster_size, best_min_samples, _ = max(scores, key=lambda x: x[2])
    print(f'Best min_cluster_size value: {best_min_cluster_size}')
    print(f'Best min_samples value: {best_min_samples}')

    # Perform HDBSCAN clustering with the optimal parameters
    hdbscan_clustering = hdbscan.HDBSCAN(min_cluster_size=best_min_cluster_size, min_samples=best_min_samples)
    hdbscan_clustering.fit(df_transformed)
    best_labels = hdbscan_clustering.labels_

    # Remove PCA transformation to interpret means of clusters
    df_inv = pca.inverse_transform(df_transformed)

    # Create a dataframe with the inverse PCA transformed values
    df_inv_pca = pd.DataFrame(df_inv, columns=original_df.columns)

    # Revert MinMax scaling on the relevant columns
    columns_of_interest = ['average_monthly_sales','average_order_size','average_value_by_order','unique_customer_per_item','average_monthly_quantity','profit_margin','Net Weight','Reorder Quantity','Unit Cost','First Purch Order Quantum','average_days_between_sales','Customer_life_time','average_monthly_sales_growth','sales_variability']
    df_inv_pca[columns_of_interest] = scaler.inverse_transform(df_inv_pca[columns_of_interest])

    # Creating a dataframe with the cluster numbers attached and selecting columns of interest.
    # Item numbers are attached by position (rows are in the same order as original_df).
    hdbscan_df = df_inv_pca[columns_of_interest].copy()
    hdbscan_df['cluster'] = best_labels
    hdbscan_df['item_number'] = original_df.index

    # Group by cluster and compute mean values for the columns of interest only;
    # 'item_number' is an identifier and must not be averaged
    hdbscan_means = hdbscan_df.groupby('cluster')[columns_of_interest].mean()

    # Calculate the silhouette score for the best HDBSCAN clustering
    # (silhouette_score requires at least two distinct labels)
    if len(set(best_labels)) > 1:
        hdbscan_silhouette_score = silhouette_score(df_transformed, best_labels)
    else:
        hdbscan_silhouette_score = float('nan')

    # Scatter plots of the leading component pairs; a pair is only drawn when both components exist
    axis_pairs = [
        (x_name, y_name)
        for x_name, y_name in (('PC1', 'PC2'), ('PC3', 'PC4'))
        if x_name in df_transformed.columns and y_name in df_transformed.columns
    ]
    if axis_pairs:
        # Create a subplot with the scatter plots side by side
        fig = make_subplots(rows=1, cols=len(axis_pairs),
                            subplot_titles=[f'{x_name} vs. {y_name}' for x_name, y_name in axis_pairs])

        for col, (x_name, y_name) in enumerate(axis_pairs, start=1):
            scatter = px.scatter(df_transformed, x=x_name, y=y_name, color=best_labels, hover_name=original_df.index)
            fig.add_trace(scatter.data[0], row=1, col=col)

        # Adjust plot
        fig.update_layout(height=500, width=1000, title_text="Clustered Data",
                          legend=dict(x=0.5, y=-0.1, orientation='h', title='Cluster Labels'))

        fig.update_layout(plot_bgcolor='rgba(0,0,0,0)')
        fig.show()

    # Print the cluster means and HDBSCAN silhouette score
    print('\n')
    print('Cluster means:')
    print(hdbscan_means)
    print('\n')
    print('Silhouette score:')
    print(hdbscan_silhouette_score)

    return hdbscan_means, hdbscan_df, hdbscan_silhouette_score



def descriptive_clust(hdbscan_df):
    """
    Compute cluster means from the HDBSCAN output DataFrame (which includes a cluster label
    for each SKU), min-max scale them across clusters and show them as a spider diagram.
    The noise cluster (-1) takes part in the scaling but is not drawn.

    :parameters:
    - HDBSCAN DataFrame

    :return:
    - None (the spider diagram is displayed)

    """

    # Group by cluster and compute mean values for selected columns of interest
    columns_of_interest = ['average_monthly_sales', 'average_order_size', 'average_value_by_order', 'unique_customer_per_item', 'average_monthly_quantity', 'profit_margin', 'Net Weight', 'Reorder Quantity', 'Unit Cost', 'First Purch Order Quantum', 'average_days_between_sales', 'Customer_life_time', 'average_monthly_sales_growth', 'sales_variability']
    hdbscan_means = hdbscan_df.groupby('cluster')[columns_of_interest].mean()

    # Scale the cluster means to [0, 1] per metric using MinMaxScaler prior to spider diagram
    scaler = MinMaxScaler()
    scaled_means = scaler.fit_transform(hdbscan_means)
    scaled_means = pd.DataFrame(scaled_means, columns=columns_of_interest, index=hdbscan_means.index)

    # Filter out cluster -1 (outliers); tolerate results that contain no noise cluster
    scaled_means = scaled_means.drop(index=-1, errors='ignore')

    # Define a dictionary to map each cluster to a specific color.
    # Alpha is defined on [0, 1]; the previous value of 100 was out of range.
    cluster_colors = {-1: 'rgba(41, 57, 71, 1)',
                      0: 'rgba(93, 112, 127, 1)',
                      1: 'rgba(80, 177, 200, 1)',
                      2: 'rgba(154, 177, 197, 1)',
                      3: 'rgba(70, 115, 171, 1)',
                      4: 'rgba(28, 59, 95, 1)'}
    # Used for any cluster id without a dedicated color, instead of failing with a KeyError
    fallback_color = 'rgba(128, 128, 128, 1)'

    # Create spider diagram
    fig = go.Figure()
    for cluster in scaled_means.index:
        values = scaled_means.loc[cluster].values.tolist()
        cluster_color = cluster_colors.get(cluster, fallback_color)
        fig.add_trace(go.Scatterpolar(
            r=values,
            theta=columns_of_interest,
            fill='toself',
            name=f'Cluster {cluster}',
            line=dict(width=3),
            marker=dict(size=8, color=cluster_color),
            opacity=0.7,
            fillcolor=cluster_color
        ))

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 1]
            )
        ),

        title=dict(
            text='Cluster Means - Spider Diagram',
            x=0.5,
            y=0.95
        ),

        plot_bgcolor='rgba(255, 255, 255, 1)',
        height=800
    )

    fig.update_layout(
        legend=dict(orientation='h', yanchor='bottom', y=-0.1, xanchor='center', x=0.5),
        legend_title=""
    )

    fig.show()



# Clustering without PCA

In [None]:
def main_wo_pca():
    """
    Main function for clustering analysis excluding PCA, including all steps necessary.
    Change path to data and allow access to Gdrive for the function to execute.

    The clustering helpers are called with three arguments (data, scaler, original data),
    i.e. this function expects the non-PCA variants of `perform_hca_clustering` and
    `perform_hdbscan_clustering` to be the ones currently defined.

    :parameters:
    - None

    :return:
    - hca_means: Cluster means from HCA clustering
    - hca_df: DataFrame with the HCA cluster label attached to each item
    - hca_ss: HCA clustering silhouette score
    - hdbscan_means: Cluster means from HDBSCAN clustering
    - hdbscan_df: DataFrame with the HDBSCAN cluster label attached to each item
    - hdbscan_ss: HDBSCAN clustering silhouette score
    """

    # Setting seeds for reproducibility. The numerical libraries used downstream
    # draw from NumPy's global RNG, which Python's `random` module does not control.
    random.seed(42)
    np.random.seed(42)

    # Mount Gdrive so the function also works when it is the first thing run in a session
    drive.mount('/content/drive')

    # Load data. Path to folder should be adjusted
    df = pd.read_csv('/content/drive/MyDrive/Master Thesis/final_updated_clustering_data.csv')

    # Preprocess data
    df_preprocessed, scaler = preprocess(df)

    # Each clustering step receives its own copy of the scaled data, so one step can never
    # alter the input of the next one

    # Perform HCA clustering
    hca_means, hca_df, hca_ss = perform_hca_clustering(df_preprocessed.copy(), scaler, df_preprocessed)

    # Perform HDBSCAN clustering
    hdbscan_means, hdbscan_df, hdbscan_ss = perform_hdbscan_clustering(df_preprocessed.copy(), scaler, df_preprocessed)

    return hca_means, hca_df, hca_ss, hdbscan_means, hdbscan_df, hdbscan_ss


def text_preprocessing(df):
    """
    Preprocessing function for textual data to extract additional information through clustering.

    The '_description' column is tokenized (punctuation stripped, lower-cased, English stopwords
    removed, Porter-stemmed), TF-IDF weighted, embedded with UMAP (n_neighbors=15) and clustered
    with HDBSCAN (min_cluster_size=10). The top words and a word cloud are shown for every cluster.

    :parameters:
    - Dataframe with column '_description' containing textual data

    :return:
    - Copy of the DataFrame where '_description' holds the token lists and an additional
      column 'cluster' holds the text cluster labels (the input DataFrame is left untouched)
    """

    # Initiate constant objects
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    punctuation_table = str.maketrans('', '', string.punctuation)

    # Work on a copy so the caller's frame (often a filtered slice) is never written to
    df = df.copy()

    def _tokenize(text):
        # Strip punctuation, lower-case, split on whitespace, drop stopwords, then stem
        tokens = text.translate(punctuation_table).lower().split()
        return [stemmer.stem(word) for word in tokens if word not in stop_words]

    # Missing descriptions become empty strings so they yield an empty token list instead of crashing
    df['_description'] = df['_description'].fillna('').astype(str).apply(_tokenize)

    # Perform TF-IDF weighting on the text column
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_data = tfidf_vectorizer.fit_transform(df['_description'].apply(" ".join))

    # Dimensionality reduction using UMAP
    umap_model = umap.UMAP(n_neighbors=15, min_dist=0.5, metric='euclidean', random_state=42)
    umap_data = umap_model.fit_transform(tfidf_data)

    # Clustering using HDBSCAN
    clusterer = hdbscan.HDBSCAN(min_cluster_size=10)
    cluster_labels = clusterer.fit_predict(umap_data)

    # Add clustered column to the DataFrame
    df['cluster'] = cluster_labels

    # Extract 10 most common words for each cluster + create wordcloud for each cluster with 50 most common words
    for label in df['cluster'].unique():

        print(f"Cluster {label}:")
        cluster_df = df.loc[df['cluster'] == label]
        word_counts = collections.Counter(word for doc in cluster_df['_description'] for word in doc)

        # Find 10 most common words
        top_10_words = word_counts.most_common(10)
        print('Top 10 words in clusters:')
        print(top_10_words)

        # Create word cloud
        print(f'Wordcloud for cluster {label}')
        if not word_counts:
            # WordCloud cannot be generated from an empty frequency table
            print('No words available for this cluster, skipping word cloud')
            continue
        top_words_dict = dict(word_counts.most_common(50))
        wordcloud = WordCloud(width = 800, height = 800, background_color ='white', stopwords = stop_words, min_font_size = 10).generate_from_frequencies(top_words_dict)

        # Plot word cloud
        plt.figure(figsize = (8, 8), facecolor = None)
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.tight_layout(pad = 0)
        plt.show()

    return df

def preprocess(df):
    """
    Dedicated preprocessing function for handling numerical, categorical, and textual data
    (using the dedicated text_preprocessing function) for the clustering without PCA.

    :parameters:
    - DataFrame used for clustering

    :return:
    - Preprocessed DataFrame indexed by '_ItemNumber'
    - Scaler object from MinMaxScaler (fitted on the numerical metric columns) for later inverting
    """

    # Numerical metric columns that are min-max scaled
    numeric_columns = ['average_monthly_sales','average_order_size','average_value_by_order','unique_customer_per_item','average_monthly_quantity','profit_margin','Net Weight','Reorder Quantity','Unit Cost','First Purch Order Quantum','average_days_between_sales','Customer_life_time','average_monthly_sales_growth','sales_variability']

    # Keep only rows with a finite, non-negative profit margin
    df = df[(df['profit_margin'] >= 0) & (df['profit_margin'] != float('inf')) & (df['profit_margin'] != float('-inf'))]

    # Drop redundant columns
    exclude_columns = ["Unnamed: 0","BDLRI Sales Status"]
    df = df.loc[:, ~df.columns.isin(exclude_columns)]

    # One-hot encode categorical variables
    df = pd.concat([df, pd.get_dummies(df['Country_Region of Origin Code'], prefix='Country')], axis=1).drop('Country_Region of Origin Code', axis=1)

    # Preprocess description using separate function; this adds the text cluster label column 'cluster'
    # NOTE(review): 'cluster' stays in the feature set as a raw integer label here, whereas the
    # PCA pipeline's preprocess one-hot encodes it - confirm this difference is intended.
    df = text_preprocessing(df)

    # Scale the numerical columns
    scaler = MinMaxScaler()
    df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

    # Remove rows without an item number, zero-fill remaining NaNs and index by item number
    df = df.dropna(subset=['_ItemNumber'])
    df = df.fillna(0)
    df = df.set_index('_ItemNumber')
    df.columns = df.columns.astype(str)

    # Drop description as it has been preprocessed earlier
    df = df.drop(columns=['_description'])

    return df, scaler



def perform_hca_clustering(df_transformed, scaler, original_df):
    """
    Perform HCA on a preprocessed DataFrame. After clustering, scaling is reverted on a copy of the
    metric columns to get true mean values for clusters. The input DataFrame is not modified.

    :parameters:
    - Preprocessed (scaled) dataframe
    - MinMaxScaler object
    - Original DataFrame; provides the item-number index

    :return:
    - HCA_means: Cluster means from HCA clustering
    - HCA_df: DataFrame with the unscaled metrics, the HCA cluster label and the item number
    - hca_silhouette_score: Silhouette score of the selected HCA clustering
    """

    print('\n')
    print(' --------------- HIERARCHICAL CLUSTERING ---------------')

    # Set seed for reproducibility
    random.seed(42)

    # Define range of number of clusters to evaluate
    n_clusters_range = range(2, 6)

    # Silhouette score and labels for each candidate number of clusters
    scores = []
    labels_per_candidate = []

    # Loop over each number of clusters, perform HCA, and keep its silhouette score and labels
    for n_clusters in n_clusters_range:
        hca = AgglomerativeClustering(n_clusters=n_clusters)
        hca.fit(df_transformed)
        scores.append(silhouette_score(df_transformed, hca.labels_))
        labels_per_candidate.append(hca.labels_)

    # Plot the silhouette scores across number of clusters to examine closer
    plt.plot(n_clusters_range, scores)
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette score')
    plt.show()

    # Choose the number of clusters with the highest silhouette score
    best_position = int(np.argmax(scores))
    best_n_clusters = n_clusters_range[best_position]
    print(f"Best number of clusters: {best_n_clusters}")

    # Agglomerative clustering is deterministic, so the labels and score from the search are reused.
    # The score therefore refers to the scaled data the clustering was actually run on.
    best_labels = labels_per_candidate[best_position]
    hca_silhouette_score = scores[best_position]

    # Revert MinMax scaling only on the relevant columns, working on a copy so the caller's
    # scaled data stays intact for any later clustering step
    columns_of_interest = ['average_monthly_sales','average_order_size','average_value_by_order','unique_customer_per_item','average_monthly_quantity','profit_margin','Net Weight','Reorder Quantity','Unit Cost','First Purch Order Quantum','average_days_between_sales','Customer_life_time','average_monthly_sales_growth','sales_variability']
    HCA_df = df_transformed[columns_of_interest].copy()
    HCA_df[columns_of_interest] = scaler.inverse_transform(HCA_df[columns_of_interest])

    # Attach the cluster numbers and item numbers (by position)
    HCA_df['cluster'] = best_labels
    HCA_df['item_number'] = original_df.index

    # Group by cluster and compute mean values for all columns of interest
    HCA_means = HCA_df.groupby('cluster')[columns_of_interest].mean()

    # Print the cluster means and HCA silhouette score
    print('\n')
    print('Cluster means:')
    print(HCA_means)
    print('\n')
    print('Silhouette score:')
    print(hca_silhouette_score)

    return HCA_means, HCA_df, hca_silhouette_score



def perform_hdbscan_clustering(df_transformed, scaler, original_df,
                               min_cluster_size_range=range(20, 50),
                               min_samples_range=range(5, 20)):
    """
    Perform HDBSCAN on a preprocessed DataFrame. After clustering, the scaling of the
    columns of interest is reverted so that the cluster means are in original units.

    The input DataFrame is NOT modified: the inverse scaling is written to a new
    DataFrame, so the same scaled data can be passed to other clustering functions.

    :parameters:
    - df_transformed: Preprocessed (scaled) DataFrame that is clustered
    - scaler: Fitted scaler object whose ``inverse_transform`` accepts the columns of interest
      (assumed to have been fitted on exactly those columns - TODO confirm against the caller)
    - original_df: Original DataFrame; its index is attached as ``item_number``
    - min_cluster_size_range: Iterable of ``min_cluster_size`` candidates (default 20..49)
    - min_samples_range: Iterable of ``min_samples`` candidates (default 5..19)

    :return:
    - hdbscan_means: Cluster means (original units) from HDBSCAN clustering; cluster -1 holds the outliers
    - hdbscan_df: Columns of interest (original units) with ``cluster`` and ``item_number`` attached
    - hdbscan_silhouette_score: Silhouette score of the best clustering, computed on the scaled
      data exactly like during tuning (-1 if fewer than two real clusters were found)

    :raises:
    - ValueError: if one of the hyperparameter ranges is empty
    """
    print('\n')
    print(' --------------- HDBSCAN ---------------')

    # Set seed for reproducibility
    # NOTE(review): this only seeds Python's `random` module - confirm HDBSCAN actually consumes it
    random.seed(42)

    def _silhouette_or_sentinel(labels):
        # A silhouette score needs at least two real clusters; the noise label (-1) does not count.
        # Clusterings that do not qualify get the worst possible score (-1) instead of raising.
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        if n_clusters > 1:
            return silhouette_score(df_transformed, labels)
        return -1

    # Tune over the min_cluster_size and min_samples parameters using the silhouette score as metric
    sil_scores = []
    for min_cluster_size in min_cluster_size_range:
        for min_samples in min_samples_range:
            candidate = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples)
            candidate.fit(df_transformed)
            sil_scores.append((min_cluster_size, min_samples, _silhouette_or_sentinel(candidate.labels_)))

    if not sil_scores:
        raise ValueError('min_cluster_size_range and min_samples_range must both be non-empty')

    # Extract best params separately to use in the final HDBSCAN fit (first best entry wins on ties)
    best_min_cluster_size, best_min_samples, _ = max(sil_scores, key=lambda x: x[2])
    print(f'Best min_cluster_size value: {best_min_cluster_size}')
    print(f'Best min_samples value: {best_min_samples}')

    # Perform HDBSCAN clustering with the optimal parameters
    hdbscan_clustering = hdbscan.HDBSCAN(min_cluster_size=best_min_cluster_size, min_samples=best_min_samples)
    hdbscan_clustering.fit(df_transformed)
    labels = hdbscan_clustering.labels_

    # Score the final clustering on the scaled data, i.e. BEFORE the scaling is reverted,
    # so the reported score is comparable to the scores used for tuning
    hdbscan_silhouette_score = _silhouette_or_sentinel(labels)

    # Revert MinMax scaling only on the relevant columns, into a new dataframe
    columns_of_interest = ['average_monthly_sales','average_order_size','average_value_by_order','unique_customer_per_item','average_monthly_quantity','profit_margin','Net Weight','Reorder Quantity','Unit Cost','First Purch Order Quantum','average_days_between_sales','Customer_life_time','average_monthly_sales_growth','sales_variability']
    hdbscan_df = pd.DataFrame(
        scaler.inverse_transform(df_transformed[columns_of_interest]),
        columns=columns_of_interest,
        index=df_transformed.index,
    )

    # Attach the cluster numbers and the item numbers (positionally, row by row)
    hdbscan_df['cluster'] = labels
    hdbscan_df['item_number'] = original_df.index

    # Group by cluster and compute mean values for selected columns of interest
    hdbscan_means = hdbscan_df.groupby('cluster')[columns_of_interest].mean()  # -1 are outliers

    # Print the cluster means and HDBSCAN silhouette score
    print('\n')
    print('Cluster means:')
    print(hdbscan_means)
    print('\n')
    print('Silhouette score:')
    print(hdbscan_silhouette_score)

    return hdbscan_means, hdbscan_df, hdbscan_silhouette_score

# Composite index

In [None]:
def calculate_composite_index(df, impact_weights=None, risk_weights=None):
    """
    Function for calculating and plotting composite index mapping of two dimensions
    (impact and risk). Each score is the weighted average of the MinMax-scaled columns
    of its dimension. A 2D scatter plot of the clusters is shown as a side effect.

    :parameters:
    - df: DataFrame used for composite index, one row per cluster with the cluster label as index
    - impact_weights: Optional dict {column name: weight} for the impact dimension
      (defaults to the weighting defined below)
    - risk_weights: Optional dict {column name: weight} for the risk dimension
      (defaults to the weighting defined below)

    :return:
    - result_df: Full cluster DataFrame including two new columns, one with impact score and one
      with risk score; the cluster label is moved from the index into the first column
      (named 'index' unless ``df`` already has a column with that name)
    - result_impact_risk: DataFrame with impact/risk scores for all clusters, indexed by cluster label

    :raises:
    - ValueError: if the weights of a dimension are empty or sum to zero
    """

    # Initiate weights for the respective dimensions
    if impact_weights is None:
        impact_weights = {'Customer_life_time': 0.25, 'average_monthly_sales': 0.20, 'average_monthly_quantity': 0.15,
                          'average_value_by_order': 0.15, 'profit_margin': 0.10, 'average_monthly_sales_growth': 0.05,
                          'average_order_size': 0.05, 'Unit Cost': 0.05}

    if risk_weights is None:
        risk_weights = {'average_days_between_sales': 0.25, 'sales_variability': 0.25, 'unique_customer_per_item': 0.25,
                        'Net Weight': 0.05, 'Reorder Quantity': 0.05, 'First Purch Order Quantum': 0.05}

    # Scale the features. The scaled frame keeps the index of df so that the scores line up
    # with their cluster no matter which labels are used (-1.., 0.., non-contiguous, ...)
    scaler = MinMaxScaler()
    scaled_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

    def _weighted_score(weights):
        # Weighted average of the scaled columns; dividing by the weight total keeps the
        # score within [0, 1] even when the weights do not sum to exactly 1
        total_weight = sum(weights.values())
        if total_weight == 0:
            raise ValueError('weights must be non-empty and must not sum to zero')
        return (scaled_df[list(weights.keys())] * list(weights.values())).sum(axis=1) / total_weight

    # Calculate scores
    impact_score = _weighted_score(impact_weights)
    risk_score = _weighted_score(risk_weights)

    # Create a new dataframe with the cluster means and their impact and risk scores
    result_df = pd.concat([df, impact_score, risk_score], axis=1)
    result_df.columns = list(df.columns) + ['Impact score', 'Risk score']
    # Drop the index name (on a new frame, df itself is left untouched) so that
    # reset_index below yields the 'index' column
    result_df = result_df.rename_axis(None)

    # Create a new dataframe with just the index and the impact and risk scores
    result_impact_risk = result_df[['Impact score', 'Risk score']]

    # Define a dictionary to map each cluster to a specific color
    cluster_colors = {-1: 'rgba(41, 57, 71, 1)',
                      0: 'rgba(128, 128, 128, 1)',
                      1: 'rgba(93, 112, 127, 1)',
                      2: 'rgba(155, 170, 181, 1)',
                      3: 'rgba(80, 177, 200, 100)',
                      4: 'rgba(70, 115, 171, 100)'}

    # Reset the index of the result_df to make the index a column
    result_df = result_df.reset_index()
    label_column = result_df.columns[0]

    # Plotly Express only applies color_discrete_map to non-numeric columns, so the plot uses
    # a copy in which the cluster labels are strings, with matching string keys in the map
    plot_df = result_df.copy()
    plot_df[label_column] = plot_df[label_column].astype(str)
    discrete_colors = {str(cluster): color for cluster, color in cluster_colors.items()}

    # Create a scatter plot with color-coded points for each cluster
    fig = px.scatter(plot_df, x='Impact score', y='Risk score', color=label_column,
                     color_discrete_map=discrete_colors, width=600, height=600)

    # Update the layout of the scatter plot
    fig.update_layout(
        plot_bgcolor='white',
        legend=dict(
            orientation='h',
            yanchor='bottom',
            y=1.05,
            xanchor='right',
            x=1
        ),
        xaxis=dict(range=[0, 1]),
        yaxis=dict(range=[0, 1])
    )
    fig.update_traces(marker={'size': 25})
    fig.show()

    return result_df, result_impact_risk


