# Notebook for generating FAQ with Hierarchical Agglomerative Clustering

## Setup

In [None]:
import os
import json
import requests
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
import pandas as pd
from pyspark.sql.functions import col, explode, collect_list, struct, udf, concat, monotonically_increasing_id, to_json, when, from_json, concat_ws, current_timestamp, lit
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, ArrayType, FloatType
from bs4 import BeautifulSoup
from synapse.ml.featurize.text import PageSplitter
from abc import ABC, abstractmethod 
import openai
import numpy as np
import re
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.clustering import KMeans
from pyspark.ml import Pipeline 
from nltk.corpus import stopwords 

import uuid 
from transformers import AutoTokenizer

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

from datetime import datetime, timedelta
import httpx

## Load data

In [None]:
# SAS URL of the blob with JSON file
sas_url = "https://aka.ms/funwithteams"

# Download the blob content using the SAS URL.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (expired SAS token, 404, ...) right
# here instead of as a confusing "Invalid JSON" message further down.
response = requests.get(sas_url, timeout=60)
response.raise_for_status()
blob_content = response.content

# Parse the JSON content (json.loads accepts the raw bytes directly)
try:
    json_content = json.loads(blob_content)
    print("JSON is valid.")
except (json.JSONDecodeError, UnicodeDecodeError) as e:
    # UnicodeDecodeError covers payloads that are not valid UTF-8/16/32 text
    print(f"Invalid JSON: {e}")
    json_content = None

# Print the first 20 lines of the pretty-printed JSON content if valid
if json_content:
    json_str = json.dumps(json_content, indent=4)
    json_lines = json_str.split('\n')
    for line in json_lines[:20]:
        print(line)

## EDA

In [None]:
# Normalize the JSON data into a flat table; nested object keys become
# dot-separated column names (e.g. 'Thread.parent.value', used further below)
df = pd.json_normalize(json_content)

# Show the first 10 rows of the DataFrame (rendered as the cell output)
df.head(10)

## Extract conversational texts from Teams Json

In [None]:
# Function to extract 'body->content' from the parsed parent-message list
def extract_body_content(json_str):
    """Return the ``body.content`` text of the first message in ``json_str``.

    Args:
        json_str: Despite the name, an already-parsed sequence of message
            dicts (the function indexes it with ``[0]["body"]["content"]``),
            not a JSON string.

    Returns:
        The ``["body"]["content"]`` value of the first element, or ``None``
        when the value is missing, empty, or not shaped as expected.
    """
    try:
        return json_str[0]["body"]["content"]
    except (KeyError, IndexError, TypeError) as e:
        # KeyError: 'body'/'content' absent; IndexError: empty list;
        # TypeError: NaN/None cell or an element that is not subscriptable.
        print(e)
        return None

# Apply the function to create a new column 'parent_post_text' with the
# body content taken from each row's 'Thread.parent.value'
df['parent_post_text'] = df['Thread.parent.value'].apply(extract_body_content)

# Show the first 30 rows (rendered as the cell output)
df.head(30)

## Example of a message that contains a tag of a person's name

In [None]:
# Display full content of parent_post_text in row with id = messages_Chemistry_20250320_055839_07f3eb8e
# (.values yields a NumPy array holding the value of every matching row)
full_content = df.loc[df['Id'] == 'messages_Chemistry_20250320_055839_07f3eb8e', 'parent_post_text'].values
print(full_content)


## Segment parent posts into refined questions with GPT

In [None]:
#Define the function to segment post into questions
def segment_post_into_questions(parent_post_text):
    """Ask the chat model to rewrite a post as a JSON array of questions.

    Args:
        parent_post_text: Raw text of the parent post. ``None``, non-string
            values (e.g. NaN) and blank strings are treated as "no questions".

    Returns:
        The model's raw answer as a string (expected to be a JSON array such
        as ``[{"question": "..."}]``), ``"[]"`` when there is no text to
        analyse, or ``'Failure'`` when all attempts failed.
    """
    # Nothing to segment: answer with the same empty array the prompt asks
    # the model to return, without spending a request (and without crashing
    # on the string concatenation below for non-string cells).
    if not isinstance(parent_post_text, str) or not parent_post_text.strip():
        return "[]"

    import time

    max_attempts = 10

    prompt = """
    You are given a message thread of someone addressing a team with one or more questions. Take it, remove individual names and other "fluff" and 
    break it down into concise self-contained questions, presented in json form of the following structure: 
    [{"question": "How to synthesize a protein with specific properties?"}].
    If the message contains no questions, return empty JSon array, without comments.
    """
    messages = [
        {
            "role": "system",
            "content": prompt
        },
        {
            "role": "user",
            "content": "Message: " + parent_post_text + "\n\n",
        },
    ]

    attempts = 0
    result = ''

    print(f"start to work on input: {parent_post_text}")
    while attempts < max_attempts and result == '':
        attempts += 1
        try:
            response = openai.ChatCompletion.create(
                #deployment_id='gpt-35-turbo-0125', # see the note in the cell below for an alternative deployment_id.
                deployment_id="gpt-4-32k",
                messages=messages,
                temperature=0,
            )

            # The content may come back as None; normalise it to '' so the
            # loop retries instead of returning None to the caller.
            result = response.choices[0].message.content or ''
        except Exception as e:
            print(e)
            # Back off before retrying, but do not sleep after the final
            # attempt - nothing follows it.
            if attempts < max_attempts:
                print(f"sleeping, attempt {attempts}")
                time.sleep(attempts * 19)

    return result if result != '' else 'Failure'

Test the function on a post that is a statement, without any embedded questions

In [None]:
# Smoke test: the text is a statement, so per the prompt the model should answer with an empty JSON array
segment_post_into_questions("'Catalysts play a critical role in lowering the activation energy of a chemical reaction, allowing it to proceed more quickly and efficiently without being consumed in the process. In industrial applications like the Haber process for ammonia production, iron-based catalysts are used to facilitate the reaction between nitrogen and hydrogen under optimized temperature and pressure conditions.'")

Test the function on a "normal" type of post, with one question

In [None]:
# Smoke test: a post with one question and two <at> person tags, which the prompt asks the model to strip
segment_post_into_questions("Hey team, let's discuss how water's unique properties (like high specific heat, surface tension, and being a universal solvent) arise from its molecular structure and hydrogen bonding. <at id=\"0\">Sam Carter</at>, <at id=\"1\">Kim Lee</at>, what are your thoughts?")

Apply the function to our dataset of ~1k conversations

In [None]:
#df_segmented = df.head(30)  # alternative: run on the first 30 rows only
# NOTE: this is an alias, not a copy - the 'questions' column is added to df as well
df_segmented = df
# One call to segment_post_into_questions per row
df_segmented['questions'] = df_segmented['parent_post_text'].apply(segment_post_into_questions)
df_segmented

Sample result:

In [None]:
# Display the 'questions' value (raw model answer) of the row with id = messages_Chemistry_20250320_053736_cf657902
questions = df_segmented.loc[df_segmented['Id'] == 'messages_Chemistry_20250320_053736_cf657902', 'questions'].values
print(questions)

## Expand to have an individual row for each question

In [None]:
# Function to expand questions into multiple rows
def expand_questions(value):
    """Parse a model answer into a list of question objects.

    Args:
        value: Text expected to hold a JSON array such as
            ``[{"question": "..."}]``. A Markdown code fence around the JSON
            is tolerated.

    Returns:
        The parsed list. A single JSON object is wrapped in a one-element
        list so that a later ``explode`` yields one row for it. Returns ``[]``
        when the value is not a string/bytes, is not valid JSON (for example
        the 'Failure' marker), or parses to something other than a list or
        object.
    """
    if isinstance(value, str):
        value = value.strip()
        if value.startswith("```"):
            # Drop a leading ```/```json line and the trailing ``` marker.
            value = re.sub(r"^```[a-zA-Z]*\s*|\s*```$", "", value)

    try:
        questions_json = json.loads(value)
    except (json.JSONDecodeError, TypeError):
        # TypeError: None / NaN / any other non-text cell.
        return []

    if isinstance(questions_json, dict):
        return [questions_json]
    return questions_json if isinstance(questions_json, list) else []
 
# Parse each model answer in 'questions' with expand_questions
df_segmented['questions_json'] = df_segmented['questions'].apply(expand_questions)
# explode() emits one row per list element; an empty list becomes a single row holding NaN
expanded_df = df_segmented.explode('questions_json').reset_index(drop=True)
# Show the first 20 rows
expanded_df.head(20)

In [None]:
# (rows, columns) of the exploded frame
expanded_df.shape

## Clean rows that have no questions

In [None]:
# Filter the dataframe to non-empty values of questions_json.
# .copy() makes filtered_df an independent frame, so adding columns to it
# later neither triggers pandas' SettingWithCopyWarning nor risks writing
# into a temporary slice of expanded_df.
filtered_df = expanded_df[expanded_df['questions_json'].notna()].copy()
filtered_df

In [None]:
# (rows, columns) left after dropping rows whose questions_json is NaN
filtered_df.shape

In [None]:
# Display the 'questions_json' values of the rows with id = messages_Chemistry_20250320_051407_e3396f52,
# together with the type of the container returned by .values
questions = filtered_df.loc[filtered_df['Id'] == 'messages_Chemistry_20250320_051407_e3396f52', 'questions_json'].values
print(questions, type(questions))

## Crack JSON open to get the refined questions out

In [None]:
def expand_questions(value):
    """Return the ``'question'`` text of one parsed question object.

    NOTE: this definition reuses (and therefore replaces) the name of the
    ``expand_questions`` parser defined in an earlier cell.

    Args:
        value: A mapping such as ``{"question": "..."}``.

    Returns:
        ``value['question']``, or ``''`` when the key is missing or ``value``
        is not subscriptable by key (None, NaN, a bare string, ...).
    """
    try:
        return value['question']
    except (KeyError, TypeError):
        # Indexing an already-parsed object never raises JSONDecodeError;
        # these are the failures that can actually occur here.
        return ''

# Make a column 'question_refined' with value of 'question'
# (each questions_json cell is passed to expand_questions, which reads its 'question' key)
filtered_df['question_refined'] = filtered_df['questions_json'].apply(expand_questions)
filtered_df

In [None]:
# (rows, columns) after adding the 'question_refined' column
filtered_df.shape

## Shuffle rows in the dataset - don't have to do it, but makes it more interesting!

In [None]:
# sample(frac=1) returns every row in random order; no random_state is set, so
# the order differs from run to run. reset_index(drop=True) renumbers rows from 0.
shuffled_df = filtered_df.sample(frac=1).reset_index(drop=True)
shuffled_df

## Embed the content of questions with "text-embedding-ada-002" from Azure OpenAI

In [None]:
def generate_embedding(text):
    """Embed ``text`` with the "text-embedding-ada-002" deployment.

    Args:
        text: The string to embed. ``None``, non-string values (e.g. NaN) and
            blank strings are not sent to the service.

    Returns:
        The embedding taken from ``data[0].embedding`` of the response, or
        ``[]`` when there is nothing to embed or every attempt failed.
    """
    # Blank input cannot produce a useful embedding; without this guard such
    # a row would go through all retries and their back-off sleeps.
    if not isinstance(text, str) or not text.strip():
        return []

    import time

    deployment_id = "text-embedding-ada-002"
    max_attempts = 10
    attempts = 0
    result = []

    print(f"start to work on input: {text}")
    while attempts < max_attempts and result == []:
        attempts += 1
        try:
            query_embedding = openai.Embedding.create(deployment_id=deployment_id, input=text)
            result = query_embedding.data[0].embedding
        except Exception as e:
            print(e)
            # Back off before retrying, but do not sleep after the final
            # attempt - nothing follows it.
            if attempts < max_attempts:
                print(f"sleeping, attempt {attempts}")
                time.sleep(attempts * 19)

    return result

Apply the embeddings function to all rows

In [None]:
# One generate_embedding call per row of 'question_refined'
shuffled_df['embeddings'] = shuffled_df['question_refined'].apply(generate_embedding)
# NOTE: second name for the same object (no copy is made)
shuffled_df_with_embeddings = shuffled_df
shuffled_df_with_embeddings.head(10)

Check that we have cleaned all rows which have no questions

In [None]:
pdf = shuffled_df_with_embeddings
# Number of rows whose 'question_refined' is null (None/NaN); empty strings are not counted
none_count = pdf['question_refined'].isnull().sum()
none_count

In [None]:
# Keep only rows that can actually be clustered:
#  * the refined question must be present and non-blank (a failed extraction
#    yields '' rather than NaN, which dropna alone would not remove), and
#  * the embedding must be a non-empty sequence (a failed embedding call
#    yields [], which would make the distance matrix ragged).
has_question = pdf['question_refined'].notna() & (
    pdf['question_refined'].astype(str).str.strip() != ''
)
has_embedding = pdf['embeddings'].apply(
    lambda emb: isinstance(emb, (list, tuple, np.ndarray)) and len(emb) > 0
)
# reset_index returns a new frame, so later column assignments on
# pdf_cleaned do not write into a slice of pdf.
pdf_cleaned = pdf[has_question & has_embedding].reset_index(drop=True)

Transform to numeric format:

In [None]:
# Turn every stored embedding into a NumPy array of numeric values
def _as_numeric_array(values):
    """Build an array from ``values``, coercing each element with pd.to_numeric."""
    return np.array([pd.to_numeric(item, errors='coerce') for item in values])


pdf_cleaned['Embedding_list'] = pdf_cleaned['embeddings'].apply(_as_numeric_array)

pdf_cleaned

# THIS IS IT!! The Beautiful Clustering.... 🌸🪷🌼🌻🌷🪻

In [None]:
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist
import matplotlib.pyplot as plt

# Calculate the condensed cosine distance matrix (1 - cosine similarity)
cosine_distances = pdist(pdf_cleaned['Embedding_list'].tolist(), metric='cosine')

# Ward linkage is only correctly defined for Euclidean distances, so cosine
# distances must not be passed to it directly. For L2-normalised vectors
# ||a - b||^2 = 2 * (1 - cos(a, b)), hence sqrt(2 * cosine_distance) is exactly
# the Euclidean distance between the normalised embeddings. The clip guards
# against tiny negative values caused by floating-point rounding.
euclidean_on_unit_sphere = np.sqrt(np.clip(2.0 * cosine_distances, 0.0, None))

# Perform hierarchical clustering (Ward) on the Euclidean-equivalent distances
Z = linkage(euclidean_on_unit_sphere, method='ward')

# Plot the dendrogram
plt.figure(figsize=(10, 7))
dendrogram(Z)
plt.title('Hierarchical Clustering Dendrogram (Cosine Similarity)')
plt.xlabel('Sample index')
plt.ylabel('Distance')
plt.show()

Let's pick 100 clusters

In [None]:
# Number of flat clusters to request; passed as num_clusters to display_clusters below
clusters_count = 100

Print which questions ended up in each cluster, from 1st to 100th...

In [None]:
from scipy.cluster.hierarchy import fcluster

# Function to display clusters and their members
def display_clusters(Z, pdf_cleaned, num_clusters=200):
    """Cut the hierarchy into flat clusters, label the rows and print them.

    Args:
        Z: Linkage matrix passed to ``scipy.cluster.hierarchy.fcluster``.
        pdf_cleaned: DataFrame with one row per clustered observation and the
            columns 'Id', 'question_refined', 'Theme' and 'parent_post_text'.
            It is modified in place: a 'Cluster' column holding the flat
            cluster label of every row is added (or overwritten).
        num_clusters: Maximum number of flat clusters to form
            (``criterion='maxclust'``).

    Returns:
        None. The clusters are printed in ascending label order.
    """
    # Create clusters from the linkage matrix
    clusters = fcluster(Z, num_clusters, criterion='maxclust')

    # Add cluster labels to the DataFrame
    pdf_cleaned['Cluster'] = clusters

    # Iterate in sorted order: plain set iteration order is an implementation
    # detail and does not guarantee a "1st to Nth" listing.
    for cluster in sorted(set(clusters)):
        print(f"\nCluster {cluster}:")
        cluster_data = pdf_cleaned[pdf_cleaned['Cluster'] == cluster]
        for _, row in cluster_data.iterrows():
            print(f"\n\nThread id:  {row['Id']}, Question: {row['question_refined']}, Theme:  {row['Theme']}, Original post: {row['parent_post_text']}")

# Display clusters and their members; the call also adds the 'Cluster' column
# to pdf_cleaned, which the following cells read
display_clusters(Z, pdf_cleaned, num_clusters=clusters_count)

Now display the contents of each cluster in descending order by size (to help us find most frequently asked questions!)

In [None]:
# Count the number of members in each cluster (value_counts sorts by count, largest first)
cluster_counts = pdf_cleaned['Cluster'].value_counts()

# Print the number of elements in each cluster followed by the questions inside it
print("\nNumber of elements in each cluster and examples of questions inside the clusters:")
for cluster, count in cluster_counts.items():
    print(f"\n\nCluster {cluster}: {count} elements")
    cluster_data = pdf_cleaned[pdf_cleaned['Cluster'] == cluster]
    for index, row in cluster_data.iterrows():  # Prints every member of the cluster (no limit is applied)
        print(f"\n\nThread id:  {row['Id']}, Question: {row['question_refined']}, Theme:  {row['Theme']}, Original post: {row['parent_post_text']}")


# Summarize the contents of each cluster - compress groups of questions with the help of GPT

In [None]:
def find_common_theme(text, verbose=1, questions_only = 1):
    """Ask the chat model to condense a group of questions into FAQ entries.

    Args:
        text: The questions (and optional context) of one cluster as a single
            string.
        verbose: Currently unused; kept for backward compatibility.
        questions_only: When truthy, ask only for aggregated question(s);
            otherwise also ask for a topic and a set of keywords.

    Returns:
        The model's answer as a string, or ``'Failure'`` when ``text`` is
        empty/blank or all attempts failed.
    """
    # Nothing to summarise - report failure without calling the service.
    if text is None or not str(text).strip():
        return 'Failure'

    import time

    max_attempts = 10

    # Craft the prompt
    if questions_only:
        prompt = f"Please find common theme in the given questions and context:\
                \n\n{text}\n\n\
                Based on that, formulate one aggregated most popular question (ideally) or few questions based on trends in the provided data. Answer Question(s): <> "
    else:
        prompt = f"Please find common theme in the given questions and context:\
                \n\n{text}\n\n\
                Formulate common theme as a topic,  a set of keywords and  one aggregated most popular question (ideally) or few questions based on trends in the provided data. Answer as Topic: <>,  Keywords: <>, Question(s): <> "

    messages = [
        {
            "role": "system",
            "content": """You are a helpful  assistant who will be provided text information to generate FAQ \
                """
        },
        {
            "role": "user",
            "content": "question:" + prompt,
        },
    ]

    attempts = 0
    result = ''

    while attempts < max_attempts and result == '':
        attempts += 1
        try:
            response = openai.ChatCompletion.create(
                #deployment_id='gpt-35-turbo-0125', # see the note in the cell below for an alternative deployment_id.
                deployment_id="gpt-4-32k",
                messages=messages,
                temperature=0,
            )

            # The content may come back as None; normalise it to '' so the
            # loop retries instead of returning None to the caller.
            result = response.choices[0].message.content or ''
        except Exception as e:
            print(e)
            # Back off before retrying, but do not sleep after the final
            # attempt - nothing follows it.
            if attempts < max_attempts:
                print(f"sleeping, attempt {attempts}")
                time.sleep(attempts * 19)

    return result if result != '' else 'Failure'


Apply the function only to clusters that have more than 5 questions

In [None]:
# Size of every cluster
cluster_counts = pdf_cleaned['Cluster'].value_counts()

# Keep clusters with more than 5 entries, largest first
filtered_clusters = cluster_counts[cluster_counts > 5].sort_values(ascending=False)

# Collect one (summary, size) pair per retained cluster
themes = []
for cluster, count in filtered_clusters.items():
    cluster_data = pdf_cleaned[pdf_cleaned['Cluster'] == cluster]

    # Label every question / original post of the cluster
    questions = [f"Question: {text}" for text in cluster_data['question_refined']]
    parent_posts = [f"Post: {text}" for text in cluster_data['parent_post_text']]

    # Only the questions are sent to the model; the joined posts are kept
    # in a variable for inspection
    result = "  Questions: " + " | ".join(questions)
    theme = find_common_theme(result)
    parent_posts = "  Parent posts: " + " | ".join(parent_posts)
    themes.append((theme, count))

print("~~~FAQ~~~~")
for theme, num in themes:
    print(theme, f" (asked {num} times)\n =================== \n")

## Augment with answers - homework exercise 😊🙌