### imports

In [1]:
import json
import warnings
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import clear_output
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

# Silence every warning category for the whole notebook session.
# NOTE(review): this also hides deprecation and convergence warnings raised by the
# libraries used below - consider narrowing the filter.
warnings.filterwarnings("ignore")

# Sentence-embedding model used to encode the post captions
# (loaded by name: 'sentence-transformers/all-MiniLM-L6-v2').
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


  from .autonotebook import tqdm as notebook_tqdm


### global variables

In [2]:
def print_process(discription_str, current_iteration, num_of_iterations):
    """
    Replace the current cell output with a one-line progress message.

    input: - discription_str: text shown before the percentage
           - current_iteration: how many iterations have been completed so far
           - num_of_iterations: total number of iterations

    The printed line has the form "<discription_str> <percent>%", where percent is
    the integer part of current_iteration / num_of_iterations * 100.
    When num_of_iterations is 0 there is nothing to iterate over, so 100% is shown
    instead of raising ZeroDivisionError.
    """
    clear_output(wait=True)
    if num_of_iterations:
        percent = int((current_iteration / num_of_iterations) * 100)
    else:
        # empty workload: report it as complete rather than dividing by zero
        percent = 100
    print(discription_str + " " + str(percent) + "%")

In [3]:
# ------------- configurations -------------
# inclusive range of cluster counts scanned when searching for the optimal number of clusters
MIN_CLUSTERS = 50
MAX_CLUSTERS = 200

# the name of the column that includes the captions
CAPTIONS_COLUMN_NAME = "Caption"
# the name of the column that includes (or will include) the posts captions embeddings
EMBEDDINGS_COLUMN_NAME = "caption_embedding"


# ------------- files urls -------------
# folders (relative to the notebook) for the files the notebook writes and reads
script_outputs_folder_url = './output-data'
script_inputs_folder_url = './input-data'

# raw posts, posts with caption embeddings, and profiles input files
posts_data_file_url = f"{script_inputs_folder_url}/nfl_posts_by_hashtag.parquet"
embedded_posts_file_url = f"{script_inputs_folder_url}/embedded_nfl_posts_by_hashtag.parquet"
profiles_data_file_url = f"{script_inputs_folder_url}/nfl_profiles.parquet"

# ------------- flags -------------
# when True, captions are re-embedded and the embedded posts parquet file is rewritten
embed_posts_captions = False
# these 2 flags must be turned on together (otherwise the stored results won't be synchronized)
calculate_optimal_clusters_number = True
calculate_ith_profiles_clusters_cover = True

# ------------- global variables -------------
SEPERATOR = "========================================================================================================="
# placeholder; the real value is assigned in the "calculating optimal number of clusters" cell
OPTIMAL_CLUSTERS_NUMBER = 0

## embedding
#### ------------------------------------------------------------------------------------------------------------------------------------------------------------

### embedding posts captions and storing results in a parquet file

In [4]:
if embed_posts_captions:
    # reading NFL posts from a parquet file into a dataframe
    posts_df = pd.read_parquet(posts_data_file_url)

    # list to save "clean" captions
    clean_caption_list = list()
    # list to save "clean" captions embeddings
    clean_caption_embedding_list = list()

    total_posts = len(posts_df)

    # embedding posts captions using the SentenceTransformer model.
    # A positional counter (not the dataframe index label) drives the progress display,
    # so the percentage is correct for any index and never divides by zero.
    for position, raw_caption in enumerate(posts_df[CAPTIONS_COLUMN_NAME], start=1):
        # a missing caption (None / NaN) is treated as empty text instead of crashing
        caption_text = raw_caption if isinstance(raw_caption, str) else ""

        # creating the "clean" caption: dropping all non-ASCII chars ...
        clean_caption = caption_text.encode("ascii", errors="ignore").decode("ascii")
        # ... and collapsing all extra spaces and new-lines
        clean_caption = ' '.join(clean_caption.split())
        clean_caption_list.append(clean_caption)

        # embedding the post "clean" caption
        clean_caption_embedding = model.encode(clean_caption)
        clean_caption_embedding_list.append(list(clean_caption_embedding))

        # printing embedding process percentage
        print_process("Embedding posts captions...", position, total_posts)

    # adding new columns (clean caption + embeddings) into the dataframe
    posts_df["Clean_Caption"] = clean_caption_list
    posts_df[EMBEDDINGS_COLUMN_NAME] = clean_caption_embedding_list
    # storing the dataframe into a parquet file
    posts_df.to_parquet(embedded_posts_file_url)
    print("Done saving posts with embedding to parquet file successfully")
else:
    print("Embedding posts captions proccess is disabled!.")


Embedding posts captions proccess is disabled!.


#### reading posts details (data + embedded captions) and snowballed nfl teams profiles

In [5]:
# reading profiles data from the input parquet file and dropping incomplete rows
profiles_df = pd.read_parquet(profiles_data_file_url).dropna()

# reading posts (data + caption embeddings) from the input parquet file and dropping incomplete rows.
# The index is rebuilt as 0..n-1 because later cells address posts by their position
# (cluster label arrays, per-cluster index lists); gaps left by dropna would misalign them.
posts_df = pd.read_parquet(embedded_posts_file_url).dropna().reset_index(drop=True)

In [6]:
# column summary followed by the first five profile rows
profiles_df.info()
display(profiles_df.head())

<class 'pandas.core.frame.DataFrame'>
Index: 11325 entries, 0 to 11326
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   11325 non-null  object 
 1   Nickname             11325 non-null  object 
 2   Post_Count           11325 non-null  float64
 3   Follower_Count       11325 non-null  float64
 4   Following_Count      11325 non-null  int64  
 5   Posts_Count_In_Week  11325 non-null  int64  
 6   Engagement           11325 non-null  int64  
dtypes: float64(2), int64(3), object(2)
memory usage: 707.8+ KB


Unnamed: 0,ID,Nickname,Post_Count,Follower_Count,Following_Count,Posts_Count_In_Week,Engagement
0,28995773,bleacherreport,53286.0,22282444.0,713,169,42912907
1,205593849,nfl,59871.0,29921744.0,2204,161,33917101
2,1254997058,houseofhighlights,31428.0,51962379.0,2185,110,28233366
3,253785656,nflnetwork,16008.0,3984113.0,506,77,12237502
4,1254050784,espnnfl,22638.0,3948795.0,603,85,11310141


In [7]:
# column summary followed by the first five post rows
posts_df.info()
display(posts_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12518 entries, 0 to 12517
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   ID                 12518 non-null  object
 1   Caption            12518 non-null  object
 2   Owner_ID           12518 non-null  object
 3   Likes_Count        12518 non-null  int64 
 4   Comments_Count     12518 non-null  int64 
 5   publication_Date   12518 non-null  object
 6   Clean_Caption      12518 non-null  object
 7   caption_embedding  12518 non-null  object
dtypes: int64(2), object(6)
memory usage: 782.5+ KB


Unnamed: 0,ID,Caption,Owner_ID,Likes_Count,Comments_Count,publication_Date,Clean_Caption,caption_embedding
0,3321924617986818393,The Bills are re-signing DT DaQuan Jones. 2 ye...,9174240239,-1,-1,2024-03-12,The Bills are re-signing DT DaQuan Jones. 2 ye...,"[-0.05456706, -0.028283583, -0.012744162, -0.0..."
1,3321927450987513283,🔥Customized Sharing（49/100）🔥\n✈delivered\n\nBi...,51714893107,-1,-1,2024-03-12,Customized Sharing49/100 delivered Bills jerse...,"[-0.06672458, 0.113558196, -0.02331945, -0.052..."
2,3321937043117426576,The Bills have resigned Daquan Jones on a 2 ye...,52222828237,22,-1,2024-03-12,The Bills have resigned Daquan Jones on a 2 ye...,"[-0.047176495, 0.012292344, -0.010614566, -0.0..."
3,3321969411785703617,DT Daquon Jones bleibt für zwei weitere Jahre ...,38454064620,4,-1,2024-03-12,DT Daquon Jones bleibt fr zwei weitere Jahre u...,"[-0.06138219, -0.038200036, -0.03786822, -0.05..."
4,3321989952400201853,Khalil Shakir Orange 🍊 62/249 from 2023 Panini...,8128714351,7,-1,2024-03-12,Khalil Shakir Orange 62/249 from 2023 Panini P...,"[-0.040837407, -0.014204753, -0.0047751833, -0..."


# profiles quality
#### -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

#### utility functions (will be used in the next cells)

In [8]:
def calculate_clusters_covered_percent_by_profiles(input_profiles, input_clusters_number, input_cluster_labels,
                                                   input_posts_df=None, input_posts_ids=None):
    """
    input: - input_profiles: a collection (array / list / set) of profile ids
           - input_clusters_number: the number of clusters that we have
           - input_cluster_labels: posts clusters label array (the cluster each post was labeled with)
           - input_posts_df: dataframe with "ID" and "Owner_ID" columns;
                             defaults to the notebook-level posts_df
           - input_posts_ids: post ids, position i matching input_cluster_labels[i];
                              defaults to the notebook-level posts_id_arr

    the function returns the percentage (0-100) of clusters covered by the profiles.
    A cluster is covered when at least one of its posts is owned by one of the profiles.
    Returns 0.0 when input_clusters_number is not positive.
    """
    if input_posts_df is None:
        input_posts_df = posts_df
    if input_posts_ids is None:
        input_posts_ids = posts_id_arr

    if input_clusters_number <= 0:
        return 0.0

    labels = np.asarray(input_cluster_labels)

    # post id -> owner id, built once; when a post id repeats the first row wins
    first_rows = input_posts_df.drop_duplicates(subset="ID", keep="first")
    owner_by_post_id = dict(zip(first_rows["ID"], first_rows["Owner_ID"]))

    # set membership is O(1), unlike scanning an array for every post
    profile_ids = set(input_profiles.tolist() if hasattr(input_profiles, "tolist") else input_profiles)

    # for every labeled post: is its owner one of the profiles?
    post_ids = np.asarray(input_posts_ids)[:len(labels)]
    owned_by_profile_mask = np.array(
        [owner_by_post_id.get(post_id) in profile_ids for post_id in post_ids],
        dtype=bool,
    )

    # clusters that contain at least one such post
    covered_clusters = np.unique(labels[owned_by_profile_mask])
    count = int(np.count_nonzero((covered_clusters >= 0) & (covered_clusters < input_clusters_number)))

    return ((count / input_clusters_number) * 100)


def find_optimal_num_clusters(vectors, min_clusters=1, max_clusters=10):
    """
    Find the optimal number of clusters using the elbow method.

    For every cluster count in [min_clusters, max_clusters] a KMeans model is fitted;
    its within-cluster sum of squares (WCSS) and the percentage of clusters covered by
    the notebook-level profiles_id_arr are recorded and written to
    "optimal-num-of-clusters-calculation-results.csv" in the outputs folder.

    Args:
    - vectors (array-like): one embedding vector per post.
    - min_clusters (int): Minimum number of clusters to consider.
    - max_clusters (int): Maximum number of clusters to consider (inclusive).

    Returns:
    - optimal_num_clusters (int): the cluster count at which the WCSS curve bends
      the most (largest second difference). When fewer than three cluster counts
      are scanned no bend can be measured and min_clusters is returned.

    Raises:
    - ValueError: when max_clusters is smaller than min_clusters.
    """
    if max_clusters < min_clusters:
        raise ValueError("max_clusters must be greater than or equal to min_clusters")

    clusters_range = range(min_clusters, max_clusters + 1)
    total_steps = len(clusters_range)

    # WCSS and clusters-covered percentage for every scanned number of clusters
    wcss = []
    clusters_covered = []

    for step, num_clusters in enumerate(clusters_range, start=1):
        kmeans = KMeans(n_clusters=num_clusters, random_state=42)
        kmeans.fit(vectors)
        wcss.append(kmeans.inertia_)
        # calculating the percentage of clusters covered by profiles
        clusters_covered.append(calculate_clusters_covered_percent_by_profiles(profiles_id_arr, num_clusters, kmeans.labels_))

        # printing process percentage (completed steps out of all steps, so it ends at 100%)
        print_process("Calculating optimal number of clusters...", step, total_steps)

    # saving results into dataframe
    optimal_clustering_details_df = pd.DataFrame({"clusters_number":clusters_range,
                                               "wcss":wcss,
                                               "clusters_percentage_covered": clusters_covered}).set_index('clusters_number')
    # saving dataframe into a csv file
    optimal_clustering_details_df.to_csv(f"{script_outputs_folder_url}/optimal-num-of-clusters-calculation-results.csv")

    # a second difference needs at least three points
    if len(wcss) < 3:
        return min_clusters

    # Find the elbow point: second_diff[i] measures the bend of the curve at
    # cluster count (min_clusters + i + 1); the elbow is where that bend is largest.
    second_diff = np.diff(wcss, 2)
    optimal_num_clusters = int(np.argmax(second_diff)) + min_clusters + 1

    return optimal_num_clusters


def print_optimal_num_of_clusters_calculation_graphs(input_range, input_wcss, input_clusters_covered):
    """
    Draw two line charts that share the same x values, one figure each.

    input: -input_range: the scanned numbers of clusters (x axis of both charts)
           -input_wcss: wcss value for each number of clusters
           -input_clusters_covered: percent of clusters covered by profiles for each number of clusters

    The first figure is the elbow curve, the second is the covered-clusters percentage.
    """
    # (y values, y-axis label, title) of each chart, in drawing order
    charts = (
        (input_wcss, 'Within-cluster sum of squares (WCSS)', 'Elbow Method'),
        (input_clusters_covered, 'Covered clusters percent', 'Clusters Covered By Profiles'),
    )

    for y_values, y_label, chart_title in charts:
        plt.figure()
        plt.plot(input_range, y_values)
        plt.xlabel('Number of clusters')
        plt.ylabel(y_label)
        plt.title(chart_title)
        plt.show()

## building KMeans model for the clustering

#### reading relevant columns to build the model from dataframe 

In [9]:
# embeddings column of the posts dataframe, stacked into a single nparray
posts_embedding_vectors_arr = np.array(posts_df[EMBEDDINGS_COLUMN_NAME].tolist())

# posts id column of the posts dataframe as a nparray
posts_id_arr = np.array(posts_df['ID'].tolist())

# profiles id column of the profiles dataframe as a nparray
profiles_id_arr = np.array(profiles_df['ID'].tolist())


#### calculating optimal number of clusters

In [None]:
if not calculate_optimal_clusters_number:
    print("Calculating optional number of clusters proccess is disabled!.")
    # reuse the value stored by the last calculation
    OPTIMAL_CLUSTERS_NUMBER = pd.read_csv(f"{script_outputs_folder_url}/optimal-clusters-number.csv")["Value"].iloc[0]
else:
    # scan the configured range with the elbow method
    OPTIMAL_CLUSTERS_NUMBER = find_optimal_num_clusters(posts_embedding_vectors_arr, MIN_CLUSTERS, MAX_CLUSTERS)
    # store the result so later runs can skip the calculation
    optimal_clusters_num_df = pd.DataFrame({"-":"Optimal number of clusters", 'Value': [str(OPTIMAL_CLUSTERS_NUMBER)]}).set_index('-')
    optimal_clusters_num_df.to_csv(f"{script_outputs_folder_url}/optimal-clusters-number.csv")

Calculating optimal number of clusters... 53%


In [None]:
# showing the stored optimal number of clusters as a dataframe
optimal_clusters_num_df = pd.read_csv(f"{script_outputs_folder_url}/optimal-clusters-number.csv")
optimal_clusters_num_df = optimal_clusters_num_df.set_index('-')
display(optimal_clusters_num_df)

# loading the per-cluster-count measurements written by the optimal-number calculation
optimal_clustering_details_df = pd.read_csv(f"{script_outputs_folder_url}/optimal-num-of-clusters-calculation-results.csv")

# plotting the elbow curve and the clusters covered percentage
print_optimal_num_of_clusters_calculation_graphs(
    *(optimal_clustering_details_df[column_name]
      for column_name in ("clusters_number", "wcss", "clusters_percentage_covered"))
)

#### building kmeans model with optimal number of clusters found

In [None]:
# KMeans model with the optimal number of clusters, fitted to the embedding vectors
kmeans = KMeans(n_clusters=OPTIMAL_CLUSTERS_NUMBER, random_state=42)
kmeans.fit(posts_embedding_vectors_arr)

# cluster label assigned to each vector
cluster_labels = kmeans.labels_

# posts indexes per cluster
# --> posts_indexes_per_cluster_lst[i] holds the indexes of the posts that belong to cluster i
posts_indexes_per_cluster_lst = list()
for cluster in range(OPTIMAL_CLUSTERS_NUMBER):
    members_of_cluster = np.flatnonzero(cluster_labels == cluster).tolist()
    posts_indexes_per_cluster_lst.append(members_of_cluster)


#### covered clusters (by snowballed profiles) percent

In [None]:
# calculating snowballed profiles quality percent
percent = calculate_clusters_covered_percent_by_profiles(profiles_id_arr, OPTIMAL_CLUSTERS_NUMBER, cluster_labels)

#printing result using dataframe
display(pd.DataFrame({"-":"Clusters Covered", 'Percentage': ["{:.2f}".format(percent)+"%"]}).set_index('-'))

#### i'th profiles quality graph (covered clusters percent for the first i profiles)

In [None]:
if calculate_ith_profiles_clusters_cover:
    # reading posts Owner_ID column from posts dataframe into a nparray
    posts_owners_id_arr = np.array(posts_df['Owner_ID'].apply(np.array).tolist())

    # number of profiles to scan
    profiles_count = len(profiles_id_arr)

    # owner id -> set of clusters that contain at least one post of that owner.
    # Built once up front: it does not depend on how many profiles are considered.
    clusters_by_owner_dict = dict()
    for cluster in range(OPTIMAL_CLUSTERS_NUMBER):
        for idx in posts_indexes_per_cluster_lst[cluster]:
            clusters_by_owner_dict.setdefault(posts_owners_id_arr[idx], set()).add(cluster)

    # list to save percentage results (entry i-1 = coverage of the first i profiles)
    percentage_lst = list()
    # clusters covered by the profiles scanned so far; grows as profiles are added
    covered_clusters_set = set()
    # progress is refreshed roughly once per percent instead of once per profile
    progress_step = max(1, profiles_count // 100)

    for curr_num_of_profiles in range(1, profiles_count + 1):
        # adding the clusters covered by the newly included profile
        newest_profile_id = profiles_id_arr[curr_num_of_profiles - 1]
        covered_clusters_set |= clusters_by_owner_dict.get(newest_profile_id, set())

        percentage_lst.append((len(covered_clusters_set) / OPTIMAL_CLUSTERS_NUMBER) * 100)

        # printing process percentage
        if curr_num_of_profiles % progress_step == 0 or curr_num_of_profiles == profiles_count:
            print_process("Calculating profiles quality...", curr_num_of_profiles, profiles_count)

    # creating a list that includes the numbers 1 to len(profiles_id_arr)
    amount_of_profiles_lst = list(range(1, len(profiles_id_arr) + 1))

    # creating dataframe with results
    covered_clusters_results_df = pd.DataFrame({'profiles_count': amount_of_profiles_lst,
                                   'percentage': percentage_lst})
    # saving results into a csv file
    covered_clusters_results_df.to_csv(f"{script_outputs_folder_url}/covered-clusters-i-profiles.csv")


# plot first i'th profiles cover results graph (read back from the stored csv,
# so the graph is also available when the calculation flag is off)
covered_clusters_results_df = pd.read_csv(f"{script_outputs_folder_url}/covered-clusters-i-profiles.csv")
plt.plot(covered_clusters_results_df["profiles_count"], covered_clusters_results_df["percentage"])
plt.xlabel("amount of profiles")  # Set x-axis label
plt.ylabel("Clusters Covered percentage %")  # Set y-axis label
plt.show()

#### creating clusters captions dataframe

In [None]:
# captions as a plain list so that posts are addressed by position, matching the
# positional indexes stored in posts_indexes_per_cluster_lst
all_captions_list = posts_df[CAPTIONS_COLUMN_NAME].tolist()
# cluster_captions_list[i] will hold the captions of the posts in cluster i
cluster_captions_list = list()
columns_names = list()

# scanning clusters to collect the captions of each cluster
for cluster in range(OPTIMAL_CLUSTERS_NUMBER):
    # extracting all captions that belong to the current cluster
    cluster_captions_list.append([all_captions_list[idx] for idx in posts_indexes_per_cluster_lst[cluster]])
    columns_names.append(f"cluster_{cluster+1}")

# One column per cluster. Clusters differ in size, so each column is built as its own
# Series; shorter columns are padded with missing values instead of longer ones being cut.
clusters_captions_df = pd.DataFrame({
    column_name: pd.Series(captions, dtype=object)
    for column_name, captions in zip(columns_names, cluster_captions_list)
})

#### saving dataframe into parquet file

In [None]:
# quick look at the clusters captions dataframe, then store it as a parquet file
clusters_captions_df.info()
display(clusters_captions_df.head())
clusters_captions_file_url = f"{script_outputs_folder_url}/clusters-captions.parquet"
clusters_captions_df.to_parquet(clusters_captions_file_url)

## Naming clusters
#### --------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### method 1: most common triple words in cluster (using black list)

#### Preparing a blacklist of words that we wish to exclude from our cluster name tuples

In [None]:
# word groups that carry no topical meaning and must not appear in cluster names
articles = ["a", "an", "the"]
conjunctions = ["and", "but", "or", "nor", "for", "yet", "so", "after", "although", "as", "because", "before", "if", "since", "though", "unless", "until", "when", "where", "while"]
prepositions = ["aboard", "about", "above", "across", "after", "against", "along", "amid", "among", "around",
                "as", "at", "before", "behind", "below", "beneath", "beside", "between", "beyond", "but",
                "by", "concerning", "considering", "despite", "down", "during", "except", "excepting",
                "excluding", "following", "for", "from", "in", "inside", "into", "like", "near", "of", "off",
                "on", "onto", "out", "outside", "over", "past", "regarding", "round", "since", "through",
                "throughout", "till", "to", "toward", "towards", "under", "underneath", "until", "unto",
                "up", "upon", "versus", "via", "with", "within", "without"]
pronouns = ["I", "you", "he", "she", "it", "we", "they", "me", "you", "him", "her", "us", "them",
            "myself", "yourself", "himself", "herself", "itself", "ourselves", "yourselves", "themselves",
            "who", "whom", "whose", "which", "what", "that", "whichever", "whatever", "whoever", "whomever",
            "this", "these", "those", "someone", "somebody", "something", "anyone", "anybody", "anything",
            "everyone", "everybody", "everything", "no one", "nobody", "nothing", "each", "either", "neither",
            "one", "other", "another", "such", "much", "few", "both", "all", "any", "some", "several", "none",
            "every", "many", "more", "most", "enough", "little", "fewer", "fewest", "least", "I", "you", "he",
            "she", "it", "we", "they"]
possessive_determiners = ["my", "your", "his", "her", "its", "our", "your", "their"]
randoms = ["is", "are", "yes", "no", "http", "https", "has", "had", "will", "would", "was", "were", "not",
           "be", "just", "been", "do", "does", "has", "have"]

# The blacklist is compared against lower-cased words, so every entry is stored
# lower-cased as well (otherwise an entry such as "I" could never match).
common_words_set = {
    word.lower()
    for word in (articles + conjunctions + prepositions + pronouns + possessive_determiners + randoms)
}

#### converting each caption into a list of unique words

In [None]:
# a list to store aggresive cleaned posts text - removing symbols and numbers
clean_captions_list = list()
# a list to save each caption words 
clean_captions_unique_words_list = list()

# scanning posts dataframe to aggresive clean posts captions
for idx, row in posts_df.iterrows():
    # aggresive cleaning the texts, removing all numbers and symbols
    # reading caption
    caption = row[CAPTIONS_COLUMN_NAME]
    # removing all non-alphabeta chars
    clean_caption = ''.join([char if char.isalpha() else ' ' for char in caption])
    clean_caption = ' '.join([word for word in clean_caption.split() if (word.isalpha() and len(word) >=3)])
    clean_captions_list.append(clean_caption)


for i in range(len(clean_captions_list)):
    # coverting clean post texts into a unique list of words
    clean_captions_unique_words_list.append(list(set([word for word in clean_captions_list[i].lower().split() if (word not in common_words_set)])))


#### scanning clusters to find the most common 3-word tuple

In [None]:
# list to save clusters name
clusters_name_lst = list()

# scanning clusters
for cluster in range(OPTIMAL_CLUSTERS_NUMBER):
    # init a dictionary to count words tuple appearances
    words_tupple_count_dict = dict()
    # extracting cluster caption sentences (each caption is a list of words)
    cluster_captions_words = [clean_captions_unique_words_list[idx] for idx in posts_indexes_per_cluster_lst[cluster]]

    # scanning all captions in cluster to count words tuples appearances
    for caption_words in cluster_captions_words:
        # checking if caption has enough words
        if len(caption_words) >= 3:
            for idx1 in range(len(caption_words)-2):
                for idx2 in range(idx1+1, len(caption_words)-1):
                    for idx3 in range(idx2+1, len(caption_words)):
                        words_tuple = (caption_words[idx1], caption_words[idx2], caption_words[idx3])
                        current_count = words_tupple_count_dict.get(words_tuple, 0)
                        words_tupple_count_dict[words_tuple] = current_count + 1
    
    # adding cluster name tuple to list
    clusters_name_lst.append(max(words_tupple_count_dict, key=words_tupple_count_dict.get))
    
    # printing proccess percentage
    print_process("Producing names to clusters...", cluster, OPTIMAL_CLUSTERS_NUMBER - 1)

#### printing clusters names

In [None]:
# one row per cluster: its number and the name produced for it
clusters_names_df = pd.DataFrame({'cluster number': list(range(OPTIMAL_CLUSTERS_NUMBER)),
                                  'cluster name': clusters_name_lst }).set_index("cluster number")

# stored next to the other outputs of the notebook (the previously referenced
# data_set_folder_url is not defined anywhere in the notebook)
clusters_names_file_url = f"{script_outputs_folder_url}/cluster-names.csv"
clusters_names_df.to_csv(clusters_names_file_url)
display(clusters_names_df)

#### Creating cluster's captions dataframe, saving it to file and printing it

### method 2: tf-idf

#### creating clusters documents

In [None]:
# clusters_documents_list[i] will hold the document (all captions joined) of cluster i
clusters_documents_list = list()
# captions as a plain list so that posts are addressed by position, matching the
# positional indexes stored in posts_indexes_per_cluster_lst
all_captions_list = posts_df[CAPTIONS_COLUMN_NAME].tolist()

# scanning clusters to build a document for each cluster
for cluster in range(OPTIMAL_CLUSTERS_NUMBER):
    # extracting all captions that belong to the current cluster
    cluster_captions_list = [all_captions_list[idx] for idx in posts_indexes_per_cluster_lst[cluster]]
    # creating the current cluster document (all captions joined together);
    # appended, because the list starts empty and has no slot to assign into
    clusters_documents_list.append(' '.join(cluster_captions_list))


In [None]:
# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_df=0.5)

# Compute TF-IDF scores
# the return value is a matrix:
#   - row i describe the tfidf score of each word in document i
#   - column j describe the tfidf score for word j in each document
# the order of the columns correspond to the words list we receive from vectorizer.get_feature_names_out() function
tfidf_score_array = vectorizer.fit_transform(clusters_documents_list).toarray()

# Get feature names (terms)
feature_names = vectorizer.get_feature_names_out()

In [None]:
# tf-idf scores as a dataframe with one column per term and one row per document:
# terms_tfidf_score_in_documents_df(i,j) = tf-idf score of term j in document i
terms_tfidf_score_in_documents_df = pd.DataFrame(data=tfidf_score_array, columns=feature_names)

display(terms_tfidf_score_in_documents_df)

In [None]:
# listing every term together with its column position
for i, word in enumerate(feature_names):
    print(f"{i}. {word}")

In [None]:
# Always-failing assertion: stops a "Run All" here so the cells below do not execute.
assert 1==0, "end of program :)"

In [None]:
# Scratch cell: code fragments that would be needed for the tf-idf method.
# NOTE(review): this cell is not runnable as written - `cluster_dict_key` and
# `data_set_folder_url` are not defined anywhere in the visible notebook, and
# `cluster` / `row` are only whatever earlier loops happened to leave behind.

# dict of each cluster's captions (key - cluster number, value - captions list)
# will be used to save the clusters captions in a csv file and for the tf-idf documents
cluster_captions_dict = dict()

# saving the clean captions of one cluster
# NOTE(review): presumably meant to run inside a loop over clusters - confirm
cluster_captions = [clean_captions_list[idx] for idx in posts_indexes_per_cluster_lst[cluster]]
cluster_captions_dict[cluster_dict_key] = cluster_captions

# soft clean code: keeps only ASCII chars of a caption

clean_caption = ''.join([char for char in row[CAPTIONS_COLUMN_NAME] if ord(char) < 128])
# removing all extra spaces and new-lines
clean_caption = ' '.join([word for word in clean_caption.split()])


### TODO: need to remember what this code is for
# a list to store, for each cluster, its terms with their tf-idf scores
cluster_terms_score_dict_list = list()

for idx in range(len(tfidf_score_array)):
    # term -> tf-idf score in document idx
    cluster_terms_score_dict = dict(zip(feature_names, tfidf_score_array[idx]))
    # removing common words from the dict
    cluster_terms_score_dict = {k: v for k, v in cluster_terms_score_dict.items() if k not in common_words_set}
    # sorting by score in descending order; the result is a list of (term, score) tuples, not a dict
    cluster_terms_score_dict = sorted(cluster_terms_score_dict.items(), key=lambda x: x[1], reverse=True)
    # appending the sorted (term, score) list to the per-cluster list
    cluster_terms_score_dict_list.append(cluster_terms_score_dict)
    
# bare expression: evaluates the entry at index 1 (it is not the last line, so the cell does not display it)
cluster_terms_score_dict_list[1]

#### code to create a df which includes the clusters captions
# creating the df from the dict (one column per key)
cluster_captions_df = pd.concat({key: pd.Series(vals) for key, vals in cluster_captions_dict.items()}, axis=1)
# saving into a csv file
clusters_caption_file_url = data_set_folder_url +"clusters-captions.csv"
cluster_captions_df.to_csv(clusters_caption_file_url)
# display dataframe for fast preview
display(cluster_captions_df.head(5))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# three short example documents for trying out the vectorizer
corpus = [
    'data science is one of the most important fields of science',
    'this is one of the best data science courses',
    'data scientists analyze data',
]

# tf-idf scores of the example documents: one row per document, one column per term
tr_idf_model = TfidfVectorizer(max_df=0.5)
tf_idf_sparse = tr_idf_model.fit_transform(corpus)
tf_idf_array = tf_idf_sparse.toarray()
words_set = tr_idf_model.get_feature_names_out()
df_tf_idf = pd.DataFrame(data=tf_idf_array, columns=words_set)

display(df_tf_idf)

In [None]:
import pandas as pd
import numpy as np

# unique words across all documents of the example corpus
# (the corpus defined in the previous cell; `documents` was never defined)
words_set = set()

for doc in corpus:
    words_set.update(doc.split(' '))

n_docs = len(corpus)          # Number of documents in the corpus
n_words_set = len(words_set)  # Number of unique words in the corpus

# one row per document, one column per word; the columns are given as a sorted list
# because a set has no order and is not accepted as DataFrame columns
df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), columns=sorted(words_set))

# Compute Term Frequency (TF)
for i in range(n_docs):
    words = corpus[i].split(' ') # Words in the document
    for w in words:
        # a single .loc write; chained df_tf[w][i] = ... may update a temporary copy
        df_tf.loc[i, w] += 1 / len(words)
