In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import random as rn
import re
import nltk
import os
import collections
import pickle
from collections import Counter
import umap
import hdbscan
from functools import partial
from hyperopt import fmin, tpe, STATUS_OK, Trials, hp, space_eval
from sentence_transformers import SentenceTransformer
import spacy
from spacy import displacy
import os

# Relative paths used later in the notebook (the input CSV and the pickled topic
# models) are resolved against this Drive folder.
os.chdir('/content/drive/MyDrive/PLP Proj')

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources: 'punkt_tab' (tokenizer data) and the stop-word corpus.
nltk.download('punkt_tab')
nltk.download('stopwords')

# The commented-out block below builds an English + custom stop-word set and a
# `remove_stopwords` helper; it is currently disabled.
# stop_words = set(stopwords.words('english'))
# custom_meme_stopwords = [
#     "game", "games", "play", "player", "get", "one", "good", "really", "even", "would", "like",
#     "http", "https", "cancer", "aids", "autism", "emily", "depression", "grandfather", "gave",
#     "symbol", "lﾉ", "ーﾉ", "hats", "hat", "tea", "coffee", "drill", "fixing", "hand", "hands",
#     "finger", "fingers", "rats", "rat", "goat", "dinosaurs", "lemon", "lemons", "lemonade",
#     "banana", "cake", "sugar", "butter", "cup", "russian", "akbar", "allahu", "hitler", "china", "ccp",
#     "blah", "um", "mhe", "ca"
# ]

# stop_words.update(custom_meme_stopwords)
# def remove_stopwords(text):
#     if not isinstance(text, str):
#         return ""
#     tokens = word_tokenize(text.lower())
#     filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
#     return " ".join(filtered_tokens)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Raise TensorFlow's native log threshold via its environment variable.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Widen pandas' display limits so wide frames and long texts are not truncated.
for _option_name, _option_value in (("display.max_rows", 600),
                                    ("display.max_columns", 500),
                                    ("max_colwidth", 400)):
    pd.set_option(_option_name, _option_value)

# spaCy English pipeline; used below for POS tags, dependency labels and lemmas.
nlp = spacy.load("en_core_web_sm")


In [None]:
# df = pd.read_csv('final_data_v4.csv')
# df['review_text_clean_no_stopwords'] = df['review_text_clean'].apply(remove_stopwords)
# df['review_text_clean_no_stopwords'] = df['review_text_clean_no_stopwords'].dropna()
# df.to_csv('final_data_v4.csv', index=False)

In [None]:
def generate_clusters(message_embeddings,
                      n_neighbors,
                      n_components,
                      min_cluster_size,
                      min_samples = None,
                      random_state = None):
    """
    Reduce the embeddings with UMAP, then cluster the reduced points with HDBSCAN.

    Arguments:
        message_embeddings: embeddings to cluster
        n_neighbors: int, UMAP hyperparameter n_neighbors
        n_components: int, UMAP hyperparameter n_components
        min_cluster_size: int, HDBSCAN hyperparameter min_cluster_size
        min_samples: int or None, HDBSCAN hyperparameter min_samples
        random_state: int or None, seed handed to UMAP

    Returns:
        clusters: fitted HDBSCAN object
    """

    # Step 1: dimensionality reduction (cosine metric on the raw embeddings).
    reducer = umap.UMAP(n_neighbors=n_neighbors,
                        n_components=n_components,
                        metric='cosine',
                        random_state=random_state)
    reduced_embeddings = reducer.fit_transform(message_embeddings)

    # Step 2: density-based clustering (euclidean metric on the reduced space).
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,
                                min_samples=min_samples,
                                metric='euclidean',
                                gen_min_span_tree=True,
                                cluster_selection_method='eom')

    return clusterer.fit(reduced_embeddings)

def score_clusters(clusters, prob_threshold = 0.05):
    """
    Summarise a clustering by its number of labels and its low-confidence share.

    Arguments:
        clusters: clustering object exposing `labels_` and `probabilities_`
        prob_threshold: float, assignments with a probability strictly below
                        this value count as low confidence

    Returns:
        label_count: int, number of distinct values in `labels_`
                     (the noise label, if present, is counted too)
        cost: float, fraction of points whose assignment probability is
              below `prob_threshold`
    """

    labels = clusters.labels_
    n_points = len(labels)
    n_distinct_labels = len(np.unique(labels))

    n_low_confidence = np.count_nonzero(clusters.probabilities_ < prob_threshold)

    return n_distinct_labels, n_low_confidence / n_points

def objective(params, embeddings, label_lower, label_upper):
    """
    Objective function for hyperopt to minimize

    Arguments:
        params: dict, must contain 'n_neighbors', 'n_components' and
                'min_cluster_size'; 'min_samples' and 'random_state' are
                optional and default to None when absent
        embeddings: embeddings to use
        label_lower: int, lower end of range of number of expected clusters
        label_upper: int, upper end of range of number of expected clusters

    Returns:
        dict with keys
            loss: cost from score_clusters plus the out-of-range penalty
            label_count: int, number of unique cluster labels, including noise
            status: hyperopt status (STATUS_OK)
    """

    # min_samples is forwarded so that a search space that samples it actually
    # influences the clustering (previously it was silently dropped).
    clusters = generate_clusters(embeddings,
                                 n_neighbors = params['n_neighbors'],
                                 n_components = params['n_components'],
                                 min_cluster_size = params['min_cluster_size'],
                                 min_samples = params.get('min_samples'),
                                 random_state = params.get('random_state'))

    label_count, cost = score_clusters(clusters, prob_threshold = 0.05)

    # cost is a fraction in [0, 1], so a flat penalty of 1.0 guarantees that a
    # trial outside the desired label range never beats a trial inside it.
    out_of_range = label_count < label_lower or label_count > label_upper
    penalty = 1.0 if out_of_range else 0

    loss = cost + penalty

    return {'loss': loss, 'label_count': label_count, 'status': STATUS_OK}

def bayesian_search(embeddings, space, label_lower, label_upper, max_evals=100):
    """
    Perform bayesian search on hyperparameter space using hyperopt

    Arguments:
        embeddings: embeddings to use
        space: dict of hyperopt search-space expressions; must contain
               'n_neighbors', 'n_components' and 'min_cluster_size', and may
               contain 'min_samples' and 'random_state'
        label_lower: int, lower end of range of number of expected clusters
        label_upper: int, upper end of range of number of expected clusters
        max_evals: int, maximum number of parameter combinations to try

    Returns:
        best_params: dict, the evaluated point of `space` with the lowest loss
        best_clusters: HDBSCAN object re-fitted with `best_params`
        trials: hyperopt Trials object for the search

    Note:
        The best clustering is re-fitted after the search rather than reused
        from the winning trial. UMAP is stochastic, so unless 'random_state'
        is a fixed integer the returned clusters can differ from the ones that
        produced the printed label count.
    """

    trials = Trials()
    fmin_objective = partial(objective,
                             embeddings=embeddings,
                             label_lower=label_lower,
                             label_upper=label_upper)

    best = fmin(fmin_objective,
                space = space,
                algo=tpe.suggest,
                max_evals=max_evals,
                trials=trials)

    # fmin returns indices for hp.choice entries; space_eval maps them back to values.
    best_params = space_eval(space, best)
    print ('best:')
    print (best_params)
    print (f"label count: {trials.best_trial['result']['label_count']}")

    # Refit with exactly the parameters the objective uses, including min_samples.
    best_clusters = generate_clusters(embeddings,
                                      n_neighbors = best_params['n_neighbors'],
                                      n_components = best_params['n_components'],
                                      min_cluster_size = best_params['min_cluster_size'],
                                      min_samples = best_params.get('min_samples'),
                                      random_state = best_params.get('random_state'))

    return best_params, best_clusters, trials

In [None]:
def get_group(df, category_col, category):
    """
    Select the rows of `df` that belong to one category.

    Arguments:
        df: pandas dataframe of documents
        category_col: str, column name holding the category / cluster id
        category: value of `category_col` to keep
    Returns:
        single_category: pandas dataframe with only the matching rows,
                         re-indexed from 0
    """

    in_category = df[category_col] == category
    return df[in_category].reset_index(drop=True)

def most_common(lst, n_words, idf=None):
    """
    Get the most common words in a list, with repeated words boosted by IDF

    Arguments:
        lst: list, each element is a word
        n_words: number of top words to return
        idf: optional dict mapping word -> IDF weight; when omitted the
             module-level `word_IDF` table is used

    Returns:
        list of (word, weighted_count) tuples for the `n_words` highest
        weighted counts
    """
    if idf is None:
        idf = word_IDF  # global table built by compute_IDF elsewhere in the notebook

    counter = collections.Counter(lst)

    for word, count in list(counter.items()):
        # Words seen only once keep their raw count. Repeated words are
        # multiplied by their IDF so that rarer words get a better chance of
        # showing up among the most common ones. A word missing from the IDF
        # table gets the neutral weight 1 instead of raising KeyError.
        if count > 1:
            counter[word] = count * idf.get(word, 1)

    return counter.most_common(n_words)

def extract_labels(category_docs, print_word_counts=False):
    """
    Extract a label from documents in the same cluster by concatenating
    the most common verb, direct object and nouns

    Argument:
        category_docs: list of documents, all from the same category or
                       clustering
        print_word_counts: bool, True will print word counts of each type in this category

    Returns:
        label: str, the most common verb, direct object and two most common
               nouns joined with '_'; missing parts and duplicates are left
               out, and the result is '' when nothing was found

    """

    verbs = []
    dobjs = []
    nouns = []
    adjs = []

    # Collect lemmas for the whole cluster. nlp.pipe processes the documents
    # in batches instead of one nlp() call per document.
    for doc in nlp.pipe(category_docs):
        for token in doc:
            # skip stop words and whitespace-only tokens
            if token.is_stop or len(str(token).strip()) == 0:
                continue

            lemma = token.lemma_.lower()
            if token.pos_ == 'VERB':
                verbs.append(lemma)
            elif token.dep_ == 'dobj':
                dobjs.append(lemma)
            elif token.pos_ == 'NOUN':
                nouns.append(lemma)
            elif token.pos_ == 'ADJ':
                adjs.append(lemma)

    # for printing out for inspection purposes
    if print_word_counts:
        for word_lst in [verbs, dobjs, nouns, adjs]:
            print(collections.Counter(word_lst))

    # take the most common word(s) of each form; '' marks "not available"
    verb = most_common(verbs, 1)[0][0] if verbs else ''
    dobj = most_common(dobjs, 1)[0][0] if dobjs else ''
    top_nouns = [word for word, _ in most_common(nouns, 2)] if nouns else []

    # Concatenate verb-dobj-noun1-noun2, dropping every missing part and every
    # repeated word. (Removing only the first '' used to leave a leading '_'
    # when both verb and dobj were missing.)
    label_words = []
    for word in [verb, dobj] + top_nouns:
        if word and word not in label_words:
            label_words.append(word)

    label = '_'.join(label_words)

    return label

def apply_and_summarize_labels(df, category_col, all_docs=None):
    """
    Derive a text label for every cluster and provide group counts

    Arguments:
        df: pandas dataframe of clustered documents; must have a 'text'
            column and the `category_col` column
        category_col: str, column name corresponding to categories or clusters
        all_docs: optional list of str, the reference corpus the cluster
                  vocabulary is compared against; defaults to every document
                  in df['text'], so the function no longer depends on a
                  global variable defined elsewhere in the notebook

    Returns:
        summary_df: pandas dataframe with the cluster id (`category_col`),
                    the number of documents in each cluster ('count') and the
                    derived label ('label'), sorted by count descending
    """

    if all_docs is None:
        all_docs = df['text'].tolist()

    # map each numerical category to its generated label
    label_dict = {}
    for category in df[category_col].unique():
        category_docs = list(get_group(df, category_col, category)['text'])
        label_dict[category] = extract_labels_improved(category_docs, all_docs)

    # summary dataframe of numerical labels and counts
    summary_df = (df.groupby(category_col)['text'].count()
                    .reset_index()
                    .rename(columns={'text':'count'})
                    .sort_values('count', ascending=False))

    # attach generated labels (vectorised lookup instead of a row-wise apply)
    summary_df['label'] = summary_df[category_col].map(label_dict)

    return summary_df


In [None]:
# Lemmas that are too generic in this review corpus to be useful in a label.
domain_stopwords = ['play', 'game', 'time', 'level', 'player', 'steam']

def extract_labels_improved(category_docs, all_docs, print_word_counts=False):
    """
    Build a cluster label from the words that are most characteristic of it

    Candidate words are the lemmas of verbs, direct objects and nouns in the
    cluster (stop words and `domain_stopwords` excluded). Each candidate is
    scored by

        count_in_cluster * log(relative_freq_in_cluster / relative_freq_overall)

    and only words that are over-represented in the cluster (ratio > 1) are
    kept. Weighting by the in-cluster count stops words that occur once
    (typos, obscure names) from dominating: a bare frequency ratio is maximal
    for any word that appears only in this cluster, however rare.

    Arguments:
        category_docs: list of str, documents of one cluster
        all_docs: list of str, the whole corpus the cluster is compared with
        print_word_counts: bool, True prints the verb / dobj / noun / adj
                           counters for inspection

    Returns:
        label: str, up to three top-scoring words joined with '_'; falls back
               to the most frequent verb and noun when no word qualifies, and
               to "other_topic" when the cluster has no candidates at all
    """
    verbs, dobjs, nouns, adjs = [], [], [], []

    # Single pass over the parsed documents. The categories are not exclusive:
    # a noun used as a direct object lands in both lists.
    for doc in nlp.pipe(category_docs):
        for token in doc:
            if token.is_stop or len(str(token).strip()) == 0:
                continue
            lemma = token.lemma_.lower()
            if lemma in domain_stopwords:
                continue
            if token.pos_ == 'VERB':
                verbs.append(lemma)
            if token.dep_ == 'dobj':
                dobjs.append(lemma)
            if token.pos_ == 'NOUN':
                nouns.append(lemma)
            if token.pos_ == 'ADJ':
                adjs.append(lemma)

    if print_word_counts:
        for word_lst in [verbs, dobjs, nouns, adjs]:
            print(collections.Counter(word_lst))

    # Lemmas are lower-cased, so count lower-cased surface tokens as well.
    all_words_freq = collections.Counter(
        w for doc in all_docs for w in doc.lower().split())
    this_cluster_freq = collections.Counter(
        w for doc in category_docs for w in doc.lower().split())

    # Totals are loop invariants; compute them once.
    all_total = sum(all_words_freq.values())
    cluster_total = sum(this_cluster_freq.values())

    distinctiveness = {}
    for word in set(verbs + dobjs + nouns):
        cluster_count = this_cluster_freq.get(word, 0)
        all_count = all_words_freq.get(word, 0)
        if cluster_count == 0 or all_count == 0:
            continue
        lift = (cluster_count / cluster_total) / (all_count / all_total)
        if lift > 1:
            distinctiveness[word] = cluster_count * np.log(lift)

    # Highest score first; ties are broken alphabetically so labels are stable.
    top_words = sorted(distinctiveness.items(), key=lambda kv: (-kv[1], kv[0]))[:3]
    label_words = [word for word, _ in top_words]

    if not label_words:
        if verbs:
            label_words.append(collections.Counter(verbs).most_common(1)[0][0])
        if nouns:
            top_noun = collections.Counter(nouns).most_common(1)[0][0]
            if top_noun not in label_words:
                label_words.append(top_noun)

    label = '_'.join(label_words)

    if not label:
        label = "other_topic"

    return label


In [None]:
def auto_label_reviews(reviews, topic_model=None, model_path='steam_review_topic_model.pkl'):
    """
    Assign a topic to each review using a previously fitted topic model

    Each review is split into sentences (only sentences with more than four
    whitespace-separated words are kept), the sentences are embedded, projected
    with a UMAP model fitted on the stored training embeddings, and labelled by
    a majority vote over their nearest training points. A review gets the most
    frequent topic among its sentences.

    Arguments:
        reviews: str, pandas Series or other iterable of review texts;
                 entries that are not strings get the topic None
        topic_model: dict with keys 'best_params', 'best_clusters',
                     'cluster_summary' and 'embeddings'; loaded from
                     `model_path` when None
        model_path: str, pickle file to load when `topic_model` is None

    Returns:
        pandas dataframe with columns 'review' and 'topic' (None where no
        topic could be determined)
    """

    if topic_model is None:
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only load model files that you created yourself.
        with open(model_path, 'rb') as f:
            topic_model = pickle.load(f)

    best_params = topic_model['best_params']
    best_clusters = topic_model['best_clusters']
    cluster_summary = topic_model['cluster_summary']
    embeddings = topic_model['embeddings']

    if isinstance(reviews, str):
        reviews = [reviews]
    elif isinstance(reviews, pd.Series):
        reviews = reviews.tolist()
    else:
        reviews = list(reviews)

    # (sentence, index of the review it came from); non-string entries such as
    # None / NaN are skipped instead of crashing the tokenizer.
    sentences_with_origin = [(sent, i) for i, review in enumerate(reviews)
                             if isinstance(review, str)
                             for sent in nltk.sent_tokenize(review)
                             if len(sent.split()) > 4]

    if not sentences_with_origin:
        return pd.DataFrame({'review': reviews, 'topic': [None] * len(reviews)})

    processed_reviews, review_origins = zip(*sentences_with_origin)

    model_st = SentenceTransformer('all-mpnet-base-v2')
    new_embeddings = model_st.encode(list(processed_reviews), show_progress_bar=True)

    # The fitted UMAP model is not stored in the topic model, so it is refitted
    # on the training embeddings and the new sentences are projected into it.
    umap_model = umap.UMAP(
        n_neighbors=best_params['n_neighbors'],
        n_components=best_params['n_components'],
        metric='cosine',
        random_state=best_params['random_state']
    )
    original_umap_embeddings = umap_model.fit_transform(embeddings)
    new_umap_embeddings = umap_model.transform(new_embeddings)

    from sklearn.neighbors import NearestNeighbors

    # Never ask for more neighbours than there are training points.
    k = min(5, len(original_umap_embeddings))
    nbrs = NearestNeighbors(n_neighbors=k).fit(original_umap_embeddings)
    _, indices = nbrs.kneighbors(new_umap_embeddings)

    predicted_labels = []
    for idx_set in indices:
        label_counts = collections.Counter(best_clusters.labels_[i] for i in idx_set)
        # Ignore the noise label (-1) whenever a real cluster is among the neighbours.
        if len(label_counts) > 1 and -1 in label_counts:
            del label_counts[-1]
        if label_counts:
            predicted_labels.append(label_counts.most_common(1)[0][0])
        else:
            predicted_labels.append(-1)

    sentence_results = pd.DataFrame({
        'text': processed_reviews,
        'review_idx': review_origins,
        'cluster_label': predicted_labels
    })

    label_map = dict(zip(cluster_summary['label_st1'], cluster_summary['label']))
    sentence_results['topic'] = sentence_results['cluster_label'].map(label_map)

    # Majority topic per review. Cluster ids missing from label_map come back
    # from .map() as NaN (not None), so they are filtered with pd.notna.
    review_topics = {}
    for review_idx, review_sentences in sentence_results.groupby('review_idx'):
        topics = [t for t in review_sentences['topic'].tolist() if pd.notna(t)]
        if topics:
            review_topics[review_idx] = collections.Counter(topics).most_common(1)[0][0]

    labeled_reviews = pd.DataFrame({
        'review': reviews,
        'topic': [review_topics.get(i) for i in range(len(reviews))]
    })

    return labeled_reviews

In [None]:
TARGET_SAMPLE_SIZE = 100000   # number of reviews to embed and cluster
SAMPLE_SEED = 42

df = pd.read_csv('final_data_v4.csv')

# group_keys=False keeps the original row index on the sampled rows, which is
# what allows them to be excluded from the top-up sample below.
grouped = df.groupby(['app_id', 'review_score'], group_keys=False)

num_groups = grouped.ngroups
samples_per_group = max(1, TARGET_SAMPLE_SIZE // num_groups)

# Stratified sample: up to `samples_per_group` reviews per (app_id, review_score).
df_sampled = grouped.apply(lambda g: g.sample(n=min(len(g), samples_per_group), random_state=SAMPLE_SEED))

# Small groups leave the sample short of the target; top it up with rows that
# were NOT sampled yet. This must happen before the index is reset: dropping
# by a freshly reset 0..n-1 index would remove the first n rows of `df`
# instead of the sampled ones and could duplicate reviews.
if len(df_sampled) < TARGET_SAMPLE_SIZE:
    df_rest = df.drop(index=df_sampled.index)
    remaining = min(TARGET_SAMPLE_SIZE - len(df_sampled), len(df_rest))
    df_extra = df_rest.sample(n=remaining, random_state=SAMPLE_SEED)
    df_sampled = pd.concat([df_sampled, df_extra])

df_sampled = df_sampled.reset_index(drop=True)

print(f"Final sample size: {len(df_sampled)} 行")

# Missing texts become the string 'nan'; being a single word, they are removed
# by the sentence-length filter below.
df_sampled["review_text_clean_no_stopwords"] = df_sampled["review_text_clean_no_stopwords"].astype(str)

# Cluster at sentence level: keep sentences with more than four words.
all_intents = df_sampled.review_text_clean_no_stopwords.tolist()
all_sents = [sent for intent in all_intents
            for sent in nltk.sent_tokenize(intent)
            if len(sent.split()) > 4]
print(len(all_sents))
all_intents = all_sents   # from here on `all_intents` holds sentences, not whole reviews

model_st1 = SentenceTransformer('all-mpnet-base-v2')
# model_st2 = SentenceTransformer('all-MiniLM-L6-v2')
# model_st3 = SentenceTransformer('all-distilroberta-v1')

embeddings_st1 = model_st1.encode(all_intents)

  df_sampled = grouped.apply(lambda g: g.sample(n=min(len(g), samples_per_group), random_state=42))


最终样本数量: 100000 行
84469


In [None]:
# Fixed seed for UMAP. With random_state=None every trial, and the final refit
# inside bayesian_search, produces a different embedding, so the returned
# clusters would not match the trial that was scored and cluster ids would
# change on every run. (A fixed seed makes UMAP slower, as it then runs
# without parallelism.)
RANDOM_STATE = 42

# NOTE: generate_clusters has no parameters for `min_dist` or
# `cluster_selection_epsilon`, so these two entries are sampled by hyperopt
# but do not influence the clustering.
hspace = {
    "n_neighbors": hp.choice('n_neighbors', range(20, 80)),
    "n_components": hp.choice('n_components', range(10, 25)),
    "min_dist": hp.uniform('min_dist', 0.1, 0.5),
    "min_cluster_size": hp.choice('min_cluster_size', range(10, 150)),
    "min_samples": hp.choice('min_samples', range(5, 30)),
    "cluster_selection_epsilon": hp.uniform('cluster_selection_epsilon', 0.01, 0.25),
    "random_state": RANDOM_STATE
}

# Trials whose number of labels falls outside [label_lower, label_upper] are penalised.
label_lower = 20
label_upper = 500
max_evals = 25 # change it to 50 or 100 for extra steps as you wish.

best_params_use, best_clusters_use, trials_use = bayesian_search(embeddings_st1,
                                  space=hspace,
                                  label_lower=label_lower,
                                  label_upper=label_upper,
                                  max_evals=max_evals)



  0%|          | 0/2 [00:00<?, ?trial/s, best loss=?]






 50%|█████     | 1/2 [01:42<01:42, 102.95s/trial, best loss: 0.7037848204666801]






100%|██████████| 2/2 [03:33<00:00, 106.62s/trial, best loss: 0.6492085853981934]
best:
{'cluster_selection_epsilon': 0.10335371058898452, 'min_cluster_size': 110, 'min_dist': 0.10094290349585076, 'min_samples': 9, 'n_components': 19, 'n_neighbors': 79, 'random_state': None}
label count: 24




In [None]:
# One row per clustered sentence: its text and the cluster label it received.
data_clustered = pd.DataFrame(
    data=list(zip(all_intents, best_clusters_use.labels_)),
    columns=['text', 'label_st1'],
)

# Parse every sentence once, then rebuild each sentence from the lower-cased
# lemmas of its verbs, nouns, adjectives and direct objects only.
docs = list(nlp.pipe(all_intents))
sent_with_word_lemma = [
    " ".join(
        tok.lemma_.lower()
        for tok in parsed
        if tok.dep_ == 'dobj' or tok.pos_ in ['VERB', 'NOUN', 'ADJ']
    )
    for parsed in docs
]


def compute_IDF(documents):
    """
    Compute an integer IDF-style weight for every word in the documents

    Arguments:
        documents: iterable of str, each split on whitespace into words

    Returns:
        dict mapping word -> round(log2(total / document_frequency)), where
        document_frequency is the number of documents containing the word and
        total is the sum of all document frequencies (note: not the number of
        documents)
    """
    word_count = Counter()
    for doc in documents:
        # a set, so that each word is counted at most once per document
        word_count.update(set(doc.split()))

    total = sum(word_count.values())

    # log2 is the best choice for our work (feel free to try different functions).
    return {word: round(np.log2(total / doc_freq)) for word, doc_freq in word_count.items()}

# Global word -> weight table built from the lemma-only sentences; `most_common` reads it.
word_IDF = compute_IDF(sent_with_word_lemma)


In [None]:
# Per-cluster counts and generated labels.
cluster_summary = apply_and_summarize_labels(data_clustered, 'label_st1')

# Bundle everything needed to label new reviews later into one dict.
topic_model = dict(
    best_params=best_params_use,
    best_clusters=best_clusters_use,
    cluster_summary=cluster_summary,
    word_IDF=word_IDF,
    embeddings=embeddings_st1,
)

with open('steam_review_topic_model.pkl', 'wb') as f:
    pickle.dump(topic_model, f)

print("Topic model saved to 'steam_review_topic_model.pkl'")

Topic model saved to 'steam_review_topic_model.pkl'


In [None]:
# A few hand-written reviews to sanity-check the saved model.
new_reviews = [
    "The game crashes constantly and the developers don't seem to care about fixing it.",
    "Controls are terrible and the UI is confusing. Waste of money.",
    "Beautiful graphics and engaging story, but optimization is poor on older systems."
]

# Reload the model from disk to check that the pickle round-trips.
with open('steam_review_topic_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

labeled_results = auto_label_reviews(new_reviews, topic_model=loaded_model)
print(labeled_results)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]



                                                                               review  \
0  The game crashes constantly and the developers don't seem to care about fixing it.   
1                      Controls are terrible and the UI is confusing. Waste of money.   
2   Beautiful graphics and engaging story, but optimization is poor on older systems.   

                        topic  
0  dimmdrive_norepair_appachy  
1   æthereus_deathspank_yoshi  
2  dimmdrive_norepair_appachy  


In [None]:
# Display the per-cluster summary (cluster id, sentence count, generated label).
cluster_summary

Unnamed: 0,label_st1,count,label
0,-1,56538,æthereus_deathspank_yoshi
17,16,8010,labyronia_prunk_souce
22,21,4134,dimmdrive_norepair_appachy
15,14,3533,withdrawl_propping_gaes
9,8,2617,takyon_shem_blyad
5,4,1460,prixs_controllermap_astana
23,22,921,ponyrp_platnium_baymax
21,20,912,urage_regrett_backpeddal
18,17,731,gimmickey_decellerate_sportsman
1,0,602,gunjack_vrideo_hmds


In [None]:
# Prefer the model with hand-written labels. That file is only written further
# down in this notebook, so on a fresh run it may not exist yet; fall back to
# the automatically labelled model instead of failing with FileNotFoundError.
loaded_model_path = 'steam_review_topic_model_improved.pkl'
if not os.path.exists(loaded_model_path):
    loaded_model_path = 'steam_review_topic_model.pkl'

with open(loaded_model_path, 'rb') as f:
    loaded_model = pickle.load(f)

print(loaded_model['cluster_summary'][['label_st1', 'label']].head())


    label_st1  \
0          -1   
17         16   
22         21   
15         14   
9           8   

                                                                                                label  
0                General Gameplay Issues (games Æthereus, DeathSpank, and Nintendo's Yoshi character)  
17                Game Performance Problems (Labyronia RPG game, drunk state, and source code issues)  
22  Technical Issues & Crashes (Dimmdrive RAM software, unrepairable issues, and application crashes)  
15              Refund Requests & Complaints (withdrawal/refunds, game props, and misspelled 'games')  
9                             Game Balance & Difficulty (tachyon particles, shame, and Russian slang)  


In [None]:
# Hand-written, human-readable topic names keyed by cluster id (the values of
# the `label_st1` column). Where this mapping is applied, ids without an entry
# fall back to "Topic <id>".
# NOTE(review): these names describe one specific clustering; if the clustering
# is re-run and cluster ids change, this mapping must be revised — confirm
# before reuse.
improved_labels = {
    -1: "General Gameplay Issues (games Æthereus, DeathSpank, and Nintendo's Yoshi character)",
    16: "Game Performance Problems (Labyronia RPG game, drunk state, and source code issues)",
    21: "Technical Issues & Crashes (Dimmdrive RAM software, unrepairable issues, and application crashes)",
    14: "Refund Requests & Complaints (withdrawal/refunds, game props, and misspelled 'games')",
    8: "Game Balance & Difficulty (tachyon particles, shame, and Russian slang)",
    4: "Controller & Input Configuration (prices, controller mapping, and possibly the city Astana)",
    22: "Multiplayer Experience (pony roleplay games, platinum editions, and Big Hero 6 character)",
    20: "Negative Emotional Reactions (user rage, regret, and backpedaling on opinions)",
    17: "Game Mechanics Criticism (gimmicky features, deceleration mechanics, and sports-themed content)",
    0: "VR Gaming Experience (Gunjack VR game, VR video, and head-mounted displays)",
    5: "Multiplayer Functionality (blood/gore, multiplayer features, and simplicity/simplified gameplay)",
    1: "Online Community Interaction (Manchester United, North American Hockey League, and internet slang)",
    18: "Puzzle Game Experience (Attractio puzzle game, mind games, and Minesweeper)",
    13: "Music & Sound Design (Rock Band games, masterpiece misspelling, and SoundDodger game)",
    19: "Infinite Loading & Progression (infinity/infinite loading, Venn diagrams, and unidentified abbreviations)",
    3: "Character Development Issues (characters lacking personality, Bozak character, and Large Hadron Collider reference)",
    7: "Graphics & Visual Effects (GLSL shaders, CamStudio software, and effects mapping)",
    6: "Combat System Complaints (cinders/fire, annihilation mechanics, and recognition features)",
    23: "Ranking System Problems (happiness, misspelled stupidity, and deranking in competitive games)",
    15: "Community & Social Features (misspelled community and attribute, about features that made the game)",
    12: "Character Creation & Customization (character growth, character generation, and unknown abbreviation)",
    24: "Game Crashes & Stability (game crashes, sad quitting, and connection issues)",
    10: "UI Visibility & Customization (invisible elements, misspelled customization, and announcements)",
    11: "Overall Gaming Experience (misspelled gaming, joy aspects, and ratings)",
    2: "Playtime & First Impressions (hours played, misspelled impressions, and covenant/agreements)",
    9: "Developer Criticism & Updates (misspelled wouldn't, Treyarch developer, and accusations of greed)"
}


# cluster_summary['label'] = cluster_summary['label_st1'].map(lambda x: improved_labels.get(x, f"Topic {x}"))
# cluster_summary['original_label'] = cluster_summary['label']  # keep a copy for reference


# Replace the generated labels in the loaded model with the hand-written
# names; cluster ids without a hand-written name become "Topic <id>".
loaded_model['cluster_summary']['label'] = [
    improved_labels.get(cluster_id, f"Topic {cluster_id}")
    for cluster_id in loaded_model['cluster_summary']['label_st1']
]

# Persist the relabelled model next to the original one.
with open('steam_review_topic_model_improved.pkl', 'wb') as f:
    pickle.dump(loaded_model, f)

In [None]:
# Label a single review with the relabelled model held in `loaded_model`.
new_reviews = ["The game crashes constantly and the developers don't seem to care about fixing it."]
labeled_results = auto_label_reviews(new_reviews, topic_model=loaded_model)
print(labeled_results)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]



                                                                               review  \
0  The game crashes constantly and the developers don't seem to care about fixing it.   

                        topic  
0  Technical Issues & Crashes  
