# READ DICTIONARY

In [None]:
from gensim.corpora import Dictionary

# Restore the gensim Dictionary stored under the file name 'dictionary'
dictionary = Dictionary.load('dictionary')
print('Dictionary length:', len(dictionary), '\n')

import json

# Parse reference_sheet.json directly from the open file handle
with open('reference_sheet.json', 'r') as file:
    reference_sheet = json.load(file)

# Parse topic_reference_sheet.json the same way
with open('topic_reference_sheet.json', 'r') as file:
    topic_reference_sheet = json.load(file)

# READ CORPUS

In [None]:
import pandas as pd
import numpy as np
import random
import json
import ast
import os

pd.set_option('display.max_columns', None)

# Folder that holds the pre-processed comment data
path_original_data = r'C:\Users\Usuario\Documents\JupyterFolder\unimi_files\IR'

# Load the post-processed comments into a DataFrame
csv_path = os.path.join(path_original_data, 'post_processed_comment_data_demo.csv')
df = pd.read_csv(csv_path, low_memory=False)

# Each cell of 'gensim_comment' holds the string form of a Python list
string_list = df['gensim_comment'].tolist()

# Turn every string back into the list it represents
comments_list = list(map(ast.literal_eval, string_list))

# Bag-of-words view of every comment, built with the loaded dictionary
bow_corpus = list(map(dictionary.doc2bow, comments_list))
print('Length of the corpus:', len(bow_corpus), '\n')

# Choose a random document from the corpus.
# randrange() draws uniformly from the valid indices 0 .. len(bow_corpus) - 1;
# round(random.uniform(0, len)) could return len(bow_corpus) itself (IndexError)
# and picked the first and last documents only half as often as the others.
random_number = random.randrange(len(bow_corpus))
bow_doc_x = bow_corpus[random_number]

# Print the count of each dictionary word found in the chosen document
# (every bag-of-words entry is a (word id, count) pair)
for word_id, word_count in bow_doc_x:
    print("Word {} ('{}') appears {} time.".format(word_id, dictionary[word_id], word_count))


# TRAIN MODEL

### IMPORTANT VARIABLE

In [1]:
# NUMBER OF TOPICS
# Passed to the LDA model as `num_topics` and used to enumerate topic ids 0 .. topic_num - 1
# when the topics are parsed. The manually written `topic_names` table lists ids 0..29,
# i.e. it matches the value 30 used here.
topic_num = 30

In [None]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import models

import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

# Create an instance of LdaMulticore model.
# A fixed random_state seeds the model so that repeated runs start from the same
# initialisation; without it every run yields differently ordered topics, which
# invalidates any manually assigned topic names.
# NOTE(review): with more than one worker the result may still vary slightly
# between runs — confirm against the gensim LdaMulticore documentation.
lda = gensim.models.LdaMulticore(
    corpus=bow_corpus,    # The bag-of-words corpus
    num_topics=topic_num, # Number of topics to generate
    id2word=dictionary,   # Mapping of word IDs to words
    passes=50,            # Number of passes through the corpus
    workers=2,            # Number of worker processes
    random_state=42       # Seed for reproducible training
)

# Alternative approach using LdaModel
# lda = gensim.models.LdaModel(
#     corpus=bow_corpus,
#     num_topics=topic_num,
#     id2word=dictionary,
#     passes=50
# )

In [None]:
import os
from gensim import models

# Folder that holds the serialized model (the same folder the comment CSV is read from)
path_original_data = r'C:\Users\Usuario\Documents\JupyterFolder\unimi_files\IR'

# Full path of the model file ('lda_model' inside that folder)
temp_file = os.path.join(path_original_data, 'lda_model')

# Persisting is disabled: uncomment the next line to write the model currently held in `lda`
# lda.save(temp_file)

# Replace `lda` with the model stored on disk.
# NOTE(review): this overwrites whatever model the training cell produced, and fails if
# the file does not exist yet — presumably save once, then keep only the load; confirm.
lda = models.ldamodel.LdaModel.load(temp_file)

In [None]:
# Parse every topic of the model into
#     topic_dict[topic_id][word] = (weight_as_string, reference_entries)
#
# print_topics() only returns a subset of the topics unless told otherwise, which is
# why this used to be wrapped in a retry loop that kept calling it until every id
# had shown up (and never finished when the model had fewer than `topic_num`
# topics). Passing num_topics=-1 asks for all topics in a single call.
topic_dict = {}

for topic_id, topic_words in lda.print_topics(num_topics=-1):
    topic_dict[topic_id] = {}
    # topic_words is one string of terms joined by ' + ', each term being
    # <weight>*"<word>"
    for term in topic_words.split(' + '):
        parts = term.split('"')
        weight = parts[0].replace('*', '')
        word = parts[1]
        if word in reference_sheet:
            # Known word: keep the entries recorded for it in the reference sheet
            topic_dict[topic_id][word] = (weight, reference_sheet[word])
        else:
            # Unknown word: fall back to the upper-cased word itself
            topic_dict[topic_id][word] = (weight, [word.upper()])

# Ids expected from `topic_num` that the model did not provide (empty when consistent)
topics_read = [topic_id for topic_id in range(topic_num) if topic_id not in topic_dict]
print('- Still unread:', topics_read)

# RESULTING TOPICS

In [None]:
from fuzzywuzzy import fuzz

def find_closest_match(input_string, string_list):
    """Return the entry of ``string_list`` with the highest ``fuzz.ratio`` to ``input_string``.

    Only a strictly higher score replaces the current best, so the earliest
    entry wins a tie. ``None`` is returned when the list is empty or when no
    entry scores above 0.
    """
    best_candidate, best_score = None, 0

    for candidate in string_list:
        score = fuzz.ratio(input_string, candidate)
        if score <= best_score:
            # Not better than what we already have
            continue
        best_candidate, best_score = candidate, score

    return best_candidate

# Same content as topic_dict, with the topics in ascending id order
ordered_dict = {topic_id: topic_dict[topic_id] for topic_id in sorted(topic_dict, key=int)}

# topic id -> {word: origin(s) from topic_reference_sheet, or '-'}
topic_assignment = {}

for topic_key, topic_value in ordered_dict.items():
    print('TOPIC:', topic_key)

    topic_assignment[topic_key] = {}

    for key, (wprob, wlist) in topic_value.items():
        # Flatten the reference entries into individual words
        word_list = [piece for string in wlist for piece in string.split()]

        # Best fuzzy match of the topic word among those words
        closest_match = find_closest_match(key, word_list)

        # Nothing matched: use the shortest reference entry, flagged with an asterisk
        if closest_match is None:
            closest_match = min(wlist, key=len) + '*'

        # An all-uppercase match is shown as '-'
        if closest_match.isupper():
            closest_match = '-'

        print(key, ':', closest_match.lower())

        if key not in topic_reference_sheet:
            # Word without a known origin
            topic_origin = '-'
            topic_assignment[topic_key][key] = topic_origin
        else:
            origins = topic_reference_sheet[key]
            topic_assignment[topic_key][key] = origins
            # Several origins are shown comma-separated, a single one as is
            if len(origins) > 1:
                topic_origin = ', '.join(origins)
            else:
                topic_origin = origins[0]

        print('*', key, ':', topic_origin)

    print('\n')

# LABEL TOPICS

#### WARNING
The manually assigned names below may not always be representative of the current topics: even without major changes in content, the order in which the topics are shown can change between runs.

Check before using the manually assigned names

In [None]:
# Manually written label for each LDA topic id (0..29); "NA" marks a topic left unlabelled.
# Only consulted when the automatic labels are disabled and `bool_title` is True.
topic_names = {0: "Actions", 1: "NA", 2: "NA", 3: "Learning", 4: "Game design", 5: "Number of players",
              6: "Number of players", 7: "Game duration", 8: "Rules", 9: "Problem-solving", 10: "Release",
              11: "Gameplay", 12: "Concept", 13: "Scoring", 14: "Artwork", 15: "D&D", 16: "Strategy",
              17: "Recommendation", 18: "Favourite", 19: "Mechanics", 20: "Actions", 21: "Favourite",
              22: "NA", 23: "D&D", 24: "Interaction", 25: "Version", 26: "Luck", 27: "Winning",
              28: "Length", 29: "Components"}

# Manually written general label -> list of LDA topic ids grouped under that label.
# Looked up by topic id when neither the automatic labels nor `bool_title` are used.
my_topics = {"bookeeping": [28, 8], "downtime": [24], "interaction": [4, 9], "bash the leader": [7, 13, 27],
             "complex or complicated": [3, 16, 20, 0], "luck": [26]}

### IMPORTANT VARIABLE

In [None]:
# NUMBER OF TIMES A CATEGORY HAS TO APPEAR IN A TOPIC TO BE CONSIDERED AS THE TOPIC
# (a topic whose most frequent category appears fewer times than this is classified as 'NA')
topic_bound = 4

# THE MAX DIFFERENCE BETWEEN TWO ELEMENTS FOR THEM TO BE CONSIDERED FOR JOINING
# (currently only referenced by the commented-out fusion example in the classification cell)
union_bound = 2

# If the difference is too big, one of the categories can stand on its own;
# if it is smaller, combining them may boost their probabilities.

In [None]:
def return_draw(my_dict):
    """Return the key(s) tied for first place in ``my_dict``.

    The dictionary is expected to be ordered by value in descending order, so
    its first entry is the leader. If the second entry has the same value as
    the first, every key holding that value is returned (a draw); otherwise
    only the first key is returned.

    Args:
        my_dict: Mapping of name -> count, ordered with the highest count first.

    Returns:
        list: The leading key(s) in dictionary order. A dictionary with a single
        entry yields that entry's key and an empty dictionary yields ``[]``
        (both cases previously raised ``IndexError``).
    """
    if not my_dict:
        return []

    entries = list(my_dict.items())
    first_element, first_value = entries[0]

    # A draw needs a second entry whose value equals the leader's
    if len(entries) > 1 and entries[1][1] == first_value:
        return [key for key, value in entries if value == first_value]

    return [first_element]

# topic id -> {category: number of topic words attributed to it}, highest count first
topic_estimation = {}
# topic id -> list with the winning category name(s), or ['NA']
topic_classification = {}

# Step 1: count, per topic, how often each category occurs among its words
for topic_key, topic_value in topic_assignment.items():
    topic_count = {}
    for origins in topic_value.values():
        for topic in origins:
            topic_count[topic] = topic_count.get(topic, 0) + 1

    # Highest count first; the '-' placeholder (word without category) is dropped
    sorted_dict = dict(sorted(topic_count.items(), key=lambda x: x[1], reverse=True))
    sorted_dict.pop('-', None)

    topic_estimation[topic_key] = sorted_dict

# Step 2: merge 'complex' and 'complicated' into a single category
for key, value in topic_estimation.items():
    # pop() with a default of 0 reads and removes each key, whether present or not
    sum_value = value.pop('complex', 0) + value.pop('complicated', 0)
    value["complex or complicated"] = sum_value

    # Additional fusion example (commented out)
    # key_to_fuse1 = 'downtime'
    # key_to_fuse2 = 'interaction'
    # value1 = value.setdefault(key_to_fuse1, 0)
    # value2 = value.setdefault(key_to_fuse2, 0)
    # if abs(value1 - value2) <= union_bound:
    #     sum_value = value1 + value2
    #     value["game dynamics"] = sum_value
    #     del value[key_to_fuse1]
    #     del value[key_to_fuse2]

    # Store the counts re-sorted in descending order
    topic_estimation[key] = dict(sorted(value.items(), key=lambda x: x[1], reverse=True))

# Step 3: a topic takes its leading category (or categories, on a draw) when the
# leading count reaches topic_bound; otherwise it is classified as 'NA'
for key, value in topic_estimation.items():
    name, count = next(iter(value.items()))
    if count < topic_bound:
        topic_classification[key] = ['NA']
    else:
        # return_draw yields [name] when there is no tie, or every tied key otherwise
        topic_classification[key] = return_draw(value)

# Print the topic key and its corresponding topic classification
for key, value in topic_classification.items():
    print(key, '-', value)


# EXAMINE COMMENTS

### IMPORTANT VARIABLE

In [None]:
# After many iterations the code in the next cell has become quite convoluted.
# These are the variables to edit in order to change how it behaves.

# NUMBER OF MATCHES WITH THE DICTIONARY NEEDED TO TRUST THE RESULT
# Extremely short messages are prone to giving phony results; a comment with this
# many matches or fewer triggers a warning.
bound = 5

# MINIMUM PROBABILITY NEEDED TO STORE THE TOPIC DETECTED
# Some comments barely touch some topics; this helps to avoid long lists of topics.
bound_prob = 0.05

# BOOLEAN - USE THE AUTOMATICALLY LABELLED TOPICS
# Highest priority, overrides the rest of the variables.
bool_auto = True

# BOOLEAN - USE THE MANUALLY ASSIGNED INDIVIDUAL TOPIC LABELS INSTEAD OF THE GENERAL TOPIC LABELS
# Second highest priority, overrides the next variable.
bool_title = False

# BOOLEAN - GROUP ALL MANUALLY ASSIGNED GENERAL TOPIC LABELS SO THERE ARE NO DUPLICATES
# Lowest priority; if all three are False the manual general labels are used with duplicates.
bool_group = True

In [None]:
# Function to print the comment and return the document bag-of-words representation
def print_comment(n):
    """Print the original comment at position ``n`` and return its bag-of-words.

    The position is used directly. The previous implementation searched the
    corpus for the first document whose bag-of-words equalled ``bow_corpus[n]``,
    so whenever two comments shared the same bag-of-words (for instance two
    comments with no dictionary words at all) the text of the wrong comment
    was printed.

    Args:
        n: Position of the document in ``bow_corpus`` / ``comments_list``.

    Returns:
        The bag-of-words (list of ``(word id, count)`` pairs) of document ``n``.

    Raises:
        IndexError: If ``n`` is outside the corpus.
    """
    # Normalise n the way list indexing would (supports negative positions and
    # rejects out-of-range ones) so the same value can address the DataFrame row
    index = range(len(bow_corpus))[n]
    print("Index of the element in `comments_list`:", index, '\n')

    document_bow = bow_corpus[index]
    document_original = " ".join([dictionary[word_id] for word_id, _ in document_bow])

    # Too few dictionary matches make the topic estimate unreliable
    if len(document_original.split()) <= bound:
        print('- WARNING: Amount of data insufficient.', '\n')

    comment = df.loc[index, 'comment']
    print(comment, '\n')

    return document_bow

# Function to find keys in a dictionary with a given value
def find_keys_with_value(dictionary, number):
    """Return, in dictionary order, every key whose value contains ``number``."""
    matching_keys = []
    for key in dictionary:
        if number in dictionary[key]:
            matching_keys.append(key)
    return matching_keys

import random
from nltk.stem import WordNetLemmatizer
import operator

lemmatizer = WordNetLemmatizer()
# Pick a valid random position in the corpus.
# randrange() never returns len(bow_corpus), unlike round(random.uniform(0, len)),
# which occasionally produced an out-of-range index.
random_number = random.randrange(len(bow_corpus))

# Get the topics for the randomly selected document and sort them by probability
document_topics = lda.get_document_topics(print_comment(random_number))
sorted_topics = sorted(document_topics, key=operator.itemgetter(1), reverse=True)

# Collect the topics of the selected document whose probability reaches bound_prob.
# topics_detected      - ready-to-print lines (only filled when bool_auto is False)
# high_topics_detected - label -> accumulated percentage (bool_auto) or
#                        label -> list of percentages (manual general labels)
topics_detected = []
high_topics_detected = {}
for topic_id, topic_prob in sorted_topics:
    percentage = topic_prob * 100
    formatted_percentage = "{:.2f}%".format(percentage)
    if topic_prob >= bound_prob:
        if bool_auto:
            # Automatic labels: add this topic's percentage to every label it was classified as
            for topic in topic_classification[topic_id]:
                high_topics_detected[topic] = high_topics_detected.get(topic, 0) + percentage
        else:
            if bool_title:
                # Manual individual labels: one line per topic, named from topic_names when available
                if topic_id in topic_names:
                    topics_detected.append(f"[{topic_id}] Topic: {topic_names[topic_id]} - {formatted_percentage}")
                else:
                    topics_detected.append(f"TOPIC {topic_id} - {formatted_percentage}")
            else:
                # Manual general labels: find every label of my_topics that lists this topic id
                topics = find_keys_with_value(my_topics, topic_id)
                for elem in topics:
                    # Keep each percentage so they can be summed per label later
                    high_topics_detected.setdefault(elem, []).append(percentage)
                # Topics that belong to no general label produce no line at all
                if len(topics) > 1:
                    topics_detected.append(f"{', '.join(topics).upper()} ({topic_id}) - {formatted_percentage}")
                elif len(topics) == 1:
                    topics_detected.append(f"{topics[0].upper()} ({topic_id}) - {formatted_percentage}")

print("Topics detected:")
if bool_auto:
    # Automatic labels: print the accumulated percentages, highest first, skipping the 'NA' label
    high_topics_detected = dict(sorted(high_topics_detected.items(), key=lambda x: x[1], reverse=True))
    for key, value in high_topics_detected.items():
        if key != 'NA':
            p = "{:.2f}%".format(value)
            print(f"{key.upper()} - {p}")
else:
    # Total percentage per general label (empty when bool_title is True)
    sum_dict = {key: sum(lst) for key, lst in high_topics_detected.items()}
    if len(topics_detected) == 0:
        print('None.')
    else:
        if bool_group and not bool_title:
            # Grouped output: one line per general label with its summed percentage
            for key, value in sum_dict.items():
                p = "{:.2f}%".format(value)
                print(f"{key.upper()} - {p}")
        else:
            # Ungrouped output: print the lines collected per topic
            for topic in topics_detected:
                print(topic)

# SAVE RESULTS

In [None]:
import pandas as pd
import os

# Folder that holds the original data file
path_original_data = r'C:\Users\Usuario\Documents\JupyterFolder\unimi_files\IR'

# Start again from a fresh copy of the comment data
csv_path = os.path.join(path_original_data, 'post_processed_comment_data_demo.csv')
df = pd.read_csv(csv_path, low_memory=False)

# Categories for which result columns are produced
topic_list = ["complex or complicated", "luck", "interaction", "bash the leader", "downtime", "bookeeping"]

# Topic distribution of every document, computed once up front
document_topics = list(map(lda.get_document_topics, bow_corpus))

# Define a function to compute topic estimation
def topic_estimation(n, input_topic):
    """Add up the probabilities of document ``n``'s topics classified as ``input_topic``.

    Reads the precomputed ``document_topics`` and ``topic_classification``.
    Returns 0 when none of the document's topics carries that label.
    NOTE(review): this name is also used for a dictionary in the topic
    classification cell; defining the function replaces that dictionary.
    """
    prob_count = 0
    for topic_id, topic_prob in document_topics[n]:
        if input_topic in topic_classification[topic_id]:
            prob_count = prob_count + topic_prob
    return prob_count

# Define a function to compute keyword matches
def keyword_matches(n):
    """Return how many dictionary words document ``n`` matched.

    The words behind the ids of the document's bag-of-words are joined with
    spaces and the resulting text is split on whitespace again; the number of
    pieces is the result.
    """
    matched_words = (dictionary[token_id] for token_id, _ in bow_corpus[n])
    return len(" ".join(matched_words).split())

# Number of dictionary matches of every comment (computed row by row from the index)
df['dictionary_matches'] = df.index.map(keyword_matches)

# One '<topic>_estimation' column per topic: summed probability of the document's
# LDA topics that were classified as that topic
for topic in topic_list:
    estimation_column = topic.replace(' ', '_') + '_estimation'
    df[estimation_column] = [topic_estimation(n, topic) for n in df.index]

# One '<topic>_relative_value' column per topic: the estimation weighted by the number
# of dictionary matches. Kept as a second loop so that all estimation columns come
# before all relative-value columns in the saved file.
for topic in topic_list:
    column_prefix = topic.replace(' ', '_')
    df[column_prefix + '_relative_value'] = df[column_prefix + '_estimation'] * df['dictionary_matches']

# The former df.rename(...) step was removed: it mapped the raw topic names
# ('luck', 'bash the leader', ...) to '<topic>_estimation', but the columns created
# above already carry their final names. It therefore renamed nothing, and would have
# produced duplicate column names had the input file contained such raw columns.

# Save the DataFrame to a new CSV file (written to the current working directory)
df.to_csv('lpa_comment_data_demo.csv', index=False)

# READ RESULTS

### IMPORTANT VARIABLE

In [None]:
# TOPIC TO SEARCH
# Spaces are replaced by underscores when the column name is built in the next cell.
string = "luck"

# VARIABLE TO SEARCH
# Suffix appended to the topic name to select the column used for ranking.
value = '_relative_value'
# value = '_estimation'

In [None]:
# Column to rank by: the topic with underscores instead of spaces, plus the chosen suffix
topic = '_'.join(string.split(' ')) + value

# Rank every row by that column, highest value first
df_sorted = df.sort_values(by=topic, ascending=False)

# Original comments of the five best-ranked rows
comments = df_sorted.head(5)['comment']
comment_list = comments.tolist()

# Show each of those comments followed by a blank line
for comment in comment_list:
    print(comment, '\n')

# Display the five best-ranked rows
df_sorted.head(5)


# TRIM DICTIONARY

In [None]:
from gensim import models

# Function to write a list of words to a file
def write_words_to_file(word_list, filename):
    """Append every word of ``word_list`` to ``filename``, one word per line.

    The file is opened in append mode, so earlier content is kept and repeated
    calls keep adding lines.
    """
    with open(filename, 'a') as file:
        file.writelines(word + '\n' for word in word_list)

# Word weights of the model; iterated below one topic (row) at a time
topic_word_matrix = lda.get_topics()

# Id-to-word mapping held by the LDA model
vocab = lda.id2word

# word -> number of topics that list it among their ten strongest words
word_counts = {}

for topic_words in topic_word_matrix:
    # Positions of the ten largest weights of this topic, largest first
    for word_idx in topic_words.argsort()[::-1][:10]:
        word = vocab[word_idx]
        word_counts[word] = word_counts.get(word, 0) + 1

# Most widely shared words first
sorted_word_counts = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)

# Words that appear in the top ten of more than two topics
words_to_remove = []
for word, count in sorted_word_counts:
    if count <= 2:
        continue
    print(f'{word}: {count}')
    words_to_remove.append(word)

# Append those words to the removal list on disk
write_words_to_file(words_to_remove, 'words_to_remove.txt')

### ORIGINAL FUNCTION FOR CHECKING RESULTS