In [11]:
demo = False

# READING DATA

In [15]:
import pandas as pd
import json
import os
import numpy as np
import ast

pd.set_option('display.max_columns', None)

# Function to find unique elements in two lists
def find_unique_elements(list1, list2):
    unique_elements_list1 = list(set(list1) - set(list2))
    unique_elements_list2 = list(set(list2) - set(list1))
    unique_elements = unique_elements_list1 + unique_elements_list2
    return unique_elements

# Function to write words to a file
def write_words_to_file(word_list, filename):
    with open(filename, 'w') as file:
        for word in word_list:
            file.write(word + '\n')

# Function to read words from a file
def read_words_from_file(filename):
    word_list = []
    with open(filename, 'r') as file:
        for line in file:
            word = line.strip()
            word_list.append(word)
    return word_list

if demo:
    # Set the path for the original data directory
    path_original_data = r'C:\Users\Usuario\Documents\JupyterFolder\unimi_files\IR\files_csv\version_demo'
    # Read the CSV file into a DataFrame
    df = pd.read_csv(os.path.join(path_original_data, 'post_processed_comment_data_demo.csv'), low_memory=False)
    print('Using demo...')
else:
    path_original_data = r'C:\Users\Usuario\Documents\JupyterFolder\unimi_files\IR\files_csv'
    df = pd.read_csv(os.path.join(path_original_data, 'post_processed_comment_data.csv'), low_memory=False)
    
# print(len(df))    

# Convert string representation of lists to actual lists
string_list = df['gensim_comment_verbs'].tolist()
comments_list = [ast.literal_eval(s) for s in string_list]

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from gensim import corpora

# Create a dictionary from the comments list
auxiliar = gensim.corpora.Dictionary(comments_list)
print('Corpus size: ', len(auxiliar))
list_a = [token for token, idx in auxiliar.token2id.items()]

# Filter the dictionary based on word frequencies
# Remove words that appear in less than 5% of the comments or more than 99% of the comments
auxiliar.filter_extremes(no_below=len(auxiliar) * 0.05, no_above=0.99, keep_n=None)
print('Filtered corpus size: ', len(auxiliar))
list_b = [token for token, idx in auxiliar.token2id.items()]

lemmatizer = WordNetLemmatizer()
verbs = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

tokens = [token for token in auxiliar.values()]
pos_tags = nltk.pos_tag(tokens)

# Filter out tokens that are verbs
filtered_tokens = [token for token, pos_tag in zip(tokens, pos_tags) if pos_tag[1] not in verbs]
filtered_dictionary = corpora.Dictionary()
filtered_dictionary.doc2bow(filtered_tokens, allow_update=True)

print('Corpus without verbs size: ', len(filtered_dictionary), '\n')
list_c = [token for token, idx in filtered_dictionary.token2id.items()]

filename = 'words_to_remove.txt'

# Read the list of words to remove from a file
words_to_remove = read_words_from_file(filename)

print(words_to_remove, '\n')
word_ids = [filtered_dictionary.token2id[word] for word in words_to_remove if word in filtered_dictionary.token2id]

# Filter the dictionary by removing specific words
filtered_dictionary.filter_tokens(bad_ids=word_ids)
filtered_dictionary.compactify()

print('Revised corpus size: ', len(filtered_dictionary))

dictionary = filtered_dictionary

Corpus size:  23791
Filtered corpus size:  94
Corpus without verbs size:  88 

['game', 'player', 'play', 'card', 'like', 'fun', 'time', 'mechan', 'great', 'enjoy', 'good', 'theme'] 

Revised corpus size:  77


# GETTING IDEAS

# SPECIFYING TOPICS

In [16]:
import re
from nltk.corpus import wordnet

def generate_synonyms(word_list, num_synonyms, full_return=False, verbose=False):
    """
    Generate synonyms for a given list of words using WordNet.

    Args:
        word_list (list): List of words.
        num_synonyms (int): Number of synonyms to generate per word.
        full_return (bool, optional): Whether to return all synonyms or a subset based on 'num_synonyms'. Defaults to False.
        verbose (bool, optional): Whether to print the generated synonyms. Defaults to False.

    Returns:
        list: List of generated synonyms.
    """
    synonyms = []
    
    # Iterate over each word in the word_list
    for word in word_list:
        # Get the synsets (sets of synonymous words) for the current word
        synsets = wordnet.synsets(word)
        
        # Iterate over each synset
        for synset in synsets:
            # Extend the synonyms list with the lemma names of the synset
            synonyms.extend(synset.lemma_names())
    
    # Remove duplicates from the synonyms list
    synonyms = list(set(synonyms))
    
    if verbose:
        print(f"Generated synonyms: {synonyms}\n")
    
    if full_return:
        return synonyms
    else:
        # Return a subset of the synonyms list based on the 'num_synonyms' argument
        # If 'num_synonyms' is greater than the total number of synonyms, return all synonyms
        return synonyms[:num_synonyms] if num_synonyms < len(synonyms) else synonyms

### IMPORTANT VARIABLE

In [17]:
# NUMBER OF SYNONYMS GENERATED
num_synonyms = 10    

# In this version is useless since I use the full list.

In [18]:
# Define target topics dictionary
target_topics = {}

# BOOKEEPING

# Actions
string1 = "adress, bookeeping, tracking, writing, documenting, recording, organizing, remembering, looking, reading, understanding, calculate"
# Components
string2= "rule set, rulebook, handbook, guidebook, guide, reference, knowledge, text, information, data, notes, progress"
# Negative connotations
string3 = "excesive rules, vague, tedious, confusing, slow, time-consuming, methodical, difficult, hard, long, endless"
# Positive connotations
string4 = "simple, short, easy, understandable, helpful, organized, quick setup, remember, little, few, illustarted"
# Other related terms
string5 = "math, mathematics"

string_list = [string1, string2, string3, string4, string5]
synonym_list = []

# Process each string separately
for string in string_list:
    tokens = re.findall(r'\b\w+\b', string)
    synonyms = generate_synonyms(tokens, num_synonyms, full_return=True)
    synonym_list.extend(synonyms + tokens)
    
target_topics["bookeeping"] = synonym_list    

# INTERACTION

# Actions
string1 = "solo, interaction, interact, roleplay, discuss, talk, influence, defeat, lose, win, work together, debate, act"
# Components
string2 = "solo, interaction, interactivity, team, group, friends, family, wife, number, npc, role, participants, actions, solo, coop, action, turn"
# Negative connotations
string3 = "solo, interaction, conflict, discuss, clash, face, defeat, lose, fight, argue, angry, bored"
# Positive connotations
string4 = "solo, interaction, cooperation, teamwork, interactive, engaging, friendly, bonding, interesting"
# Other related terms
string5 = "interaction, solo, more, less, players, more people, more players, more gamers"

string_list = [string1, string2, string3, string4, string5]
synonym_list = []

# Process each string separately
for string in string_list:
    tokens = re.findall(r'\b\w+\b', string)
    synonyms = generate_synonyms(tokens, num_synonyms, full_return=True)
    synonym_list.extend(synonyms + tokens)
    
target_topics["interaction"] = synonym_list

# COMPLEX

# Actions
string1 = "master, learn, teach, solve, help, understand, replay, enjoy, begin, start, introduce, simplify, improve, predict"
# Components
string2 = "tricks, variables, ability, skill, challenge, depth, strategy, tactics, problems , puzzles, consecuences, repercussions, replayability, modularity, complexity"
# Negative connotations
string3 = "difficult, challenging, simple, complex, hard, demanding, hardcore"
# Positive connotations
string4 = "experience, easy, complex, simple, excellent, unpredictable, replayable, helpful, deep, rich, style,  acessible, competitive"
# Other related terms
string5 = "veteran, nobie, rookie, learning curve, skill level"

string_list = [string1, string2, string3, string4, string5]
synonym_list = []

# Process each string separately
for string in string_list:
    tokens = re.findall(r'\b\w+\b', string)
    synonyms = generate_synonyms(tokens, num_synonyms, full_return=True)
    synonym_list.extend(synonyms + tokens)

target_topics["complex"] = synonym_list

# COMPLICATED

# Actions
string1 = "complicate, repeat, learn, teach, explain, react, forget, forgive"
# Components
string2 = "many, complication, rules, exeptions, reaction, time, predictable results"
# Negative connotations
string3 = "predictable, hard, easy, boring, daunting, overwhelming, long, endless, repetitive, convoluted"
# Positive connotations
string4 = "easy, quick, forgiving, predictable"
# Other related terms
string5 = "casual, begginers, noobs, no, negative, ease, clutter"

string_list = [string1, string2, string3, string4, string5]
synonym_list = []

# Process each string separately
for string in string_list:
    tokens = re.findall(r'\b\w+\b', string)
    synonyms = generate_synonyms(tokens, num_synonyms, full_return=True)
    synonym_list.extend(synonyms + tokens)

target_topics["complicated"] = synonym_list

# DOWNTIME

# Actions
string1 = "time, relax, think, plan ahead, wait, waste time, do, choose, decide, interact, bore, hope, speed"
# Components
string2 = "time, downtime, free time, waiting period, turns, something, nothing, interactive, in-between, between"
# Negative connotations
string3 = "time, unproductive, long, slow, boring, uninteresting, limit"
# Positive connotations
string4 = "time, fast, quick, engaging, pace, decisions, options, limitless"
# Other related terms
string5 = "time, in-character, individual"

string_list = [string1, string2, string3, string4, string5]
synonym_list = []

# Process each string separately
for string in string_list:
    tokens = re.findall(r'\b\w+\b', string)
    synonyms = generate_synonyms(tokens, num_synonyms, full_return=True)
    synonym_list.extend(synonyms + tokens)

target_topics["downtime"] = synonym_list

# BASH THE LEADER

# Actions
string1 = "catch up, finish, bash, dethrone, lead, rule, fix, win, defeat, overtake, resign, benefit, sacrifice, prevent victory, fight, unite, end, against, curb"
# Components
string2 = "end, endgame, leader, champion, winner, looser, rest, power, gap, difference, advantage, victory, defeat, opprtunity, actions"
# Negative connotations
string3 = "deterministic, fixed, decided, univitable, detriment, disadvatage, losers, meaningless, useless"
# Positive connotations
string4 = "possible, hope, underdog, suprise, turntable, martyr, decision, revolution, winners"
# Other related terms
string5 = "first, second, third, last"

string_list = [string1, string2, string3, string4, string5]
synonym_list = []

# Process each string separately
for string in string_list:
    tokens = re.findall(r'\b\w+\b', string)
    synonyms = generate_synonyms(tokens, num_synonyms, full_return=True)
    synonym_list.extend(synonyms + tokens)
    
target_topics["bash the leader"] = synonym_list

# LUCK

# Actions
string1 = "bet, roll dice, gambler, predict, try, intervice, influence, control, result, draw cards"
# Components
string2 = "luck, alea, randomess, uncertainty, unpredictability, outcome, possibilities, opprtunities, probability, chances, intervention"
# Negative connotations
string3 = "unlucky, random, uncontrollable, unpredictable, impossible, jynx, unwinnable, unprobable, unlikely"
# Positive connotations
string4 = "lucky, random, possible, predictable, controlable, winnable, probable, likely"
# Other related terms
string5 = "chances, win, lose, dice, card, deck, skill, control, mastery, practice, expert"

string_list = [string1, string2, string3, string4, string5]
synonym_list = []

# Process each string separately
for string in string_list:
    tokens = re.findall(r'\b\w+\b', string)
    synonyms = generate_synonyms(tokens, num_synonyms, full_return=True)
    synonym_list.extend(synonyms + tokens)

target_topics["luck"] = synonym_list

# Contraries of every topic

string1 = ""
string2 = "lawless, chaotic, slow, math, mathematics"
string3 = "poor"

# Actions
string1 = ""
# Components
string2 = "performance, reviews"
# Negative connotations
string3 = "lawless, poor, chaotic, negative, dislike"
# Positive connotations
string4 = "lawful, rich, ordered, possitive, like"
# Other related terms
string5 = ""

string_list = [string1, string2, string3, string4, string5]
synonym_list = []

# Process each string separately
for string in string_list:
    tokens = re.findall(r'\b\w+\b', string)
    synonyms = generate_synonyms(tokens, num_synonyms, full_return=True)
    synonym_list.extend(synonyms + tokens)

target_topics["complements"] = synonym_list

# COMPLEMENTING WITH CORPUS

In [19]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import numpy as np

np.random.seed(400)

# Initialize stemmer and lemmatizer
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

# Function to lemmatize and stem text
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

# Function to preprocess text
def preprocess(text):
    return [lemmatize_stemming(token) for token in simple_preprocess(text) if token not in STOPWORDS and len(token) > 2]

# Create merged_list with replaced underscores
merged_list = [item.replace("_", " ") for sublist in target_topics.values() for item in sublist]

# Initialize variables
processed_docs = []
reference_sheet = {}
topic_reference_sheet = {}

# Create new_list by splitting strings with multiple words
new_list = [word for string in merged_list for word in string.split() if len(string.split()) > 1]

# print('*', len(merged_list), len(new_list))        

# Process each document
for doc in merged_list:
    processed_words = preprocess(doc)
    processed_docs.append(processed_words)
    
    # Create reference_sheet and topic_reference_sheet
    for word in processed_words:
        reference_sheet.setdefault(word, []).append(doc)
        for key, values in target_topics.items():
            if doc in values:
                topic_reference_sheet.setdefault(word, []).append(key)

# Simplify the creation of reference_sheet and topic_reference_sheet
reference_sheet = {key: values for key, values in reference_sheet.items()}     
topic_reference_sheet = {key: values for key, values in topic_reference_sheet.items()}   

# print(len(processed_docs))
# print(len(reference_sheet))
# print(len(topic_reference_sheet))

# TRIMMING DICTIONARY

In [20]:
# Read words from a file and return them as a list
def read_words_from_file(filename):
    with open(filename, 'r') as file:
        word_list = [line.strip() for line in file]
    return word_list

print('Dictionary from corpus length:', len(dictionary))
topic_dictionary = gensim.corpora.Dictionary(processed_docs)
print('Dictionary from code length:', len(topic_dictionary), '\n')

# Merge the dictionaries into a single dictionary
merged_dict = gensim.corpora.Dictionary()
merged_dict.merge_with(dictionary)
merged_dict.merge_with(topic_dictionary)

filename = 'words_to_remove.txt'
words_to_remove = read_words_from_file(filename)

print(words_to_remove, '\n')
print('Merged dictionary length:', len(merged_dict))

# Filter and compactify the merged dictionary
word_ids = [merged_dict.token2id[word] for word in words_to_remove if word in merged_dict.token2id]
merged_dict.filter_tokens(bad_ids=word_ids)
merged_dict.compactify()

print('Trimmed dictionary length:', len(merged_dict))

Dictionary from corpus length: 77
Dictionary from code length: 1904 

['game', 'player', 'play', 'card', 'like', 'fun', 'time', 'mechan', 'great', 'enjoy', 'good', 'theme'] 

Merged dictionary length: 1935
Trimmed dictionary length: 1925


# CREATING DICTIONARY

In [21]:
# Save the merged dictionary to a file
merged_dict.save('dictionary')

import json

# Convert the reference_sheet dictionary to JSON format
json_data = json.dumps(reference_sheet)

# Write the JSON data to reference_sheet.json file
with open('reference_sheet.json', 'w') as file:
    file.write(json_data)

# Convert the topic_reference_sheet dictionary to JSON format
json_data = json.dumps(topic_reference_sheet)

# Write the JSON data to topic_reference_sheet.json file
with open('topic_reference_sheet.json', 'w') as file:
    file.write(json_data)