# IMPORT

In [None]:
# Import necessary modules
from requests import Request, Session
from requests.exceptions import ConnectionError, Timeout, TooManyRedirects
import json
import pprint 
import os

# Import progress bar module
from alive_progress import alive_bar

# Import time and sys modules
import time
import sys

# Import BoardGameGeek client
from boardgamegeek import BGGClient

# Create the shared BoardGameGeek client; the extraction cell below calls
# bgg.hot_items() and bgg.game() on this instance.
bgg = BGGClient()

# EXTRACTION

In [None]:
# Retrieve the hot items in the 'boardgame' category from BoardGameGeek
hot_items = bgg.hot_items('boardgame')

# Items whose comments could not be collected, stored as {game id: game name}
miss = {}

# One flat record per comment, accumulated across all hot games
data = []

# Iterate over each hot item
for item in hot_items:
    # Rows are staged per game so that a failure part-way through a game does
    # not leave some of its comments in `data` while the game is also in `miss`.
    game_rows = []
    try:
        # Retrieve the game details, including comments, for the current item
        game = bgg.game(game_id=item.id, comments=True)

        # Create a progress bar with the length of the comments
        with alive_bar(len(game.comments), force_tty=True) as bar:
            for comment in game.comments:
                game_rows.append({
                    "id": item.id,
                    "title": item.name,
                    "user": comment.commenter,
                    "comment": comment.comment,
                    "rating": comment.rating,
                })

                # Pause for a short duration to simulate processing time
                time.sleep(0.01)

                # Update the progress bar
                bar()
    except Exception as exc:
        # `except Exception` instead of a bare `except:` so that Ctrl-C and
        # SystemExit still stop the run; the cause is reported, not hidden.
        print(f'error: {item.id} ({item.name}): {exc!r}')
        miss[item.id] = item.name
    else:
        data.extend(game_rows)

# Save the extracted data to a JSON file
with open("comment_data_demo.json", 'w') as f:
    json.dump(data, f, indent=2)  # indent=2 is not needed but makes the file human-readable

# WRANGLING

In [None]:
import pandas as pd
import json
import os

# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Directory that holds the original data, and the CSV produced by this cell
path_original_data = r'C:\Users\Usuario\Documents\JupyterFolder\unimi_files\IR'
raw_csv_path = os.path.join(path_original_data, 'comment_data_demo.csv')

# Load the comment records saved by the extraction step
with open("comment_data_demo.json", 'r') as f:
    post_list = json.load(f)

# Report how many comments there are before any formatting
print(f'Amount of comments before any formatting: {len(post_list)}')

# Flatten the list of records into a DataFrame and persist it as CSV
df = pd.json_normalize(post_list)
df.to_csv(raw_csv_path, index=False)

# Display the first row of the DataFrame
df.head(1)

# CLEANING

## Reading raw data

In [None]:
import pandas as pd
import numpy as np
import json
import os

# Directory that holds the original data
path_original_data = r'C:\Users\Usuario\Documents\JupyterFolder\unimi_files\IR'

# Load the raw comment CSV written by the wrangling step
raw_csv_path = os.path.join(path_original_data, 'comment_data_demo.csv')
df = pd.read_csv(raw_csv_path, low_memory=False)

df.head(1)

In [None]:
# Print some statistics of the 'comment' field

# Share of rows whose comment is not null, as a percentage with two decimals
comment_percentage = round(df.comment.notnull().mean() * 100, 2)
print(str(comment_percentage) + '%')

# Character length of every comment, computed once and reused for each statistic
comment_lengths = df.comment.str.len()
max_length = comment_lengths.max()
min_length = comment_lengths.min()
mean_length = comment_lengths.mean()
for length_stat in (max_length, min_length, mean_length):
    print(length_stat)

# Count how many comments mention the searched word
pattern = "random"

# Anywhere in the text / at the very start / as the whole comment
contains_pattern = df.comment.str.contains(pattern, na=False).sum()
starts_with_pattern = df.comment.str.startswith(pattern, na=False).sum()
exact_match_pattern = df.comment.str.fullmatch(pattern, na=False).sum()
for match_count in (contains_pattern, starts_with_pattern, exact_match_pattern):
    print(match_count)

# Keep only rows that have a comment and renumber the index from zero
df = df[df.comment.notnull()].reset_index(drop=True)

# Print the comments that contain the pattern
matching_comments = df.loc[df.comment.str.contains(pattern, na=False), 'comment']
print(matching_comments)

## Filtering the data

In [None]:
from guess_language import guess_language
import enchant
import string
import re

# Function to check if a comment is in English
def is_english_batch(batch, dictionary=None):
    """Flag which comments in *batch* look like English text.

    Each comment is lower-cased and split into tokens matching
    ``[a-zA-Z0-9']+``. A comment counts as English when at least half of its
    tokens are accepted by the spell-checking dictionary.

    Comments that yield no tokens at all (empty text, emoji only, or text
    written entirely in a non-Latin script) and non-string values are flagged
    as not English. Previously such comments passed the filter because
    ``0 >= 0`` is true.

    Args:
        batch: DataFrame with a 'comment' column.
        dictionary: Optional object exposing ``check(word) -> bool``. When
            omitted, an ``enchant.Dict("en_US")`` is created for this call.

    Returns:
        A boolean Series aligned with ``batch.index``.
    """
    if dictionary is None:
        dictionary = enchant.Dict("en_US")

    def looks_english(text):
        # Missing values (NaN / None) cannot be tokenized
        if not isinstance(text, str):
            return False

        words = re.findall(r"[a-zA-Z0-9']+", text.lower())
        if not words:
            # Nothing to check means there is no evidence the text is English
            return False

        known = sum(1 for word in words if dictionary.check(word))
        return known >= len(words) / 2

    # astype(bool) keeps a boolean dtype even when the batch is empty
    return batch['comment'].apply(looks_english).astype(bool)

In [None]:
from IPython.display import display, HTML
from alive_progress import alive_bar
from tqdm import tqdm
import pandas as pd
import time
import sys

# Batch processing
batch_size = 1000  # Number of rows to process in each batch
num_rows = len(df)

# NOTE(review): seeding the accumulator with an empty float Series appears to
# make pd.concat add an extra all-NaN column labelled 0 in front of the real
# columns. The later cleanup cell removes that column, so the seed is kept
# unchanged here - confirm before replacing it with an empty DataFrame.
result = pd.Series([], dtype='float64')  # Store the results

# Ceiling division: exactly the number of iterations the loop below performs.
# The previous `num_rows // batch_size + 1` counted one batch too many when
# num_rows was a multiple of batch_size, so the bar never reached 100%.
num_batches = (num_rows + batch_size - 1) // batch_size

# Initialize a progress bar (default width; `ncols` is a display width in
# characters, not a step count)
with tqdm(total=num_batches) as pbar:
    # Process each batch
    for i in range(0, num_rows, batch_size):
        # Extract a batch of rows from the DataFrame
        batch = df.iloc[i:i+batch_size]

        # Keep only the rows of the batch flagged as English
        batch_english = batch.loc[is_english_batch(batch)]

        # Concatenate the English rows to the result
        result = pd.concat([result, batch_english])

        # Update the progress bar
        pbar.update(1)

# Reset the index of the resulting DataFrame
result.reset_index(drop=True, inplace=True)

# Show the first rows of the filtered DataFrame
result.head(5)

In [None]:
import pandas as pd

# Compare the unfiltered comments (df1) with the English-only result (df2)
df1, df2 = df, result

# Print the size of each database
print(f'Current database: {len(result)}')
print(f'Original database: {len(df)}')
print(f'Difference: {len(df) - len(result)}')

# Rows whose 'comment' appears in df1 but not in df2
only_in_df1 = ~df1['comment'].isin(df2['comment'])
diff_df1 = df1[only_in_df1]

# Rows whose 'comment' appears in df2 but not in df1
only_in_df2 = ~df2['comment'].isin(df1['comment'])
diff_df2 = df2[only_in_df2]

# Stack both sets of differing rows and renumber the index from zero
diff_combined = pd.concat([diff_df1, diff_df2]).reset_index(drop=True)

# Show the first differences
diff_combined.head(5)

In [None]:
# Add a new column with the length of each comment
result['text_length'] = result['comment'].apply(len)

# Add a new column with the word count of each comment
result['word_count'] = result['comment'].apply(lambda x: len(x.split()))

# Filter out rows with word count less than or equal to 5
result = result[result['word_count'] > 5]

# Drop the placeholder column labelled 0 by name instead of by position.
# NOTE(review): that column appears to come from the empty Series used to seed
# `result` in the batch-filtering cell. Dropping "whatever is first" would
# silently delete a real column (e.g. 'id') if the placeholder were absent.
result = result.drop(columns=[0], errors='ignore')

# Show the first 5 rows of the resulting DataFrame
result.head(5)

# Save the pre-processed DataFrame next to the other data files. The
# preprocessing cell reads this file from `path_original_data`, so it is
# written there rather than to the current working directory.
result.to_csv(os.path.join(path_original_data, 'pre_processed_comment_data_demo.csv'), index=False)

# PREPROCESS

In [None]:
# Import necessary modules
import pandas as pd
import numpy as np
import json
import os

# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Directory that holds the original data
path_original_data = r'C:\Users\Usuario\Documents\JupyterFolder\unimi_files\IR'

# Full path of the pre-processed comment CSV inside that directory
csv_file_path = os.path.join(
    path_original_data,
    'pre_processed_comment_data_demo.csv',
)

# Load the pre-processed comments into a DataFrame
df = pd.read_csv(csv_file_path, low_memory=False)

## Punctuation Removal

In [None]:
import string

# Characters kept verbatim by remove_punctuation(); built once, not per call
_ALLOWED_CHARS = frozenset(string.ascii_letters + string.digits + ' ')


# Function to remove punctuation from a text
def remove_punctuation(text):
    """Return *text* reduced to ASCII letters, digits and spaces.

    Every character outside that set is dropped, with one exception: other
    whitespace (newlines, tabs, non-breaking spaces, ...) is turned into a
    plain space. Previously those characters were deleted outright, which
    fused the words on either side ("end\\nstart" became "endstart").

    Note that non-ASCII letters (e.g. accented characters) are removed too.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    kept = []
    for char in text:
        if char in _ALLOWED_CHARS:
            kept.append(char)
        elif char.isspace():
            # Preserve the word boundary that this whitespace marked
            kept.append(' ')
    return ''.join(kept)

# Strip punctuation from every comment and lower-case the result in one pass;
# the cleaned text is stored in the new column 'processed_comment'
df['processed_comment'] = df['comment'].apply(remove_punctuation).str.lower()

## Tokenization

In [None]:
import re

# Function to tokenize a text
def tokenization(text):
    """Split *text* on runs of whitespace and return the list of tokens."""
    return text.split()

# Tokenize every processed comment; the token lists go into 'comment_tokenized'
df['comment_tokenized'] = df['processed_comment'].apply(tokenization)

## Stopword Removal

In [None]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np

# Download the required NLTK resources (uncomment if needed)
# nltk.download('wordnet')
# nltk.download('omw-1.4')

np.random.seed(400)

# Single WordNet lemmatizer instance shared by every lemmatizer() call
wordnet_lemmatizer = WordNetLemmatizer()

# Function for lemmatization
def lemmatizer(text):
    """Return a list with the WordNet lemma of each word in *text*.

    *text* is iterated word by word, so it is expected to be a sequence of
    tokens rather than a raw string.
    """
    return list(map(wordnet_lemmatizer.lemmatize, text))

# Stopword removal: nothing else in this notebook creates 'comment_key_words',
# so the lemmatization below used to fail with a KeyError. Build the column
# from the tokenized comments by dropping gensim's STOPWORDS, unless an
# earlier step has already provided it.
if 'comment_key_words' not in df.columns:
    df['comment_key_words'] = df['comment_tokenized'].apply(
        lambda tokens: [token for token in tokens if token not in STOPWORDS]
    )

# Apply the lemmatizer() function to the 'comment_key_words' column and store the result in a new column 'comment_lemmatized'
df['comment_lemmatized'] = df['comment_key_words'].apply(lemmatizer)

## Stemming

## Lemmatization

## Gensim preprocessing

In [None]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import nltk
import json

np.random.seed(400)

# Download the required NLTK resource (uncomment if needed)
# nltk.download('wordnet')

# English Snowball stemmer shared by lemmatize_stemming()
stemmer = SnowballStemmer("english")

# Maps each stemmed word to the list of original tokens that produced it
reference_sheet = dict()

# Function to lemmatize and stem a word
def lemmatize_stemming(text):
    """Lemmatize *text* as a verb with WordNet, then return its Snowball stem."""
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

# Tokenize, lemmatize, and filter stopwords
def preprocess(text):
    """Return the stemmed tokens of *text*, skipping stopwords and short tokens.

    The text is tokenized with gensim's simple_preprocess. Tokens that are in
    STOPWORDS or have two characters or fewer are ignored. Every kept token is
    passed through lemmatize_stemming(), and the original token is recorded in
    the module-level `reference_sheet` under its stemmed form (once per token).
    """
    stems = []
    for token in gensim.utils.simple_preprocess(text):
        if token in STOPWORDS or len(token) <= 2:
            continue

        word = lemmatize_stemming(token)

        # Remember which surface forms map to this stem, without duplicates
        sources = reference_sheet.setdefault(word, [])
        if token not in sources:
            sources.append(token)

        stems.append(word)
    return stems

# Tokenize, lemmatize, and filter verbs
def preprocess_verbs(text):
    """Return the stemmed tokens of *text* with verbs removed.

    The text is tokenized with gensim's simple_preprocess and tagged with
    nltk.pos_tag. Tokens tagged with any verb tag (VB, VBD, VBG, VBN, VBP,
    VBZ) are discarded, as are stopwords and tokens of two characters or
    fewer. The remaining tokens are passed through lemmatize_stemming().
    """
    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    tokens = gensim.utils.simple_preprocess(text)
    return [
        lemmatize_stemming(token)
        for token, tag in nltk.pos_tag(tokens)
        if tag not in verb_tags and token not in STOPWORDS and len(token) > 2
    ]

# Build both gensim-based columns from the raw comments, printing a progress
# message after each one
print('Start.')
for column, transform, message in (
    ('gensim_comment', preprocess, 'Next.'),
    ('gensim_comment_verbs', preprocess_verbs, 'Finish.'),
):
    df[column] = df['comment'].apply(transform)
    print(message)

# Save reference sheet as a JSON file
json_data = json.dumps(reference_sheet)
with open('reference_sheet.json', 'w') as file:
    file.write(json_data)

## Restructuring the dataset

In [None]:
# Get the list of column names
columns = list(df.columns)
print(columns)

# Keep only comments with at least 5 tokens left after gensim preprocessing
df = df[df['gensim_comment'].map(len) >= 5]
df = df.reset_index(drop=True)

# Calculate the average number of tokens in 'gensim_comment'
average_length = df['gensim_comment'].map(len).mean()
print(average_length)

# Save the DataFrame next to the other data files. The following cell reads
# this file from `path_original_data`, so it is written there rather than to
# the current working directory.
# NOTE(review): list-valued columns are presumably stored as their string
# representation in the CSV - confirm before relying on them after a reload.
df.to_csv(os.path.join(path_original_data, 'post_processed_comment_data_demo.csv'), index=False)

# Display a sample of up to 5 rows (sample(5) raises on fewer than 5 rows)
df.sample(min(5, len(df)))

In [None]:
import pandas as pd
import numpy as np
import json
import os

# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Set the path to the original data directory
path_original_data = r'C:\Users\Usuario\Documents\JupyterFolder\unimi_files\IR'

# Read the post-processed CSV file into a DataFrame
df = pd.read_csv(os.path.join(path_original_data, 'post_processed_comment_data_demo.csv'), low_memory=False)

# Display a sample of up to 5 rows (sample(5) raises on fewer than 5 rows)
df.sample(min(5, len(df)))

In [None]:
# Check the number of posts that contain specific words.
# The counts are printed one per line, in this order.
keywords = [
    'luck', 'random', 'boring', 'complex', 'complicated', 'bookkeeping',
    'edition', 'version', 'expansion',
]
for keyword in keywords:
    # na=False treats a missing comment as "no match" instead of raising
    print(len(df[df.comment.str.contains(keyword, na=False)]))

# Display a sample of up to 5 rows that contain the word 'boring'
# (sample(5) raises when fewer than 5 rows match)
boring_comments = df[df.comment.str.contains('boring', na=False)]
boring_comments.sample(min(5, len(boring_comments)))