In [32]:
import pandas as pd
import os
import json
import unicodedata
import re
from ast import literal_eval as string_to_list
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

# Both CSV files are read from the parent of the current working directory.
DATA_DIR = os.path.dirname(os.getcwd())
COUNT_COLUMNS = ['Plays', 'Playing', 'Backlogs', 'Wishlist', 'Lists', 'Reviews']
LIST_COLUMNS = ['Developers', 'Platforms', 'Genres']


def parse_count(value):
    '''
    Convert an abbreviated count to a float.

    A trailing 'K' means thousands ('3.9K' -> 3900.0); a value without the
    suffix is already an absolute count and is returned unscaled
    ('763' -> 763.0).
    '''
    text = str(value).strip()
    if text.upper().endswith('K'):
        return float(text[:-1]) * 1000
    # No suffix: the number is not in thousands, so it must not be multiplied.
    return float(text)


queries = pd.read_csv(os.path.join(DATA_DIR, 'queries.csv'))
games = pd.read_csv(os.path.join(DATA_DIR, 'backloggd_games.csv'), index_col=0)

# Missing summaries become empty strings so later text handling sees only str.
games['Summary'] = games['Summary'].fillna('')
# Keep the first row for each title and renumber the index.
games = games.drop_duplicates(subset='Title', ignore_index=True)

games[COUNT_COLUMNS] = games[COUNT_COLUMNS].map(parse_count)
# These columns hold Python list literals stored as text; parse them into lists.
games[LIST_COLUMNS] = games[LIST_COLUMNS].map(string_to_list)

# Download the NLTK resources this notebook relies on: the tokenizer models
# ('punkt'), the stop-word lists ('stopwords') and the WordNet corpus
# ('wordnet'). Each call is a no-op when the package is already up to date.
# NOTE(review): newer NLTK releases may also require 'punkt_tab' for
# word_tokenize -- confirm against the installed NLTK version.
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/sean/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/sean/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sean/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
def query_processing(query, expansion_terms, developers, platforms, genres):
    '''
    Normalise a raw query, expand domain-specific terms and tag the known
    developers, platforms and genres it mentions.

    Parameters
    ----------
    query : str
        The raw query text.
    expansion_terms : dict
        Maps a term to the text that is inserted in front of the term's first
        whole-word occurrence in the normalised query.
    developers, platforms, genres : iterable of str
        Known entity names. They are compared verbatim against the processed
        (lowercased, punctuation-free) query, so they should be lowercased.

    Returns
    -------
    dict
        Keys 'Original', 'Processed', 'Developers', 'Platforms' and 'Genres'.
        The three entity lists are sorted so the result is reproducible even
        when the candidates are passed as sets.
    '''
    def query_normalisation(query):
        '''Lowercase, strip accents and punctuation, drop stopwords, lemmatise.'''
        # 1. Lowercase
        query = query.lower()

        # 2. Remove accents: decompose characters, then drop the combining marks
        query = unicodedata.normalize('NFKD', query)
        query = ''.join(c for c in query if not unicodedata.combining(c))

        # 3. Remove punctuation (hyphens become spaces so hyphenated words split)
        query = query.replace('-', ' ')
        query = re.sub(f'[{re.escape(string.punctuation)}]', '', query)
        query = re.sub(r'\s+', ' ', query).strip()

        # 4. Remove stopwords and lemmatise
        stop_words = set(stopwords.words('english'))
        lemmatiser = WordNetLemmatizer()
        tokens = [word for word in word_tokenize(query) if word not in stop_words]
        return ' '.join(lemmatiser.lemmatize(word) for word in tokens)

    def query_expansion(query, expansion_terms, synonym_expansion=True):
        '''
        Insert domain expansions and, optionally, WordNet synonyms.

        synonym_expansion: True uses the default of 2 synonyms per token, an
        int sets that cap explicitly, and False/0 disables synonym expansion.
        '''
        if synonym_expansion is True:
            synonym_expansion = 2

        def get_synonyms(token):
            # A dict keeps first-seen order, so slicing the result always
            # yields the same synonyms (a set would make the choice arbitrary).
            synonyms = {}
            for syn in wordnet.synsets(token):
                for lemma in syn.lemmas():
                    # WordNet uses underscores for multi-word synonyms
                    # e.g. air_conditioner -> air conditioner
                    synonym = lemma.name().replace('_', ' ')
                    if synonym.lower() != token.lower():
                        synonyms.setdefault(synonym)
            return list(synonyms)

        # Expand the query with synonyms in domain-specific terms.
        # Terms are applied one after another, so a later term can also match
        # text inserted by an earlier one.
        for term, expansion in expansion_terms.items():
            if not term:
                continue
            # Whole-word match only: a bare substring test would fire inside
            # longer words (e.g. 'rts' in 'parts') and corrupt them.
            match = re.search(rf'(?<!\S){re.escape(term)}(?!\S)', query)
            if match:
                start = match.start()
                query = query[:start] + expansion + ' ' + query[start:]

        expanded_tokens = []
        for token in query.split():
            expanded_tokens.append(token)
            if synonym_expansion:
                expanded_tokens.extend(get_synonyms(token)[:synonym_expansion])

        # TODO: Do we necessarily want to remove duplicates?
        # dict.fromkeys drops repeats while keeping first-occurrence order.
        return ' '.join(dict.fromkeys(expanded_tokens))

    def query_parsing(query, developers, platforms, genres):
        '''
        Parse the query to extract developers, platforms, genres, and years.

        Names are matched as whole space-delimited phrases, so multi-word
        names (e.g. 'nintendo switch') are found as well as single words.
        Matching runs on the processed query, so a phrase whose words were
        separated by duplicate removal will not match.
        '''
        def extract_years(query):
            # Regex pattern: matches 1980–1989, 1990–1999, 2000–2099
            pattern = r'\b(19[8-9]\d|20\d{2})\b'
            return [int(year) for year in re.findall(pattern, query)]

        # Pad once so every name can be tested as ' name ' against the query.
        padded = f' {query} '

        def find_mentions(names):
            return sorted(name for name in names if name and f' {name} ' in padded)

        return {
            'Developers': find_mentions(developers),
            'Platforms': find_mentions(platforms),
            'Genres': find_mentions(genres),
            'Years': extract_years(query),
        }

    normalised_expanded = query_expansion(query_normalisation(query), expansion_terms, synonym_expansion=False)
    parsed = query_parsing(normalised_expanded, developers, platforms, genres)
    # NOTE(review): parsed['Years'] is computed but not included in the output;
    # confirm whether a 'Years' column is wanted before adding it.
    output = {
        'Original': query,
        'Processed': normalised_expanded,
        'Developers': parsed['Developers'],
        'Platforms': parsed['Platforms'],
        'Genres': parsed['Genres'],
    }
    return output

In [34]:
# Gather every distinct developer, platform and genre name across all games.
developer_set, platform_set, genre_set = set(), set(), set()
for developers, platforms, genres in zip(games['Developers'], games['Platforms'], games['Genres']):
    developer_set.update(developers)
    platform_set.update(platforms)
    genre_set.update(genres)

# Lowercase the names so they can be compared with the lowercased queries.
developer_set = {name.lower() for name in developer_set}
platform_set = {name.lower() for name in platform_set}
genre_set = {name.lower() for name in genre_set}

# Domain-specific expansion table, read from the working directory.
with open('expansion_terms.json', 'r') as json_file:
    expansion_terms = json.load(json_file)

# One result row per query, written out next to the notebook.
processed_queries = pd.DataFrame([
    query_processing(query, expansion_terms, developer_set, platform_set, genre_set)
    for query in queries['Query']
])
processed_queries.to_csv('processed_queries.csv', index=False)

In [35]:
# import os
# import re
# import pandas as pd
# import nltk
# from nltk.corpus import stopwords, wordnet
# from nltk.tokenize import word_tokenize
# from spellchecker import SpellChecker
# import spacy
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# import subprocess
# import sys
# try:
#     nlp = spacy.load('en_core_web_sm')
# except OSError:
#     subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
#     nlp = spacy.load('en_core_web_sm')
# stop_words = set(stopwords.words('english'))
# spell = SpellChecker()
# def get_synonyms(word):
#     synonyms = set()
#     for syn in wordnet.synsets(word):
#         for lemma in syn.lemmas():
#             synonyms.add(lemma.name().replace('_', ' '))
#     return list(synonyms)
# def process_query(query):
#     query = query.lower()
#     query = re.sub(r"[^\w\s-]", "", query)
#     query = re.sub(r"[-]", " ", query)
#     token = word_tokenize(query)
#     tokens = [
#         spell.correction(word)
#         for word in token
#         if word not in stop_words and spell.correction(word) is not None
#     ]
#     doc = nlp(' '.join(tokens))
#     important_terms = [token.lemma_ for token in doc if token.pos_ in ['NOUN', 'PROPN', 'VERB']]
#     expanded = set(important_terms)
#     for word in important_terms:
#         synonyms = get_synonyms(word)
#         expanded.update(synonyms[:2])
#     return list(expanded)
# df = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()), 'queries.csv'))
# df['processed'] = df['Query'].apply(process_query)
# df['processed_string'] = df['processed'].apply(lambda tokens: ' '.join(tokens))
# df.to_csv('processed_queries.csv', index=False)