In [14]:
import os
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker
import spacy
# Fetch the NLTK resources this script relies on: the tokenizer models used by
# word_tokenize, the English stop-word list, and WordNet for synonym lookup.
# 'punkt_tab' is requested alongside 'punkt' because recent NLTK releases
# (3.9+) load the tokenizer from 'punkt_tab'; without it word_tokenize raises
# LookupError on a fresh install. Already-present packages are only re-checked.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_resource)
import subprocess
import sys
# Load the small English spaCy pipeline. spacy.load raises OSError when the
# model package is not installed, in which case it is downloaded with the
# current interpreter and the load is retried once.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    download_cmd = [sys.executable, "-m", "spacy", "download", "en_core_web_sm"]
    subprocess.check_call(download_cmd)
    nlp = spacy.load('en_core_web_sm')

# Shared helpers used by process_query: spelling corrector and the English
# stop-word set (a set so membership tests are constant time).
spell = SpellChecker()
stop_words = {*stopwords.words('english')}
def get_synonyms(word):
    """Return the unique WordNet lemma names for *word*.

    Every synset of ``word`` is visited and the name of each of its lemmas is
    collected, with underscores replaced by spaces.

    Args:
        word: The term to look up in WordNet.

    Returns:
        A list of distinct lemma names in first-seen order (synset order,
        then lemma order within each synset). The list is empty when WordNet
        has no synsets for ``word``. The word itself may appear in the list.
    """
    lemma_names = (
        lemma.name().replace('_', ' ')
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    )
    # dict.fromkeys de-duplicates while keeping insertion order, so callers
    # that slice the result (e.g. [:2]) get the same entries on every run;
    # a plain set would make the order depend on string hashing.
    return list(dict.fromkeys(lemma_names))
def process_query(query):
    """Normalise a raw query and expand it with synonyms.

    Pipeline: lower-case the text, strip punctuation (hyphens become spaces),
    tokenise, drop stop words, spell-correct the remaining tokens, keep the
    lemmas of nouns, proper nouns and verbs, then add up to two synonyms from
    ``get_synonyms`` for each kept lemma.

    Args:
        query: Raw query text. Any non-string value (``None``, or a float NaN
            such as a missing CSV cell) is treated as an empty query.

    Returns:
        A list of unique terms: the kept lemmas in order of appearance,
        followed by the synonyms in the order they were added. Empty when the
        query has no usable text.
    """
    # Missing values would otherwise crash on .lower().
    if not isinstance(query, str):
        return []

    query = query.lower()
    query = re.sub(r"[^\w\s-]", "", query)
    query = re.sub(r"[-]", " ", query)
    if not query.strip():
        # Nothing left to tokenise; skip the NLP pipeline entirely.
        return []

    corrected = []
    for word in word_tokenize(query):
        if word in stop_words:
            continue
        # Spell correction is expensive, so run it once per token; it yields
        # None when no candidate is found, and such tokens are dropped.
        fixed = spell.correction(word)
        if fixed is not None:
            corrected.append(fixed)

    doc = nlp(' '.join(corrected))
    important_terms = [
        tok.lemma_ for tok in doc if tok.pos_ in ('NOUN', 'PROPN', 'VERB')
    ]

    # A dict is used as an insertion-ordered set so the returned list has a
    # stable order instead of depending on string hashing.
    expanded = dict.fromkeys(important_terms)
    for term in important_terms:
        expanded.update(dict.fromkeys(get_synonyms(term)[:2]))
    return list(expanded)
# Input lives one level above the current working directory; the output is
# written into the working directory itself.
parent_dir = os.path.dirname(os.getcwd())
df = pd.read_csv(os.path.join(parent_dir, 'queries.csv'))

# 'processed' holds the list of terms for each row of the 'Query' column;
# 'processed_string' is the same list flattened to a space-separated string.
df['processed'] = df['Query'].apply(process_query)
df['processed_string'] = df['processed'].apply(' '.join)

df.to_csv('processed_queries.csv', index=False)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
