In [2]:
import pandas as pd
import re
import spacy
from spacy import displacy

# Load spaCy's small English pipeline; later cells call `nlp(...)` to tokenize and lemmatize text.
nlp = spacy.load("en_core_web_sm")
# Read the Q&A pairs, requesting str dtype for every column and decoding the bytes as latin-1.
# NOTE(review): the characters ¥ É Ð Ñ Ò Ó Õ that show up in this data look like Mac Roman
# punctuation (bullet, ellipsis, dashes, curly quotes) decoded as latin-1 — confirm the file's
# real encoding before changing it, because a later cell remaps exactly those characters.
df = pd.read_csv("input/qa-pairs.csv", encoding="latin-1", dtype="str")
# Preview the first five rows.
df.head()

Unnamed: 0,question,answer,partners_personnel_selected
0,Checking the financial stability of our suppli...,Our fiscal year begins in January and ends in ...,0
1,How many candidates do you have in your database?,"As of today, 4/25/2023, we have the following:...",0
2,How do you select which candidates are placed ...,Our specialized role known as the client speci...,0
3,How often do you review and refresh your datab...,Our sourcing platform is updated throughout th...,0
4,What type of background and drug testing do yo...,AccuSource is our service provider for backgro...,0


In [3]:
# Character class matching anything that is not an ASCII letter, an ASCII digit, or whitespace.
pattern = r"[^a-zA-Z0-9\s]"
non_alphanumeric = set()

# Collect every distinct non-alphanumeric character in the questions and answers.
# Missing cells are dropped up front for BOTH columns: previously only "answer" was
# coerced with str(), so a missing "question" reached re.findall as a float and raised
# TypeError. Skipping a missing cell yields the same set as coercing it, because
# str(NaN) == "nan" contains no matching character.
for column in ("question", "answer"):
    for text in df[column].dropna():
        non_alphanumeric.update(re.findall(pattern, str(text)))

In [4]:
# Display the set of distinct characters gathered in `non_alphanumeric`.
non_alphanumeric

{'"',
 '%',
 '&',
 "'",
 '(',
 ')',
 ',',
 '-',
 '.',
 '/',
 ':',
 '>',
 '?',
 '¥',
 'É',
 'Ð',
 'Ñ',
 'Ò',
 'Ó',
 'Õ'}

In [63]:
# Replacement text for each stray single character found in the data.
# NOTE(review): the keys look like Mac Roman punctuation decoded as latin-1 — TODO confirm.
mapping = {
    "¥": "-",
    "É": "...",
    "Ð": "—",
    "Ñ": "—",
    "Ò": '"',
    "Ó": '"',
    "Õ": "'",
}


def replace_non_alphanumeric(text):
    """Return ``str(text)`` with each key of ``mapping`` swapped for its value.

    Args:
        text: Any object; it is converted with ``str()`` first, so non-string
            input (for example ``None`` or a float NaN) comes back as its
            string form.

    Returns:
        The converted string with every mapped character replaced. Characters
        that are not keys of ``mapping`` are left untouched.
    """
    # The table is built on each call so the function always reflects the
    # current contents of ``mapping``. str.maketrans requires single-character
    # keys, which every key above is.
    translation_table = str.maketrans(mapping)
    return str(text).translate(translation_table)


# Run the character replacement over both text columns, one column at a time,
# writing the result back into the same column of `df`.
for text_column in ("question", "answer"):
    df[text_column] = df[text_column].apply(replace_non_alphanumeric)

In [56]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Per-question list of lowercased entity lemmas, filled in by the loop below.
ents = []
stop_words = set(stopwords.words('english'))
# Punctuation and whitespace tokens to drop in addition to the NLTK stop words.
stop_list = [",", "?", "\n", "/", ".", "(", ")", "&", "-", "\t\t", " "]
stpwrd = stopwords.words('english')
stpwrd.extend(stop_list)
# Frozen-set copy of `stpwrd` for the membership test inside the loop; testing
# against the list itself scans the whole list for every token.
stpwrd_lookup = frozenset(stpwrd)

# One list of kept lemmas per question, so the result is available after the
# loop instead of only being printed.
question_lemmas = []
for question in df["question"]:
    doc = nlp(question)
    lemmas = []
    for token in doc:
        lemma = token.lemma_.lower()  # lowercase once, reuse for filter and output
        if lemma not in stpwrd_lookup:
            lemmas.append(lemma)
    question_lemmas.append(lemmas)
    ents.append([ent.lemma_.lower() for ent in doc.ents])
    print(lemmas)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hoon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['check', 'financial', 'stability', 'supplier', 'part', 'due', 'diligence', 'process', 'support', 'please', 'tell', 'company', 'financial', 'year', 'run']
['many', 'candidate', 'database']
['select', 'candidate', 'place', 'instance', 'two', 'buyer', 'look', 'candidate', 'ultimately', 'determine', 'company', 'place', 'employee']
['often', 'review', 'refresh', 'database', 'candidate']
['type', 'background', 'drug', 'testing', 'conduct', 'extensive', 'service', 'provider', 'background', 'checking']
['question', 'use', 'interview', 'process', 'please', 'provide', 'several', 'example', 'elaborate', 'evaluation', 'technique', 'may', 'michaels', 'submit', 'question', 'addition']
['please', 'advise', 'michaels', 'utilize', 'hiring', 'onboarding', 'process', 'example', 'provide', 'approval', 'onboarde', 'training']
['member', 'american', 'staffing', 'association', 'certification', 'hold']
['determine', 'employee', 'need', 'coverage', 'aca']
['diversity', 'business', 'certification', 'eg', 'fema

In [62]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# NOTE: `ents` is deliberately not re-initialised here. This cell never appends
# to it, and resetting it to [] threw away the entities the question cell had
# collected.
stop_words = set(stopwords.words('english'))
# Punctuation and whitespace tokens to drop in addition to the NLTK stop words.
stop_list = [",", "?", "\n", "/", ".", "(", ")", "&", "-", "\t\t", " "]
stpwrd = stopwords.words('english')
stpwrd.extend(stop_list)
# Frozen-set copy of `stpwrd` for the membership test inside the loop; testing
# against the list itself scans the whole list for every token.
stpwrd_lookup = frozenset(stpwrd)

# One list of kept lemmas per answer, so the result is available after the
# loop instead of only being printed.
answer_lemmas = []
for answer in df["answer"]:
    doc = nlp(answer)
    lemmas = []
    for token in doc:
        lemma = token.lemma_.lower()  # lowercase once, reuse for filter and output
        if lemma not in stpwrd_lookup:
            lemmas.append(lemma)
    answer_lemmas.append(lemmas)
    print(lemmas)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hoon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['fiscal', 'year', 'begin', 'january', 'end', 'december', 'contain', 'four', '5', 'week', 'period', 'eight', '4', 'week', 'period', 'week', 'end', 'sunday']
['today', '4/25/2023', 'following', ':', '2060', 'associate', 'successfully', 'complete', 'assignment', 'available', 'work', '3464', 'associate', 'onboarde', 'within', 'last', '30', 'day', 'available', 'work', '4117', 'associate', 'go', 'application', 'process', 'total', '57083', 'associate', 'active', 'unassigned', 'system']
['specialized', 'role', 'know', 'client', 'specialist', 'dedicated', 'resource', 'customer', 'primary', 'role', 'intake', 'order', 'evaluate', 'priority', 'review', 'candidate', 'customer', 'opportunity', 'present', 'instance', 'compete', 'order', 'candidate', 'would', 'vet', 'role', 'ensure', 'standard', 'customer', 'willing', 'take', 'position', 'ultimately', 'client', 'specialist', 'work', 'find', 'right', 'person', 'job']
['source', 'platform', 'update', 'throughout', 'day', 'allow', 'team', 'view', 'assoc