In [1]:
#!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

In [2]:
import minsearch

In [3]:
import json

In [4]:
# Load the pre-processed records from disk. The first record displayed below
# is a dict with 'topic' and 'text' keys.
# The encoding is pinned to UTF-8 so non-ASCII characters in the text decode
# the same way on every platform instead of depending on the locale default.
with open('documents_topics.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [5]:
# Alias only: `documents` refers to the same object as `docs_raw` (no copy is made).
documents = docs_raw

In [6]:
# Peek at the first record to see which keys each document carries.
documents[0]

{'topic': 'time, inequality, fear, gender, analysis, health, year, grant, experience, experiment, expertise, family, fellowship, help, group',
 'text': 'I am writing to express my interest in the advertised post of Medical/Grant Writer. I completed my PhD in Quantitative Sociology at the University of British Columbia in 2017. Since then, I have done research at universities across the globe, including the University of Oxford and York University. I am currently an assistant professor at Western University. My research focuses on time use, social inequality, aging, and the fear of falling.'}

# Elasticsearch

Don't forget to start Elasticsearch in Docker before running the cells below:

docker run -it \
    --rm \
    --name elasticsearch \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3

In [7]:
from elasticsearch import Elasticsearch

In [8]:
# Connect to the Elasticsearch node on localhost:9200 (the port published by the Docker command above).
es_client = Elasticsearch('http://localhost:9200') 

In [9]:
# Sanity check: fetch cluster/version info to confirm the server is reachable.
es_client.info()

ObjectApiResponse({'name': 'f0cf17e51237', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'w45bI79kQvyIEfvaNrR7RA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [10]:
# Index definition: one primary shard, no replicas, and both document fields
# mapped as full-text (`text`) fields.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "topic": {"type": "text"}
        }
    }
}

index_name = "cl-questions"

# Make this cell re-runnable: creating an index that already exists raises an
# error, and indexing into a leftover index would store every document again.
# Dropping any previous copy first guarantees a clean, empty index on each run.
# `ignore_unavailable=True` keeps the delete from failing on the very first run.
es_client.indices.delete(index=index_name, ignore_unavailable=True)

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'cl-questions'})

In [11]:
# Re-check one record before indexing: its 'topic' and 'text' keys are the fields declared in the index mapping.
documents[0]

{'topic': 'time, inequality, fear, gender, analysis, health, year, grant, experience, experiment, expertise, family, fellowship, help, group',
 'text': 'I am writing to express my interest in the advertised post of Medical/Grant Writer. I completed my PhD in Quantitative Sociology at the University of British Columbia in 2017. Since then, I have done research at universities across the globe, including the University of Oxford and York University. I am currently an assistant professor at Western University. My research focuses on time use, social inequality, aging, and the fear of falling.'}

In [12]:
from tqdm.auto import tqdm

In [13]:
# Index the documents one request at a time. No `id` is passed, so each call
# adds a new document rather than overwriting an existing one.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

# Force a refresh so that every document indexed above is visible to the
# searches below, even when the notebook is executed top to bottom without pauses.
# The trailing semicolon suppresses the response repr in the cell output.
es_client.indices.refresh(index=index_name);

  0%|          | 0/141 [00:00<?, ?it/s]

In [14]:
# Example selection criterion, used as the query for the retrieval and RAG cells below.
query = 'Strong understanding and experience working on issues related to anti-racism, equity, diversity, and inclusion concepts and principles'

In [15]:
def elastic_search(query, num_results=5):
    """Retrieve the documents that best match ``query`` from Elasticsearch.

    Runs a ``multi_match`` query of type ``best_fields`` over the ``topic``
    and ``text`` fields (``text`` boosted by 3) against the index named by the
    module-level ``index_name``, using the module-level ``es_client``.

    Args:
        query: Free-text search string.
        num_results: Maximum number of hits to request. Defaults to 5.

    Returns:
        A list holding the ``_source`` dict of every hit, in the order the
        hits appear in the response.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "text^3" weights matches in the text field three times
                        # as much as matches in the topic field.
                        "fields": ["topic", "text^3"],
                        "type": "best_fields"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    # Keep only the stored documents; scores and other hit metadata are dropped.
    return [hit['_source'] for hit in response['hits']['hits']]

In [16]:
from openai import OpenAI

In [17]:
# Create the OpenAI client. No credentials are passed here, so they are presumably
# read from the environment (e.g. OPENAI_API_KEY) — confirm before running.
client = OpenAI()

In [18]:
def search(query):
    """Return the five best matches for ``query`` from the module-level ``index``.

    Matches in the ``text`` field are weighted 3.0 and matches in the
    ``topic`` field 0.5.

    NOTE(review): ``index`` is not defined anywhere in the visible notebook;
    confirm it is built before this function is called, otherwise the call
    raises ``NameError``.
    """
    field_weights = {'topic': 0.5, 'text': 3.0}
    return index.search(query=query, boost_dict=field_weights, num_results=5)

In [19]:
def build_prompt(query, search_results):
    """Build the LLM prompt for one selection criterion and its retrieved context.

    Args:
        query: The selection criterion to address; inserted after ``CRITERION:``.
        search_results: Iterable of dicts, each with ``'topic'`` and ``'text'``
            keys, rendered one after another under ``CONTEXT:``.

    Returns:
        The prompt string with leading and trailing whitespace stripped.

    Raises:
        KeyError: If a search result lacks the ``'topic'`` or ``'text'`` key.
    """
    # The instructions refer to the CRITERION throughout, so they agree with
    # the section label used further down (the prompt has no QUESTION section).
    # Built from quoted lines so the trailing space after "CONTEXT:" cannot be
    # lost to editors that trim trailing whitespace.
    prompt_template = "\n".join([
        "You're a job applicant with a PhD. Write a paragraph addressing the CRITERION based on the CONTEXT from the database.",
        "Use only the facts from the CONTEXT when answering the CRITERION. Do not add additional information.",
        "",
        "CRITERION: {question}",
        "",
        "CONTEXT: ",
        "{context}",
    ])

    # One "topic/text" entry per document, each followed by a blank line.
    context = "".join(
        f"topic: {doc['topic']}\ntext: {doc['text']}\n\n"
        for doc in search_results
    )

    # The final strip removes the blank line left after the last document.
    return prompt_template.format(question=query, context=context).strip()

In [20]:
def llm(prompt, model='gpt-4'):
    """Send ``prompt`` as a single user message and return the reply text.

    Uses the module-level ``client`` to call the chat completions endpoint.

    Args:
        prompt: Full prompt text, sent as the only message (role ``user``).
        model: Name of the chat model to use. Defaults to ``'gpt-4'``.

    Returns:
        The ``content`` of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    # Only the first choice is read; any further choices are ignored.
    return response.choices[0].message.content

In [21]:
# Run retrieval on its own for the example criterion so the hits can be inspected before calling the LLM.
search_results = elastic_search(query)

In [22]:
# Show the raw retrieved documents.
# NOTE(review): the first two hits in the saved output look identical, which suggests
# the documents were indexed more than once — confirm against a freshly created index.
search_results

[{'topic': 'organization, issue, proposal, expertise, fellowship, gender, project, output, experience, student, housework, help, health, group, equality',
  'text': 'I have extensive experience working on cutting-edge projects in demography, sociology, and medical sociology, where I have researched issues such as sexism, ageism, and racism. These societal concerns are often reflected in the selection criteria of many granting organizations. For example, Tri-Council fellowships require applicants to clearly demonstrate how their research addresses gender-related issues and inequities. As a sociologist, I am trained to align proposals with these priorities. My expertise and experience make me well-suited to guide graduate students in adapting their proposals to meet the requirements of granting organizations.'},
 {'topic': 'organization, issue, proposal, expertise, fellowship, gender, project, output, experience, student, housework, help, health, group, equality',
  'text': 'I have exten

In [23]:
def rag(query):
    """Answer ``query`` end to end: retrieve context, build the prompt, ask the LLM.

    The documents come from ``elastic_search``, are rendered into a prompt by
    ``build_prompt``, and the text returned by ``llm`` is passed back unchanged.
    """
    retrieved_docs = elastic_search(query)
    return llm(build_prompt(query, retrieved_docs))

In [24]:
# Full pipeline (retrieve, build prompt, generate) for the example criterion stored in `query`.
rag(query)

'My experience and expertise in researching societal issues has equipped me with a deep understanding of concepts and principles related to anti-racism, equity, diversity, and inclusion. Working on projects in demography, sociology, and medical sociology, I have explored relatively complex issues such as sexism, ageism, and racism. As a sociologist, I have navigated these subjects by aligning my proposals with the priorities of different granting organizations focused on gender-related problems and inequalities. I have also served on the Self-Assessment Team at Oxford University, which aids in preparing the Sociology Department’s application for the prestigious Athena SWAN bronze award in gender equality, showing my dedication to diversity and inclusion initiatives. Furthermore, I have partaken in administrative activities rooted in advancing diversity and inclusion, attesting to my robust understanding of these important principles.'

In [25]:
# Same pipeline, different selection criterion: experience with survey data.
rag('Previous experience working with survey data, including demographic questions, experiential questions, and analyzing qualitative data.')

'Based on my previous research expertise, I have amassed a wealth of experience in working with complex survey data that spans across demographic and experiential questions among others. For instance, in my study of global societies, I have extensively dealt with prominent national-level data such as the Canadian General Social Survey, American Time Use Survey, and Japanese Survey on Time Use and Leisure Activities, to name but a few. I also have experience with multinational surveys including the International Social Survey Programme, Multinational Time Use Study (MTUS), and Survey of Health, Ageing, and Retirement in Europe. Further, within these surveys, I have harmonized datasets from different cultural contexts, thus gaining significant experience in analyzing qualitative data. Coupled with my skills in programming languages such as R and Python, I have developed a number of data visualization packages that ease analysis and visualization of data, especially in the realm of time-u

In [26]:
# Same pipeline, different selection criterion: data visualization.
rag('Understanding the importance of data visualization.')

"As a holder of a doctorate in quantitative sociology from the University of British Columbia and currently a postdoctoral fellow at York University, I am well-versed in the importance of data visualization in research and academia. I've utilized my expertise in data analysis and visualization to develop online visualization tools for various projects. These tools, such as the one I created for ATUS-X caregivers’ data, effectively translate complex data for academics working with time use data. When I noticed the usage struggles of the original PHP+SQL-based tool, I transformed it into more accessible R packages. My portfolio showcases select visualizations and underlines my consistent endeavor to generate user-friendly, interactive data visualizations to enhance the understanding of the data. I am confident that my extensive experience and knowledge in creating effective visualizations will be a valuable asset."

In [29]:
# Same pipeline, different selection criterion: office and statistical software proficiency.
rag('Proficient in MS Office Suite e.g. Word, Excel, PowerPoint, etc. and statistical analysis software e.g. SPSS, SAS, Stata, R.')

'In alignment with your criterion, I offer extensive skills in MS Office Suite, with particular proficiency in Excel, essential for managing and analyzing data. Moreover, my proficiency extends to key statistical software, including Stata and SPSS, enabling robust data analysis and research. My competency in such tools is reflected in numerous projects, as documented on my GitHub pages. Additionally, I am experienced in using programming languages such as R and Python, further expanding my capabilities in data management and statistical analysis. With my technical proficiencies and vast experience, I am well equipped to bring a significant contribution to your team, particularly in handling large datasets, overseeing data analysis, and delivering data-driven insights.'

In [32]:
# Same pipeline, different selection criterion: working across cultural contexts.
rag('Comfortable working in diverse cultural contexts, and living in different countries.')

'Throughout my academic and research career, I have shown a deep commitment to working in diverse cultural contexts. This interest is evidenced by my research and analysis of data sources from across North America, Europe, Central Asia and East Asia. Over the years, I have acquired a unique proficiency working with national and multinational datasets from these regions. This has not only reinforced my adaptability and versatility while working with diverse cultures but also familiarized me with the methodological nuances of dealing with multiple data sources. In addition, my broad-based research on gender inequality and unpaid work across different socio-cultural contexts has been published in leading journals, a further testament of my versatility and comfort in diverse settings. My experiences ensure that I am well equipped to work and live in varying cultural contexts which in turn fosters my commitment towards enriching the roles I undertake.'

In [34]:
# Same pipeline, different selection criterion: quantitative and qualitative analytical skills.
rag('Demonstrated quantitative and qualitative analytical skills')

'I have indeed demonstrated my quantitative and qualitative analytical skills throughout my academic and research career. My proficiency in both quantitative and qualitative research was honed from my tenure at Oxford where I used state-of-the-art quantitative methodologies including sequence analysis and multi-level mixed-effect models to analyze and investigate the variations in time-use among women caregivers and older adults. My expertise extends to a diverse range of research interests, machine learning included, which I plan to utilize to its full extent for further growth and development of this project, especially given the large complex legacy data. Recently, I led workshops on sequence analysis at Oxford, teaching early career researchers, showcasing my ability to impart knowledge as well as learn from it. I also have a robust training in statistics and am able to use a variety of programming languages and analysis tools, which I have employed in my research and communication