In [None]:
from pathlib import Path
import re
from time import time
from typing import List

from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd


In [None]:
# Load the survey export; the workbook is expected one directory above the
# current working directory.
export_path = Path.cwd().parent / "export-2022-01-06-11-48-21.xlsx"
df = pd.read_excel(export_path)


In [None]:
# Partition the columns into three positional groups: demographics, question
# answers, and everything we don't need.
# A column counts as a question when its header contains "question"
# (case-insensitive).
question_indexes = sorted(
    position
    for position, header in enumerate(df.columns)
    if "question" in header.lower()
)
first = min(question_indexes)
last = max(question_indexes)

# `questions` is the contiguous slice from the first to the last matching
# column, so any column sitting between them is included as well.
questions = df.iloc[:, first : last + 1]
demographics = df.iloc[:, :first]
not_needed = df.iloc[:, last + 1 :]


In [None]:
# The questions section has 66 columns, of which 3 are free-text followups to
# categorical questions. Followups are recognised by the header's ending.
print(questions.shape[1])

followups = []
for header in questions.columns:
    if header.lower().endswith("please explain your answer:"):
        followups.append(header)

print(len(followups))


In [None]:
# Identify the questions that could be treated as categorical yes/no.
# There are 21 (after removing one that's in 2 parts).
# Matching is a plain case-insensitive prefix test on the header, so "do"
# also matches headers that begin with e.g. "does".
binary_starts = ("do", "should", "could", "are there", "would", "is more", "have we")

binary_questions = []
for header in questions.columns:
    if header.lower().startswith(binary_starts):
        binary_questions.append(header)

print(len(binary_questions))


In [None]:
# Quick test run: pool every non-null answer from every question column into
# one flat corpus, ready for building doc2vec on it.
texts = []
for col in questions.columns:
    answered = questions[col].dropna()
    texts += answered.tolist()


In [None]:
# Minimal preprocessing as a starting point
def preprocess(text: str) -> List[str]:
    """Tokenise ``text`` into a list of lowercase, purely alphabetic tokens.

    The input is passed through ``str()`` before tokenising, so non-string
    values are accepted too.

    Only tokens for which ``str.isalpha()`` is true are kept. This removes
    punctuation (arguably not a great idea), and it also removes numbers and
    any token that mixes letters with other characters.

    Returns:
        The surviving tokens, lowercased, in their original order. The list is
        empty when no token is purely alphabetic.
    """
    return [token.lower() for token in word_tokenize(str(text)) if token.isalpha()]


In [None]:
# Build doc2vec: one tagged document per answer.
# Each document's tag is its position in `documents`, so a tag returned by the
# model can be used directly as an index back into that list.
processed = [preprocess(text) for text in texts]
documents = [TaggedDocument(words, [tag]) for tag, words in enumerate(processed)]

start = time()
# `seed` alone does not make training repeatable: gensim also requires a single
# worker thread (workers=1) to remove thread-scheduling jitter. Repeatability
# across interpreter launches additionally needs a fixed PYTHONHASHSEED.
model = Doc2Vec(
    documents, vector_size=100, window=4, min_count=2, seed=100, workers=1
)
print(time() - start)
print(f"Vocab size = {len(model.wv)}")


In [None]:
# Similarity search over the trained model. Results are pretty crummy so far:
# partly because there is a minute amount of data, but it doesn't even find
# docs that contain the search term.
def search(query, model, documents, topn=5, print_results=True):
    """Return the ``topn`` documents the model rates closest to ``query``.

    The query is run through ``preprocess``, turned into a vector with
    ``model.infer_vector`` and compared against the trained document vectors
    with ``model.dv.most_similar``.

    Args:
        query: Free-text search string.
        model: Trained model exposing ``infer_vector`` and ``dv.most_similar``.
        documents: Sequence indexable by the keys the model returns; element
            ``[0]`` of each entry is used as that document's token list.
        topn: Number of matches to request.
        print_results: When true, also print the query and each match.

    Returns:
        A list of dicts with the keys ``"key"``, ``"similarity"`` and
        ``"text"``, in the order the model returned the matches.
    """
    query_vector = model.infer_vector(preprocess(query))

    results = []
    for doc_key, score in model.dv.most_similar([query_vector], topn=topn):
        results.append(
            {"key": doc_key, "similarity": score, "text": documents[doc_key][0]}
        )

    if not print_results:
        return results

    print(f"Search query: '{query}'")
    # Ranks are printed zero-based, straight from enumerate().
    for rank, hit in enumerate(results):
        print(f"Rank {rank}, similarity {hit['similarity']}")
        print(f"Document: {' '.join(hit['text'])}", end="\n\n")

    return results


# Try the search out; the returned list is discarded because the matches are
# printed by default.
_ = search(query="Domestic abuse", model=model, documents=documents)


In [None]:
# What if the unit is a sentence rather than a whole answer?
# Split every non-null answer into sentences and pool them into one flat list.
sentences = []
for col in questions.columns:
    answers = questions[col].dropna().tolist()
    sentences += [
        sentence for answer in answers for sentence in sent_tokenize(str(answer))
    ]

# Build doc2vec again, this time with one tagged document per sentence.
# This rebinds `processed`, `documents` and `model`, replacing any earlier
# values held under those names.
# Each document's tag is its position in `documents`, so a tag returned by the
# model can be used directly as an index back into that list.
processed = [preprocess(sentence) for sentence in sentences]
documents = [TaggedDocument(words, [tag]) for tag, words in enumerate(processed)]

start = time()
# `seed` alone does not make training repeatable: gensim also requires a single
# worker thread (workers=1) to remove thread-scheduling jitter. Repeatability
# across interpreter launches additionally needs a fixed PYTHONHASHSEED.
model = Doc2Vec(
    documents, vector_size=100, window=4, min_count=2, seed=100, workers=1
)
print(time() - start)
print(f"Vocab size = {len(model.wv)}")


In [None]:
# Search again. Results are the same - people write in very long sentences!
_ = search(query="Domestic abuse", model=model, documents=documents)


In [None]:
# TODO: run the sanity check from the Gensim doc2vec tutorial,
# which checks that each document is most similar to itself.
