In [2]:
import pandas as pd
import spacy
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from collections import defaultdict
from itertools import chain

In [3]:
# Load data
processed_queries = pd.read_csv('data\\processed_queries.csv')
processed_orders = pd.read_csv('data\\processed_orders.csv')

In [4]:
processed_queries.head()

Unnamed: 0,title,query,named_entities,summary,chunks
0,Landlord said he's moving in on Friday,My girlfriend has been renting an apartment wi...,"[Landlord, Friday, 1 month, This morning, 175,...",My girlfriend has been renting an apartment wi...,"[Landlord, he, Friday, My girlfriend, an apart..."
1,Advice and vent on landlord invading privacy,"Yesterday, my missus and I were freshly post c...","[Yesterday, about 3pm, n’t]","Yesterday, my missus and I were freshly post c...","[Advice, vent, landlord invading privacy, my m..."
2,Landlord keeps coming onto property,We're in a rental we're there is a locked shed...,"[Landlord, English, 3 years]",We're in a rental we're there is a locked shed...,"[Landlord, property, We, a rental, we, a locke..."
3,Landlords have put a local candidate election ...,As per title I'm not particularly happy about ...,[],I'm not particularly happy about having my lan...,"[Landlords, a local candidate election sign, m..."
4,Can a Landlord Complain About Beds Not Being M...,I feel a little harassed. Should we make a com...,"[three, our thirties, every two weeks, a day, ...",Three males all in their thirties had a flat i...,"[a Landlord Complain, Beds, I, we, a complaint..."


In [5]:
processed_orders.head()

Unnamed: 0,summary,key_terms,named_entities,categories,noun_chunks
0,The Tribunal orders suppression of the tenants...,"['cat', 'door', 'pane', 'glass', 'vacate']","(2023, 4438983, TENANCY TRIBUNAL -, Barfoot & ...","{'Damage': 0.2073230892419815, 'Pet': 0.309160...","['[2023] NZTT 4438983 \nTENANCY TRIBUNAL', 'Ev..."
1,The Landlord applied for compensation followin...,"['behind', 'property', 'premise', 'left', 'hel...","(2023, 4392339, 4435900, TENANCY TRIBUNAL - TO...","{'Damage': 0.4811885356903076, 'Pet': 0.193092...","['NZTT', '4435900\nTENANCY TRIBUNAL - TOKOROA\..."
2,No application for suppression has been made i...,"['damage', 'carpet', 'crack', 'vanity', 'ha']","(2023, 4417521, 4406666, TENANCY TRIBUNAL - MA...","{'Damage': 0.9675973057746887, 'Pet': 0.227249...","['[2023] NZTT 4417521, 4406666\nTENANCY TRIBUN..."
3,The application was filed on 20 March 2023. It...,"['rehearing', 'wa', 'application', 'allegation...","(2023, 4534511, TENANCY, TRIBUNAL, Anita Lois ...","{'Damage': 0.15224403142929077, 'Pet': 0.22352...",['[2023] NZTT 4534511 \nTENANCY TRIBUNAL - \nA...
4,Nicholas (Nick) Kulavovsky and Natalya Kulakov...,"['basement', 'notice', 'quiet', 'retaliatory',...","(2023, 4626454, TENANCY TRIBUNAL -, Nicholas, ...","{'Damage': 0.08154157549142838, 'Pet': 0.40720...","['[2023] NZTT 4626454 \nTENANCY TRIBUNAL', 'Ev..."


TF-IDF Similarity

In [93]:
# Load the TF-IDF matrix and vectoriser
with open("models\\tfidf\\tfidf_matrix.pkl", "rb") as matrix_file:
    tfidf_matrix = pickle.load(matrix_file)

with open("models\\tfidf\\tfidf_vectorizer.pkl", "rb") as matrix_file:
    tfidf_vec = pickle.load(matrix_file)

In [94]:
# Get scores for all queries loaded
tfidf_scores = []

for idx, row in processed_queries.iterrows():
    query_tfidf_vector = tfidf_vec.transform([row['query']])
    similarities = cosine_similarity(query_tfidf_vector, tfidf_matrix)

    tfidf_scores.append(similarities[0])

Named entity similarity

In [95]:
# Load named entity bag of words
with open("models\\ner_bow\\ner_bow_model.pkl", "rb") as bow_file:
    named_entity_vec = pickle.load(bow_file)

with open("models\\ner_bow\\ner_bow_matrix.pkl", "rb") as matrix_file:
    named_entity_matrix = pickle.load(matrix_file)

In [130]:
# Get scores for all queries loaded
ner_scores_temp = []

for idx, row in processed_queries.iterrows():
    query_ner_vector = named_entity_vec.transform([row['named_entities']])

    similarities = cosine_similarity(named_entity_matrix, query_ner_vector)

    ner_scores_temp.append(similarities)

In [131]:
ner_scores = []

for scores in ner_scores_temp:
    ner_scores.append(list(chain(*scores)))

Noun phrase similarity

In [135]:
# Load named entity bag of words
with open("models\\nph_bow\\nph_bow_model.pkl", "rb") as bow_file:
    noun_phrases_vec = pickle.load(bow_file)

with open("models\\nph_bow\\nph_bow_matrix.pkl", "rb") as matrix_file:
    noun_phrases_matrix = pickle.load(matrix_file)

In [136]:
# Get scores for all queries loaded
nph_scores_temp = []

for idx, row in processed_queries.iterrows():
    query_nph_vector = noun_phrases_vec.transform([row['chunks']])

    similarities = cosine_similarity(noun_phrases_matrix, query_nph_vector)

    nph_scores_temp.append(similarities)

In [137]:
nph_scores = []

for scores in nph_scores_temp:
    nph_scores.append(list(chain(*scores)))

Overall document similarity

In [99]:
# Load the spaCy model
nlp = spacy.load("en_core_web_lg")

ovr_scores = []

for idx, row in processed_queries.iterrows():
    doc1 = nlp(row['summary'])

    query_scores = []

    for idx, row in processed_orders.iterrows():
        doc2 = nlp(row['summary'])

        # Calculate similarity
        similarity_score = doc1.similarity(doc2)

        query_scores.append(similarity_score)

    ovr_scores.append(query_scores)

Create overall scores

In [146]:
import pandas as pd

# Initialize an empty DataFrame with specific columns
columns = ['tfidf_scores','named_entity_scores', 'noun_phrase_scores', 'doc_scores']
similarity_scores = pd.DataFrame(columns={col: [] for col in columns})

similarity_scores['tfidf_scores'] = tfidf_scores
similarity_scores['named_entity_scores'] = ner_scores
similarity_scores['noun_phrase_scores'] = nph_scores
similarity_scores['doc_scores'] = ovr_scores


In [157]:
# Calculate the average similarity score for each document for each query
average_list = [sum(x) / len(x) for x in zip(tfidf_scores, ner_scores, nph_scores, ovr_scores)]

In [208]:
# Convert average scores to indexed dictionary
averages_scores = []

for item in average_list:
    averages_dict = defaultdict(list)
    for idx, score in enumerate(item):
        averages_dict[idx] = score
    
    averages_scores.append(averages_dict)

defaultdict(<class 'list'>, {0: 0.4042127628755586, 1: 0.44531902349295027, 2: 0.4384438380028164, 3: 0.39993470637294626, 4: 0.4031604906992957, 5: 0.3934398061854073, 6: 0.3933938392226194, 7: 0.37528882297625876, 8: 0.37528882297625876, 9: 0.3999447478205628, 10: 0.3962789965879083, 11: 0.4248189858068897, 12: 0.40030366969150116, 13: 0.40030366969150116, 14: 0.36930412644398536, 15: 0.36480160657509986, 16: 0.42642613383239913, 17: 0.4010335830304764, 18: 0.420463128632547, 19: 0.3889240357215349, 20: 0.41169205139472165, 21: 0.4250587777260991, 22: 0.40265870697588263, 23: 0.3981073150103429, 24: 0.4179571714309507, 25: 0.43155041268525696, 26: 0.3405304150102523, 27: 0.32943957939861523, 28: 0.3749881209455881, 29: 0.4079727477633591, 30: 0.3690655754867059, 31: 0.4220223094969383, 32: 0.4002425777101397, 33: 0.4037014423540638, 34: 0.3836518395870482, 35: 0.38011015191049013, 36: 0.37627177911106824, 37: 0.3095816384693444, 38: 0.4001333112832761, 39: 0.4227123282008501, 40: 0.3

In [211]:
# Get the top 3 scoring documents for each query
top_3_list = []

for idx, scores in enumerate(averages_scores):
    sorted_averages = dict(sorted(averages_scores[idx].items(), key=lambda item: item[1], reverse=True))
    top_3 = list(sorted_averages.items())[:3]
    top_3_list.append(top_3)

In [214]:
similarity_scores['top'] = top_3_list

In [215]:
similarity_scores

Unnamed: 0,tfidf_scores,named_entity_scores,noun_phrase_scores,doc_scores,top
0,"[0.029248962160740154, 0.04577266659668733, 0....","[0.040893041005476534, 0.17609172895011482, 0....","[0.6969104081344717, 0.7303693904986397, 0.707...","[0.8497986402015459, 0.8290423079263591, 0.892...","[(41, 0.45026659521615864), (1, 0.445319023492..."
1,"[0.06036613152238166, 0.04082158363261517, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.05890000640537604, 0.0,...","[0.4401997064847175, 0.48741425689320095, 0.44...","[0.7233120358637258, 0.6703294346813675, 0.689...","[(0, 0.30596946846770623), (21, 0.303489780780..."
2,"[0.01664921671556973, 0.07491270284312554, 0.0...","[0.08512565307587487, 0.36656416494266014, 0.2...","[0.6613270734036136, 0.672055977898692, 0.6720...","[0.8634156169541448, 0.8335451605917445, 0.846...","[(1, 0.4867695015690556), (2, 0.44386227247246..."
3,"[0.04105590691115044, 0.0794356137219548, 0.03...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.22951228504677407, 0.2957947357775568, 0.24...","[0.651735087663498, 0.604030634057283, 0.64031...","[(1, 0.24481524588919865), (21, 0.240886597909..."
4,"[0.012794906371000815, 0.04932324559455457, 0....","[0.0, 0.012342857465282813, 0.0972305585328246...","[0.6975133136328191, 0.713315120490142, 0.7140...","[0.8640037469491302, 0.85650880974468, 0.83613...","[(41, 0.4432001904740779), (21, 0.439656504851..."
5,"[0.005549984394422312, 0.13879545220356782, 0....","[0.0, 0.0, 0.0, 0.009968648060386293, 0.005468...","[0.44268345916331375, 0.5175430446879731, 0.45...","[0.8161373388456884, 0.8034966299610722, 0.860...","[(41, 0.37884951097173036), (1, 0.364958781713..."
