In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
from scipy.spatial.distance import cosine

## Loading csv data

In [4]:
# Path of the corpus file, relative to the directory the notebook is run from.
data_file = "./data/dummy_docs.csv.txt"
# Parse the file into a DataFrame; later cells read its "Title" and "Content" columns.
processed_database = pd.read_csv(filepath_or_buffer=data_file)

In [5]:
# Display the loaded table to check that the CSV was parsed as expected.
processed_database

Unnamed: 0,Title,Content
0,Columbia University,Columbia University also known Columbia offici...
1,Fictional University,Fictional University officially private resear...
2,Third Document,Related private term third dissimilar prepared


## Some quick glances at the data

In [15]:
# Pull out the two columns used throughout the notebook:
# document titles (row labels) and document texts (vectorizer input).
titles, contents = (processed_database[column] for column in ("Title", "Content"))

In [11]:
# Inspect the "Title" column selected from processed_database.
titles

0     Columbia University
1    Fictional University
2          Third Document
Name: Title, dtype: object

In [26]:
# Inspect the "Content" column selected from processed_database.
contents

0    Columbia University also known Columbia offici...
1    Fictional University officially private resear...
2       Related private term third dissimilar prepared
Name: Content, dtype: object

In [32]:
# Bag-of-words counts over single words only: ngram_range=(1, 1) sets both the
# shortest and the longest n-gram length to one token, and stop_words='english'
# drops common English words. By default the vectorizer also lowercases the text.
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 1))
# fit_transform learns the vocabulary and builds the count matrix in one call,
# which is more efficient than calling fit and then transform.
counted_data = count_vectorizer.fit_transform(contents)

In [30]:
# Wrap the dense count matrix in a labelled table:
# one row per document title, one column per vocabulary term.
document_term_matrix = pd.DataFrame(
    counted_data.toarray(),
    columns=count_vectorizer.get_feature_names_out(),
    index=titles,
)

In [37]:
# Show the document-term count matrix (rows: titles, columns: vocabulary terms).
document_term_matrix

Unnamed: 0_level_0,1754,belong,church,city,college,colleges,colonial,columbia,declaration,dissimilar,...,seven,states,term,trinity,united,universities,university,wave,world,york
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Columbia University,1,1,1,2,1,1,1,5,1,0,...,1,1,0,1,1,1,3,0,1,3
Fictional University,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
Third Document,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0


In [31]:
# List the vocabulary terms that label the matrix columns.
document_term_matrix.columns

Index(['1754', 'belong', 'church', 'city', 'college', 'colleges', 'colonial',
       'columbia', 'declaration', 'dissimilar', 'education', 'established',
       'fictional', 'fifth', 'founded', 'george', 'grounds', 'higher',
       'independence', 'institution', 'ivy', 'king', 'known', 'league',
       'learning', 'manhattan', 'new', 'officially', 'old', 'oldest',
       'prepared', 'prior', 'private', 'ranked', 'related', 'research',
       'seven', 'states', 'term', 'trinity', 'united', 'universities',
       'university', 'wave', 'world', 'york'],
      dtype='object')

## Computing tf-idf

In [38]:
# tf-idf weighting with idf enabled and idf smoothing switched off.
tfidf_vectorizer = TfidfVectorizer(smooth_idf=False, use_idf=True)
# Fit on the corpus and store the dense weights as a labelled table:
# one row per document title, one column per term learned by the vectorizer.
documents_TFIDF = pd.DataFrame(
    tfidf_vectorizer.fit_transform(contents).toarray(),
    index=titles,
    columns=tfidf_vectorizer.get_feature_names_out(),
)

In [77]:
# tf-idf weight of the term "wave" in each document.
documents_TFIDF["wave"]

Title
Columbia University     0.000000
Fictional University    0.411062
Third Document          0.000000
Name: wave, dtype: float64

## Example search

In [86]:
def search(query: str, idf: TfidfVectorizer, idfdf: pd.DataFrame, results_returned: int = 10):
    """Rank the documents in ``idfdf`` by cosine similarity to ``query``.

    Parameters
    ----------
    query : str
        Free-text search query.
    idf : TfidfVectorizer
        Vectorizer used to project the query into tf-idf space. ``transform``
        is called on it directly, so it must already be fitted, and it must
        produce one value per column of ``idfdf``.
    idfdf : pd.DataFrame
        tf-idf table with one row per document and one column per term.
    results_returned : int, default 10
        Maximum number of best-matching documents to return.

    Returns
    -------
    pd.Series
        Cosine similarities (NOT distances), indexed like ``idfdf``, sorted
        from most to least similar and truncated to ``results_returned``
        entries. Ties keep the row order of ``idfdf``.

    Notes
    -----
    A query with no in-vocabulary term (all-zero vector) and all-zero document
    rows score 0.0. Dividing by their zero norm would otherwise produce NaN
    similarities and a RuntimeWarning.
    """
    # transform() accepts a list of documents; the query is the only one we
    # pass, so row 0 of the dense result is its tf-idf representation.
    query_vector = np.asarray(idf.transform([query]).toarray()[0], dtype=float)
    # The query now lives in the same tf-idf space as the rows of the database.
    document_matrix = idfdf.to_numpy(dtype=float)

    # cos(angle) = (d . q) / (|d| * |q|), computed for every document at once.
    dot_products = document_matrix @ query_vector
    norm_products = np.linalg.norm(document_matrix, axis=1) * np.linalg.norm(query_vector)

    # Only divide where the denominator is non-zero; the rest keep similarity 0.
    similarities = np.zeros(len(idfdf), dtype=float)
    np.divide(dot_products, norm_products, out=similarities, where=norm_products > 0)
    # Guard against rounding pushing a value marginally outside [-1, 1].
    np.clip(similarities, -1.0, 1.0, out=similarities)

    # Highest similarity first; the stable sort keeps tied documents in input order.
    ranked = pd.Series(similarities, index=idfdf.index).sort_values(ascending=False, kind="stable")
    return ranked.iloc[:results_returned]

In [87]:
# Example query: rank the documents in documents_TFIDF against a two-word query.
search("wave university", tfidf_vectorizer, documents_TFIDF)

Title
Fictional University    0.494731
Columbia University     0.114646
Third Document          0.000000
dtype: float64

## Refining the pipeline

In [None]:
def initialise_tfidf_database(path_to_csv: str):
    """Build a tf-idf search index from a CSV file of documents.

    The file must provide a "Title" and a "Content" column. The contents are
    assumed to have been pre-processed already and are vectorized as they are.

    Parameters
    ----------
    path_to_csv : str
        Location of the CSV file to load.

    Returns
    -------
    tuple
        ``(vectorizer, tfidf_table)``: the fitted ``TfidfVectorizer`` and a
        DataFrame of tf-idf weights with one row per title and one column per
        term learned by the vectorizer.
    """
    corpus = pd.read_csv(path_to_csv)
    titles_column = corpus["Title"]
    contents_column = corpus["Content"]

    # Fit on the whole corpus and keep the weights as a dense array.
    vectorizer = TfidfVectorizer(smooth_idf=False, use_idf=True)
    weights = vectorizer.fit_transform(contents_column).toarray()

    # Label rows with the document titles and columns with the learned terms.
    tfidf_table = pd.DataFrame(
        weights,
        index=titles_column,
        columns=vectorizer.get_feature_names_out(),
    )
    return vectorizer, tfidf_table